from bs4 import BeautifulSoup
import requests
import re
import datefinder  # third-party date-parsing library; turns "16-Oct-2021 21:08" into datetime
from project import Project
from shared import CSC_MIRROR


class puppy_linux(Project):
    """Mirror-freshness checker for Puppy Linux.

    Scrapes the HTML directory listings of the upstream mirror and the CSC
    mirror and compares the listed modification dates of matching files.
    """

    @staticmethod
    def checker(page, file_name):
        """Return the listed modification date of *file_name* within *page*.

        page: HTML text of a directory listing.
        file_name: href/name of the entry to look up.

        Returns a ``datetime`` on success, or ``False`` when the entry is
        missing or no parseable date follows it.
        """
        file_index = page.find(file_name)
        if file_index == -1:
            return False

        # Listings use either "DD-Mon-YYYY HH:MM" or "YYYY-Mon-DD HH:MM";
        # grab the first date appearing at or after the file's entry.
        str_dates = re.findall(
            r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\w{3}-\d{2} \d{2}:\d{2})',
            page[file_index:])
        if not str_dates:
            # Entry is present but no recognizable timestamp follows it
            # (bugfix: str_dates[0] used to raise IndexError here).
            return False

        # Each findall hit is a 2-tuple (one group matched, one empty);
        # joining yields the matched date string.
        dates = list(datefinder.find_dates("".join(str_dates[0])))
        # bugfix: guard against datefinder failing to parse (was [0] -> IndexError)
        return dates[0] if dates else False

    @classmethod
    def scrape(cls, compare, folders, site1, site2, directory):
        """Recursively compare *directory* on site1 (upstream) vs site2 (CSC).

        Appends booleans to *compare* (True when the CSC copy's date is at
        least as new as upstream's) and records visited subdirectories in
        *folders* to avoid re-scraping them.
        """
        page1 = requests.get(site1 + directory).text
        page2 = requests.get(site2 + directory).text

        for anchor in BeautifulSoup(page1, "html.parser").find_all("a"):
            href = anchor.attrs['href']

            if href.endswith(".iso"):
                # One dated .iso is enough to judge this whole directory.
                date1 = cls.checker(page1, href)
                if date1 is False:
                    continue  # upstream entry has no usable date; try next link
                date2 = cls.checker(page2, href)
                if date2 is False:
                    compare.append(False)  # CSC is missing the file entirely
                    return
                compare.append(date1 <= date2)
                return
            elif (href.endswith("/")
                  and (href.startswith("puppy-") or directory != "")
                  and href != "../"
                  and href != "/"
                  and not href.startswith("/")):
                # Recurse into release subdirectories exactly once each.
                dir_next = directory + href
                if dir_next not in folders:
                    folders.append(dir_next)
                    cls.scrape(compare, folders, site1, site2, dir_next)
            elif href.endswith(".htm") or href == "Packages.gz":
                date2 = cls.checker(page2, href)
                if date2 is False:
                    compare.append(False)  # CSC is missing the file
                    continue
                date1 = cls.checker(page1, href)
                if date1 is False:
                    # bugfix: original compared False <= datetime here,
                    # which raises TypeError on Python 3; skip instead,
                    # matching the .iso branch's handling.
                    continue
                compare.append(date1 <= date2)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date.

        data: parsed data.json dict; project: key within it (e.g.
        "puppy_linux"). Returns True when every compared file on the CSC
        mirror is at least as new as upstream.
        """
        compare = []
        folders = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        cls.scrape(compare, folders, upstream_url, csc_url, "")

        # all([]) is True: finding nothing to compare counts as in sync.
        return all(compare)