diff --git a/README.md b/README.md
index 84ab7e4..d25f0aa 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,6 @@ to find repos of the mirrored projects to check, just search "projectName mirror
 
 not done:
 NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
-opensuse: http://download.opensuse.org/ check Update.repo files in folders inside the update folder
 puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
 
 done:
@@ -67,6 +66,7 @@ mxlinux-iso: this one seems out of sync on the official tracker for 134 days, wh
 mysql: http://mirrors.sunsite.dk/mysql/
 nongnu: http://download.savannah.nongnu.org/releases/ https://savannah.gnu.org/maintenance/Mirmon/ http://download.savannah.gnu.org/mirmon/savannah/
 openbsd
+opensuse: http://download.opensuse.org/ check the Update.repo files in the folders inside the update folder; temporarily not checking tumbleweed-non-oss/ and tumbleweed/
 parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
 pkgsrc
 qtproject: https://download.qt.io/
@@ -84,6 +84,6 @@ ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.can
 ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
 ubuntu-releases: https://releases.ubuntu.com/
 vlc: http://download.videolan.org/pub/videolan/
-x.org: https://www.x.org/releases/ check all of the files under each directory under /x.org/individual/, and make sure that we have all of the files which the upstream has
+x.org: https://www.x.org/releases/ check all of the files under each directory under /x.org/individual/, and make sure that we have all of the files which the upstream has, ignoring the xcb folder
 Xiph: https://ftp.osuosl.org/pub/xiph/releases/ loop through each directory in xiph/releases/ and trying to compare the timestamp of the checksum files
 xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
\ No newline at end of file
diff --git a/data.json b/data.json
index 728f0d3..dcfb040 100644
--- a/data.json
+++ b/data.json
@@ -14,14 +14,14 @@
         "file": "alpine/last-updated"
     },
     "Apache": {
-        "out_of_sync_since": 1633294718,
+        "out_of_sync_since": null,
        "out_of_sync_interval": 86400,
         "csc": "apache/",
         "upstream": "https://downloads.apache.org/",
         "file": "zzz/time.txt"
     },
     "Arch": {
-        "out_of_sync_since": null,
+        "out_of_sync_since": 1634433282,
         "out_of_sync_interval": 86400,
         "csc": "archlinux/",
         "upstream": "http://arch.mirror.constant.com/",
@@ -94,7 +94,7 @@
         "file": "TIME"
     },
     "Fedora": {
-        "out_of_sync_since": 1633923341,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "fedora/",
         "upstream": "http://fedora.mirror.iweb.com/",
@@ -380,5 +380,12 @@
         "csc": "x.org/individual/",
         "upstream": "https://www.x.org/releases/individual/",
         "file": ""
+    },
+    "opensuse": {
+        "out_of_sync_since": null,
+        "out_of_sync_interval": 86400,
+        "csc": "opensuse/update/",
+        "upstream": "http://download.opensuse.org/update/",
+        "file": ""
     }
 }
\ No newline at end of file
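For context, each data.json entry is consumed by a project's check() method, which joins the "csc" and "file" fields onto the CSC mirror root and the "upstream" URL onto the same suffix. A minimal sketch of that lookup for the new opensuse entry, assuming CSC_MIRROR from shared.py is the mirror's base URL (the exact value below is an assumption, not taken from this patch):

    import json

    # assumed value of shared.CSC_MIRROR; substitute the real constant
    CSC_MIRROR = "http://mirror.csclub.uwaterloo.ca/"

    with open("data.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    entry = data["opensuse"]
    csc_url = CSC_MIRROR + entry["csc"] + entry["file"]  # .../opensuse/update/
    upstream_url = entry["upstream"] + entry["file"]     # http://download.opensuse.org/update/
    print(csc_url, upstream_url)
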
diff --git a/projects/opensuse.py b/projects/opensuse.py
index fee6444..6fe4522 100644
--- a/projects/opensuse.py
+++ b/projects/opensuse.py
@@ -1,48 +1,66 @@
 from bs4 import BeautifulSoup
 import requests
+import re
+import datefinder  # another date-finding library
 
 from project import Project
 from shared import CSC_MIRROR
 
 class opensuse(Project):
     """opensuse class"""
+
+    @staticmethod
+    def checker(directory_URL, file_name):
+        """Return the timestamp shown next to file_name in the directory listing, or False if absent."""
+        page = requests.get(directory_URL).text
+        file_index = page.find(file_name)
+
+        if file_index == -1:
+            return False
+
+        # match either "05-Oct-2021 13:27" or "2021-10-05 13:27" style listing dates
+        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page[file_index:])
+        if not str_dates:
+            return False
+
+        return list(datefinder.find_dates("".join(str_dates[0])))[0]
+
     @classmethod
-    def scrape(cls, files, site):
+    def scrape(cls, compare, folders, site1, site2, directory):
+        # if this directory holds a .repo file, compare its timestamp on both sites instead of recursing
+        upstream_date = cls.checker(site1 + directory, ".repo")
+        if upstream_date is not False:
+            csc_date = cls.checker(site2 + directory, ".repo")
+            if csc_date is not False:
+                # the mirror is up to date if its .repo file is at least as new as upstream's
+                compare.append(upstream_date <= csc_date)
+                return
+            # upstream has a .repo file which the mirror lacks
+            compare.append(False)
+            return
+
         # getting the request from url
-        r = requests.get(site)
-
+        r = requests.get(site1 + directory)
+
         # converting the text
         s = BeautifulSoup(r.text,"html.parser")
 
         for i in s.find_all("a"): # for a href directories
             href = i.attrs['href']
 
-            if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/"):
-                site_next = site+href
+            if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and href != "tumbleweed-non-oss/" and href != "tumbleweed/":
+                dir_next = directory + href
+                # recurse into each subdirectory exactly once
+                if dir_next not in folders:
+                    folders.append(dir_next)
+                    cls.scrape(compare, folders, site1, site2, dir_next)
 
-                if site_next not in files:
-                    files.append(href)
-                    # print(href)
-                    # calling it self
-                    cls.scrape(files, site_next)
-            elif href != "../" and href != "/" and not href.startswith("/") and href != "?C=N;O=D" and href != "?C=M;O=A" and href != "?C=S;O=A" and href != "?C=D;O=A":
-                # print(href)
-                files.append(href)
-
     @classmethod
     def check(cls, data, project):
         """Check if project packages are up-to-date"""
         # lists
-        files1=[]
-        files2=[]
+        compare = []
+        folders = []
 
         csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
         upstream_url = data[project]["upstream"] + data[project]["file"]
 
         # calling function
-        cls.scrape(files1, csc_url)
-        cls.scrape(files2, upstream_url)
+        cls.scrape(compare, folders, upstream_url, csc_url, "")
 
-        # print(set(files2) - set(files1))
-
-        return set(files1) == set(files2)
\ No newline at end of file
+        return all(compare)
\ No newline at end of file
diff --git a/test.py b/test.py
index 3148b47..a554a66 100644
--- a/test.py
+++ b/test.py
@@ -7,7 +7,7 @@ from datetime import timedelta
 import time
 import pandas as pd
 import re # for salt stack specifically
-from projects import x_org
+from projects import opensuse
 import json # import json to read project info stored in json file
 
 # this function is brute force looping through the whole directory and checking dates
@@ -65,7 +65,7 @@ def get_latest_date(web_dir):
 if __name__ =="__main__":
     with open("data.json", "r", encoding="utf-8") as file:
         data = json.load(file)
-        print(x_org.check(data, "x_org"))
+        print(opensuse.check(data, "opensuse"))
 
     """# website to be scrape
     site="https://cdimage.ubuntu.com/releases/"
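
For reference, the date-extraction step in checker can be exercised offline. The listing row below is a made-up example in the common Apache/nginx autoindex format, not actual download.opensuse.org output; it shows how the findall tuples and datefinder combine:

    import re
    import datefinder

    # a canned directory-listing row, illustrative only
    page = '<a href="update.repo">update.repo</a>    05-Oct-2021 13:27    1.1K'

    file_index = page.find(".repo")
    # each findall match is a tuple with one slot per regex alternative; the unmatched slot is ''
    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page[file_index:])
    print(list(datefinder.find_dates("".join(str_dates[0])))[0])  # 2021-10-05 13:27:00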