From c76ae9c3254f80896fc2e5e24d04f6fe14ab6955 Mon Sep 17 00:00:00 2001
From: Tom
Date: Thu, 14 Oct 2021 18:22:39 -0700
Subject: [PATCH] added linuxmint, linuxmint-packages, raspberry pi,
 ubuntu-ports-releases, and xubuntu-releases

---
 README.md                         | 22 ++++----
 data.json                         | 41 +++++++++++++--
 projects/linuxmint.py             | 85 +++++++++++++++++++++++++++++++
 projects/linuxmint_packages.py    | 55 ++++++++++++++++++++
 projects/raspberrypi.py           | 55 ++++++++++++++++++++
 projects/ubuntu_ports_releases.py | 85 +++++++++++++++++++++++++++++++
 projects/xubuntu_releases.py      | 85 +++++++++++++++++++++++++++++++
 test.py                           | 37 ++++++++------
 8 files changed, 436 insertions(+), 29 deletions(-)
 create mode 100644 projects/linuxmint.py
 create mode 100644 projects/linuxmint_packages.py
 create mode 100644 projects/raspberrypi.py
 create mode 100644 projects/ubuntu_ports_releases.py
 create mode 100644 projects/xubuntu_releases.py

diff --git a/README.md b/README.md
index 0201bb0..d1162e1 100644
--- a/README.md
+++ b/README.md
@@ -19,24 +19,15 @@ even if the date relies on a specific file in their repo, we can still find the
 to find repos of the mirrored projects to check, just search "projectName mirrors"
 
 not done:
-damnsmalllinux: http://distro.ibiblio.org/damnsmall/ this project seems abandoned, candidate for brute force looping
-debian-backports: no public repo, no timestamp, no mirror tracker
-debian-volatile: no public repo, no timestamp, no mirror tracker
-linuxmint: no public repo
-linuxmint-packages pool: http://rsync-packages.linuxmint.com/pool/
 macPorts: only distfiles has public repo, no timestamp, too large to loop through
 NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
 opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
-puppylinux: https://distro.ibiblio.org/puppylinux/ has public repo, no timestamp, too hard to loop through, not likely to have a mirror tracker
+puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
 racket: no public repo, no timestamp, no mirror status tracker
-raspberry pi: https://archive.raspberrypi.org/ no timestamp, no mirror status tracker
 sagemath: don't know how to deal with this, it's a website
 salt stack: don't know how to deal with this, it's a website
-scientific: https://scientificlinux.org/downloads/sl-mirrors/ would be easy to scrape the mirror status page, except that csc is not listed here
-ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker
 x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
 Xiph: no timestamp, too big to loop through, no status tracker
-xubuntu-releases
 
 done:
 almalinux
@@ -50,11 +41,14 @@ CRAN: https://cran.r-project.org/mirmon_report.html has a mirror tracker
 csclub: for now, this is the upstream itself, so it needs not to be checked
 CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
 Cygwin
+damnsmalllinux: http://distro.ibiblio.org/damnsmall/ not checking this, since the project is abandoned
 debian
+debian-backports: this is a legacy archive, so it no longer needs checking
 debian-cd
 debian-multimedia
 debian-ports
 debian-security
+debian-volatile: this is a legacy archive, so it no longer needs checking
 eclipse
 emacsconf: for now, this is the upstream itself, so it needs not to be checked
 fedora
@@ -68,6 +62,8 @@ ipfire
 kde
 kde-applicationdata
 kernel
+linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
+linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
 manjaro
 mxlinux
 mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
@@ -77,11 +73,15 @@ openbsd
 parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
 pkgsrc
 qtproject: https://download.qt.io/
+raspberry pi: https://archive.raspberrypi.org/ Checking the timestamp of either the Release file or the Packages file should suffice.
 raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, tho i'm not sure. also i think our mirror is completely outdated, it's not listed on official mirror list
+scientific: https://scientificlinux.org/downloads/sl-mirrors/ not checking this one since it's abandoned
 slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last updated date here, don't know if it's entirely accurate
 tdf: https://download.documentfoundation.org/
 trisquel: https://trisquel.info/mirmon/index.html out of date website!? please recheck this!!!
 ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
 ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
+ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has a public repo but no timestamp and no status tracker, so it is checked by brute force looping
 ubuntu-releases: https://releases.ubuntu.com/
-vlc: http://download.videolan.org/pub/videolan/
\ No newline at end of file
+vlc: http://download.videolan.org/pub/videolan/
+xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
\ No newline at end of file

diff --git a/data.json b/data.json
index 03c711f..47fa6de 100644
--- a/data.json
+++ b/data.json
@@ -147,7 +147,7 @@
         "file": "gutenberg.dcs"
     },
     "IPFire": {
-        "out_of_sync_since": null,
+        "out_of_sync_since": 1634257890,
         "out_of_sync_interval": 172800
     },
     "KDE": {
@@ -228,7 +228,7 @@
         "file": "last-updated.txt"
     },
     "nongnu": {
-        "out_of_sync_since": 1633333607,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "nongnu/",
         "upstream": "http://download-mirror.savannah.gnu.org/releases/",
@@ -249,7 +249,7 @@
         "file": "MIRROR-TIMESTAMP"
     },
     "qtproject": {
-        "out_of_sync_since": null,
+        "out_of_sync_since": 1634247878,
         "out_of_sync_interval": 86400,
         "csc": "qtproject/",
         "upstream": "https://download.qt.io/",
@@ -303,5 +303,40 @@
         "csc": "",
         "upstream": "https://www.ctan.org/mirrors/mirmon",
         "file": ""
+    },
+    "linuxmint_packages": {
+        "out_of_sync_since": null,
+        "out_of_sync_interval": 86400,
+        "csc": "linuxmint-packages/",
+        "upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
+        "file": "dists/"
+    },
+    "raspberrypi": {
+        "out_of_sync_since": 1634249138,
+        "out_of_sync_interval": 86400,
+        "csc": "raspberrypi/debian/",
+        "upstream": "https://archive.raspberrypi.org/debian/",
+        "file": "dists/"
+    },
+    "ubuntu_ports_releases": {
+        "out_of_sync_since": 1634257890,
+        "out_of_sync_interval": 86400,
+        "csc": "ubuntu-ports-releases/",
+        "upstream": "https://cdimage.ubuntu.com/releases/",
+        "file": ""
+    },
+    "xubuntu_releases": {
+        "out_of_sync_since": null,
+        "out_of_sync_interval": 86400,
+        "csc": "xubuntu-releases/",
+        "upstream": "https://cdimage.ubuntu.com/xubuntu/releases/",
+        "file": ""
+    },
+    "linuxmint": {
+        "out_of_sync_since": null,
+        "out_of_sync_interval": 86400,
+        "csc": "linuxmint/",
+        "upstream": "https://mirrors.edge.kernel.org/linuxmint/",
+        "file": ""
     }
 }
\ No newline at end of file
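Each of the five new data.json entries above is consumed the same way by its checker class: `csc` plus `file` is appended to the CSC mirror root, and `upstream` plus `file` forms the upstream URL; the two locations are then polled and compared. A minimal sketch of that lookup (the CSC_MIRROR value is assumed here for illustration; the real constant lives in shared.py):

```python
import json

CSC_MIRROR = "https://mirror.csclub.uwaterloo.ca/"  # assumed value, for illustration

with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

entry = data["raspberrypi"]
csc_url = CSC_MIRROR + entry["csc"] + entry["file"]  # mirror-side URL to poll
upstream_url = entry["upstream"] + entry["file"]     # upstream URL to poll
# -> https://mirror.csclub.uwaterloo.ca/raspberrypi/debian/dists/
# -> https://archive.raspberrypi.org/debian/dists/
```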
"upstream": "https://cdimage.ubuntu.com/xubuntu/releases/", + "file": "" + }, + "linuxmint": { + "out_of_sync_since": null, + "out_of_sync_interval": 86400, + "csc": "linuxmint/", + "upstream": "https://mirrors.edge.kernel.org/linuxmint/", + "file": "" } } \ No newline at end of file diff --git a/projects/linuxmint.py b/projects/linuxmint.py new file mode 100644 index 0000000..da69635 --- /dev/null +++ b/projects/linuxmint.py @@ -0,0 +1,85 @@ +from bs4 import BeautifulSoup +import requests +import datefinder # another date finding library +import re +from datetime import datetime +from datetime import timedelta +import time +import pandas as pd +from project import Project +from shared import CSC_MIRROR + +# this function is brute force looping through the whole directory and checking dates +# it may sound horrible, but for certain distros, i believe it's indeed the best solution + +# lists +urls=[] + +class linuxmint(Project): + """linuxmint class""" + @classmethod + def scrape(cls, urls, site): + # getting the request from url + r = requests.get(site) + + # converting the text + s = BeautifulSoup(r.text,"html.parser") + + for i in s.find_all("a"): # for a href directories + href = i.attrs['href'] + + if href.endswith("/") and href != "../" and href != "/": + site_next = site+href + + if site_next not in urls: + urls.append(site_next) + # print(site_next) + # calling it self + cls.scrape(urls, site_next) + + @staticmethod + def get_latest_date(web_dir): + page = requests.get(web_dir).text + + str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page) + # if you want to match 1+ patterns, like r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', note that findall will return a tuple of two groups!!! + # print(str_dates[0]) + if len(str_dates) == 0: + return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates + # for date in str_dates: + # print(date) + dates = [list(datefinder.find_dates(date))[0] for date in str_dates] + + # for date in dates: + # print(date) + return(max(dates)) + + @classmethod + def max_date(cls, urls): + latest_date = cls.get_latest_date(urls[0]) + # get_latest_date(urls[0]) + for dir in urls: + latest_date2 = cls.get_latest_date(dir) + if (latest_date2 >= latest_date): + latest_date = latest_date2 + # print(latest_date) + return latest_date + + + @classmethod + def check(cls, data, project): + """Check if project packages are up-to-date""" + # lists + urls1=[] + urls2=[] + + csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"] + upstream_url = data[project]["upstream"] + data[project]["file"] + + # calling function + cls.scrape(urls1, csc_url) + cls.scrape(urls2, upstream_url) + + # print(len(urls1), len(urls2)) + + return cls.max_date(urls1) == cls.max_date(urls2) \ No newline at end of file diff --git a/projects/linuxmint_packages.py b/projects/linuxmint_packages.py new file mode 100644 index 0000000..f69ca83 --- /dev/null +++ b/projects/linuxmint_packages.py @@ -0,0 +1,55 @@ +from bs4 import BeautifulSoup +import requests +from project import Project +from shared import CSC_MIRROR + +# this function is brute force looping through the whole directory and checking dates +# it may sound horrible, but for certain distros, i believe it's indeed the best solution + +class linuxmint_packages(Project): + """linuxmint_packages class""" + @staticmethod + def scrape(urls, site): + # getting the request from url + r = requests.get(site) + + # converting the text + s = 
BeautifulSoup(r.text,"html.parser") + + # salt stack specific code + # s = s.find("div", {"id": "listing"}) + # print(s) + + for i in s.find_all("a"): # for a href directories + href = i.attrs['href'] + + if href.endswith("/") and href != "../" and href != "/": + site_next = site+href+"Release" + + if site_next not in urls: + urls.append(site_next) + # print(site_next) + + @classmethod + def check(cls, data, project): + """Check if project packages are up-to-date""" + # lists + urls1=[] + urls2=[] + + csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"] + upstream_url = data[project]["upstream"] + data[project]["file"] + + # calling function + cls.scrape(urls1, csc_url) + cls.scrape(urls2, upstream_url) + + if (len(urls1) != len(urls2)): + return False + urls1.sort() + urls2.sort() + for index, f in enumerate(urls1): + if requests.get(f).text != requests.get(urls2[index]).text: + # comparing the file content bc that's how the base class does it, but we can speed it up by just comparing the dates + return False + return True \ No newline at end of file diff --git a/projects/raspberrypi.py b/projects/raspberrypi.py new file mode 100644 index 0000000..5fe18ac --- /dev/null +++ b/projects/raspberrypi.py @@ -0,0 +1,55 @@ +from bs4 import BeautifulSoup +import requests +from project import Project +from shared import CSC_MIRROR + +# this function is brute force looping through the whole directory and checking dates +# it may sound horrible, but for certain distros, i believe it's indeed the best solution + +class raspberrypi(Project): + """raspberrypi class""" + @staticmethod + def scrape(urls, site): + # getting the request from url + r = requests.get(site) + + # converting the text + s = BeautifulSoup(r.text,"html.parser") + + # salt stack specific code + # s = s.find("div", {"id": "listing"}) + # print(s) + + for i in s.find_all("a"): # for a href directories + href = i.attrs['href'] + + if href.endswith("/") and href != "../" and href != "/": + site_next = site+href+"Release" + + if site_next not in urls: + urls.append(site_next) + # print(site_next) + + @classmethod + def check(cls, data, project): + """Check if project packages are up-to-date""" + # lists + urls1=[] + urls2=[] + + csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"] + upstream_url = data[project]["upstream"] + data[project]["file"] + + # calling function + cls.scrape(urls1, csc_url) + cls.scrape(urls2, upstream_url) + + if (len(urls1) != len(urls2)): + return False + urls1.sort() + urls2.sort() + for index, f in enumerate(urls1): + if requests.get(f).text != requests.get(urls2[index]).text: + # comparing the file content bc that's how the base class does it, but we can speed it up by just comparing the dates + return False + return True \ No newline at end of file diff --git a/projects/ubuntu_ports_releases.py b/projects/ubuntu_ports_releases.py new file mode 100644 index 0000000..2baf703 --- /dev/null +++ b/projects/ubuntu_ports_releases.py @@ -0,0 +1,85 @@ +from bs4 import BeautifulSoup +import requests +import datefinder # another date finding library +import re +from datetime import datetime +from datetime import timedelta +import time +import pandas as pd +from project import Project +from shared import CSC_MIRROR + +# this function is brute force looping through the whole directory and checking dates +# it may sound horrible, but for certain distros, i believe it's indeed the best solution + +# lists +urls=[] + +class ubuntu_ports_releases(Project): + """ubuntu_ports_releases class""" + 
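The two check() methods above compare each Release file byte-for-byte, and their inline comments note that comparing dates alone would be faster. Debian-style Release files carry a Date: field, so one possible shortcut is to fetch only that header; a sketch under that assumption (parse_release_date and release_dates_match are hypothetical helpers, not part of this patch):

```python
import requests

def parse_release_date(url):
    """Return the Date: field of a Debian-style Release file, or None if absent."""
    for line in requests.get(url).text.splitlines():
        if line.startswith("Date:"):
            return line[len("Date:"):].strip()
    return None

def release_dates_match(csc_release_url, upstream_release_url):
    # two mirrors are in sync for a suite when their snapshot dates agree
    return parse_release_date(csc_release_url) == parse_release_date(upstream_release_url)
```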
diff --git a/projects/ubuntu_ports_releases.py b/projects/ubuntu_ports_releases.py
new file mode 100644
index 0000000..2baf703
--- /dev/null
+++ b/projects/ubuntu_ports_releases.py
@@ -0,0 +1,85 @@
+from bs4 import BeautifulSoup
+import requests
+import datefinder  # another date finding library
+import re
+from datetime import datetime
+from datetime import timedelta
+import time
+import pandas as pd
+from project import Project
+from shared import CSC_MIRROR
+
+# this class brute force loops through the whole directory and checks dates
+# it may sound horrible, but for certain distros, i believe it's indeed the best solution
+
+# lists
+urls=[]
+
+class ubuntu_ports_releases(Project):
+    """ubuntu_ports_releases class"""
+    @classmethod
+    def scrape(cls, urls, site):
+        # getting the request from url
+        r = requests.get(site)
+
+        # converting the text
+        s = BeautifulSoup(r.text, "html.parser")
+
+        for i in s.find_all("a"):  # for each a-href directory link
+            href = i.attrs['href']
+
+            if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and not href.startswith("http://"):
+                site_next = site + href
+
+                if site_next not in urls:
+                    urls.append(site_next)
+                    # print(site_next)
+                    # calling itself recursively
+                    cls.scrape(urls, site_next)
+
+    @staticmethod
+    def get_latest_date(web_dir):
+        page = requests.get(web_dir).text
+
+        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
+        # since the regex alternates two groups, findall returns a tuple of two groups per match, hence the "".join below
+        # print(str_dates[0])
+        if len(str_dates) == 0:
+            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
+        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
+
+        # for date in dates:
+        #     print(date)
+        return max(dates)
+
+    @classmethod
+    def max_date(cls, urls):
+        latest_date = cls.get_latest_date(urls[0])
+        for dir in urls:
+            latest_date2 = cls.get_latest_date(dir)
+            if latest_date2 >= latest_date:
+                latest_date = latest_date2
+        # print(latest_date)
+        return latest_date
+
+    @classmethod
+    def check(cls, data, project):
+        """Check if project packages are up-to-date"""
+        # lists
+        urls1 = []
+        urls2 = []
+
+        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
+        upstream_url = data[project]["upstream"] + data[project]["file"]
+
+        # calling the scraper on both the mirror and the upstream
+        cls.scrape(urls1, csc_url)
+        cls.scrape(urls2, upstream_url)
+
+        # print(len(urls1), len(urls2))
+
+        return cls.max_date(urls1) == cls.max_date(urls2)
\ No newline at end of file
diff --git a/projects/xubuntu_releases.py b/projects/xubuntu_releases.py
new file mode 100644
index 0000000..171b0d3
--- /dev/null
+++ b/projects/xubuntu_releases.py
@@ -0,0 +1,85 @@
+from bs4 import BeautifulSoup
+import requests
+import datefinder  # another date finding library
+import re
+from datetime import datetime
+from datetime import timedelta
+import time
+import pandas as pd
+from project import Project
+from shared import CSC_MIRROR
+
+# this class brute force loops through the whole directory and checks dates
+# it may sound horrible, but for certain distros, i believe it's indeed the best solution
+
+# lists
+urls=[]
+
+class xubuntu_releases(Project):
+    """xubuntu_releases class"""
+    @classmethod
+    def scrape(cls, urls, site):
+        # getting the request from url
+        r = requests.get(site)
+
+        # converting the text
+        s = BeautifulSoup(r.text, "html.parser")
+
+        for i in s.find_all("a"):  # for each a-href directory link
+            href = i.attrs['href']
+
+            if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and not href.startswith("http://"):
+                site_next = site + href
+
+                if site_next not in urls:
+                    urls.append(site_next)
+                    # print(site_next)
+                    # calling itself recursively
+                    cls.scrape(urls, site_next)
+
+    @staticmethod
+    def get_latest_date(web_dir):
+        page = requests.get(web_dir).text
+
+        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
+        # since the regex alternates two groups, findall returns a tuple of two groups per match, hence the "".join below
+        # print(str_dates[0])
+        if len(str_dates) == 0:
+            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
+        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
+
+        # for date in dates:
+        #     print(date)
+        return max(dates)
+
+    @classmethod
+    def max_date(cls, urls):
+        latest_date = cls.get_latest_date(urls[0])
+        for dir in urls:
+            latest_date2 = cls.get_latest_date(dir)
+            if latest_date2 >= latest_date:
+                latest_date = latest_date2
+        # print(latest_date)
+        return latest_date
+
+    @classmethod
+    def check(cls, data, project):
+        """Check if project packages are up-to-date"""
+        # lists
+        urls1 = []
+        urls2 = []
+
+        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
+        upstream_url = data[project]["upstream"] + data[project]["file"]
+
+        # calling the scraper on both the mirror and the upstream
+        cls.scrape(urls1, csc_url)
+        cls.scrape(urls2, upstream_url)
+
+        # print(len(urls1), len(urls2))
+
+        return cls.max_date(urls1) == cls.max_date(urls2)
\ No newline at end of file
diff --git a/test.py b/test.py
index dc98cdd..2cc72a7 100644
--- a/test.py
+++ b/test.py
@@ -6,6 +6,9 @@ from datetime import datetime
 from datetime import timedelta
 import time
 import pandas as pd
+import re  # for the date regexes (and salt stack specifically)
+from projects import linuxmint
+import json  # to read the project info stored in data.json
 
 # this function is brute force looping through the whole directory and checking dates
 # it may sound horrible, but for certain distros, i believe it's indeed the best solution
@@ -13,7 +16,7 @@ import pandas as pd
 # lists
 urls=[]
 
-home_site = "http://ftp.netbsd.org/pub"
+home_site = "https://cdimage.ubuntu.com"
 
 # function created
 def scrape(site):
@@ -24,14 +27,18 @@ def scrape(site):
     # converting the text
     s = BeautifulSoup(r.text,"html.parser")
 
+    # salt stack specific code
+    # s = s.find("div", {"id": "listing"})
+    # print(s)
+
     for i in s.find_all("a"):  # for each a-href directory link
         href = i.attrs['href']
 
         if href.endswith("/") and href != "../" and href != "/":
-            """if home_site+href in urls: # avoids the link to parent directory
-                continue"""
-            if href == "//ftp.netbsd.org/": # netbsd specific code
+            if home_site+href in urls: # avoids the link to parent directory
                 continue
+            """if href == "//ftp.netbsd.org/": # netbsd specific code
+                continue"""
             site_next = site+href
 
             if site_next not in urls:
@@ -41,24 +48,29 @@ def scrape(site):
         scrape(site_next)
 
 def get_latest_date(web_dir):
-    page = requests.get(site).text
+    page = requests.get(web_dir).text
 
     str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
-    dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
+    # print(str_dates[0])
+    dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]  # join the two capture groups, since only one is non-empty per match
 
     # for date in dates:
     #     print(date)
+    if len(dates) == 0:
+        return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
     return(max(dates))
 
 # main function
 if __name__ =="__main__":
+    with open("data.json", "r", encoding="utf-8") as file:
+        data = json.load(file)
+    print(linuxmint.check(data, "linuxmint"))
     """# website to be scrape
-    site="http://ftp.netbsd.org/pub/NetBSD/"
-    # works on: https://www.x.org/releases/
-    # https://mirror.csclub.uwaterloo.ca/linuxmint/ #works wonders for linuxmint
-    # unfortunately, linuxmint does not have a public repo, the worldwide mirror LayerOnline on https://linuxmint.com/mirrors.php seems like the best choice
+    site="https://cdimage.ubuntu.com/releases/"
+    # works on:
+    # https://www.x.org/releases/
 
     # calling function
     scrape(site)
@@ -71,9 +83,4 @@ if __name__ =="__main__":
             latest_date = latest_date2
 
     print(latest_date)"""
-
-    csc_url = "https://mirror.csclub.uwaterloo.ca/ubuntu-ports/project/trace/anonster.canonical.com"
-    upstream_url = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
-    print(requests.get(upstream_url).text)
-    print(requests.get(csc_url).text == requests.get(upstream_url).text)
\ No newline at end of file
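For reference, the get_latest_date() logic that test.py and the brute-force checkers share could sidestep datefinder entirely: the two regex alternates correspond one-to-one to strptime formats, which also avoids the tuple-of-groups pitfall the comments warn about. A sketch of that alternative (not part of this patch):

```python
import re
import requests
from datetime import datetime

# each directory-listing date pattern paired with its strptime format
DATE_PATTERNS = [
    (re.compile(r'\d{2}-\w{3}-\d{4} \d{2}:\d{2}'), "%d-%b-%Y %H:%M"),  # e.g. 14-Oct-2021 18:22
    (re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}'), "%Y-%m-%d %H:%M"),  # e.g. 2021-10-14 18:22
]

def get_latest_date(web_dir):
    page = requests.get(web_dir).text
    dates = []
    for pattern, fmt in DATE_PATTERNS:
        dates += [datetime.strptime(match, fmt) for match in pattern.findall(page)]
    # fall back to a ridiculously old date, mirroring the existing behaviour
    return max(dates, default=datetime(1000, 1, 1))
```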