From c974d49ffc3626dca1ec7e1ef414f967580b2527 Mon Sep 17 00:00:00 2001
From: Tom
Date: Sat, 16 Oct 2021 00:40:14 -0700
Subject: [PATCH] xiph added

---
 README.md        | 10 +++--
 data.json        | 17 ++++++---
 projects/xiph.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++
 test.py          |  4 +-
 4 files changed, 119 insertions(+), 11 deletions(-)
 create mode 100644 projects/xiph.py

diff --git a/README.md b/README.md
index db02312..346dac2 100644
--- a/README.md
+++ b/README.md
@@ -18,13 +18,13 @@ even if the date relies on a specific file in their repo, we can still find the
 
 to find repos of the mirrored projects to check, just search "projectName mirrors"
 
+## checker information
+
 not done:
-macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles
 NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
-opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
+opensuse: http://download.opensuse.org/ check the Update.repo files in the folders inside the update folder
 puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
-x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
-Xiph: no timestamp, too big to loop through, no status tracker
+x.org: https://www.x.org/releases/ check all of the files under each directory under /x.org/individual/, and make sure that we have all of the files that the upstream has
 
 done:
 almalinux
@@ -61,6 +61,7 @@ kde-applicationdata
 kernel
 linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
 linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
+macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles
 manjaro
 mxlinux
 mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
@@ -84,4 +85,5 @@ ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.can
 ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
 ubuntu-releases: https://releases.ubuntu.com/
 vlc: http://download.videolan.org/pub/videolan/
+Xiph: https://ftp.osuosl.org/pub/xiph/releases/ loop through each directory in xiph/releases/ and compare the timestamps of the checksum files
 xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
\ No newline at end of file
diff --git a/data.json b/data.json
index be92462..1a48015 100644
--- a/data.json
+++ b/data.json
@@ -21,7 +21,7 @@
         "file": "zzz/time.txt"
     },
     "Arch": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "archlinux/",
         "upstream": "http://arch.mirror.constant.com/",
@@ -52,7 +52,7 @@
         "file": "x86/sha512.sum"
     },
     "Debian": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "",
         "upstream": "https://ftp-master.debian.org/",
@@ -66,7 +66,7 @@
         "file": "debian-cd/project/trace/cdimage.debian.org"
     },
     "DebianMultimedia": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "debian-multimedia/",
         "upstream": "http://debian-mirrors.sdinet.de/deb-multimedia/",
@@ -140,7 +140,7 @@
         "file": "gnu/mirror-updated-timestamp.txt"
     },
     "Gutenberg": {
-        "out_of_sync_since": 1633294718,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 172800,
         "csc": "gutenberg/",
         "upstream": "https://gutenberg.pglaf.org/",
@@ -305,7 +305,7 @@
         "file": ""
     },
     "linuxmint_packages": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "linuxmint-packages/",
         "upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
@@ -366,5 +366,12 @@
         "csc": "MacPorts/mpdistfiles/",
         "upstream": "https://distfiles.macports.org/",
         "file": "ports.tar.gz"
+    },
+    "xiph": {
+        "out_of_sync_since": null,
+        "out_of_sync_interval": 86400,
+        "csc": "xiph/releases/",
+        "upstream": "https://ftp.osuosl.org/pub/xiph/releases/",
+        "file": ""
     }
 }
\ No newline at end of file
diff --git a/projects/xiph.py b/projects/xiph.py
new file mode 100644
index 0000000..0b2f9e7
--- /dev/null
+++ b/projects/xiph.py
@@ -0,0 +1,99 @@
+from bs4 import BeautifulSoup
+import requests
+import datefinder  # another date-finding library
+import re
+from datetime import datetime
+from project import Project
+from shared import CSC_MIRROR
+
+# this checker brute-force loops through the whole directory tree and checks dates
+# it may sound horrible, but for certain distros, I believe it's indeed the best solution
+
+class xiph(Project):
+    """xiph class"""
+    @staticmethod
+    def scrape(releases, site):
+        # fetch the directory listing
+        r = requests.get(site)
+
+        # parse the HTML
+        s = BeautifulSoup(r.text, "html.parser")
+
+        for i in s.find_all("a"):  # hrefs ending in "/" are subdirectories
+            href = i.attrs['href']
+
+            if href.endswith("/") and href != "../" and href != "/" and href != "/pub/xiph/" and not href.startswith("http://"):
+                if href not in releases:
+                    releases.append(href)
+
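+    # note: get_latest_date assumes autoindex-style listings (e.g. Apache or
+    # nginx), which print a modification timestamp beside each entry in one of
+    # two common styles:
+    #     16-Oct-2021 00:40
+    #     2021-10-16 00:40
+    # a listing that uses neither style is treated as having no dates at all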
+    @staticmethod
+    def get_latest_date(web_dir):
+        page = requests.get(web_dir).text
+
+        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
+        # with two alternative groups in the pattern, findall returns a
+        # (group1, group2) tuple per match, exactly one of which is non-empty
+        if len(str_dates) == 0:
+            return datetime(1000, 1, 1)  # ridiculously old date to discard this entry, since it has no dates
+
+        # join each tuple so only the non-empty group remains, then parse it
+        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
+
+        return max(dates)
+
+    @staticmethod
+    def get_checksum_date(directory_URL):
+        page = requests.get(directory_URL).text
+        file_index = page.find("SUMS.txt")
+
+        # remove stray numbers (file sizes in particular) that might interfere with date finding
+        segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:])  # bare size numbers
+        segment_clean = re.sub(r'\s\d+\w*\s', ' ', segment_clean)  # sizes with a unit, e.g. 50kb
+
+        # datefinder returns a generator; typecast it to a list and take the
+        # first date found after the checksum file's name
+        matches = list(datefinder.find_dates(segment_clean))
+
+        return matches[0]
+
+    @classmethod
+    def compare_release(cls, csc_dir, upstream_dir):
+        # prefer comparing the checksum file's timestamp when upstream has one
+        page = requests.get(upstream_dir).text
+        file_index = page.find("SUMS.txt")
+        if file_index == -1:
+            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)
+        else:
+            return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)
+
+    @classmethod
+    def check_mirror(cls, csc_url, upstream_url, releases):
+        compare = []
+        for release in releases:
+            compare.append(cls.compare_release(csc_url + release, upstream_url + release))
+        return all(compare)
+
+    @classmethod
+    def check(cls, data, project):
+        """Check if project packages are up-to-date"""
+        releases1 = []
+        releases2 = []
+
+        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
+        upstream_url = data[project]["upstream"] + data[project]["file"]
+
+        # collect the release directories on each side
+        cls.scrape(releases1, csc_url)
+        cls.scrape(releases2, upstream_url)
+
+        # a missing or extra directory already means the mirror is out of sync
+        if set(releases1) != set(releases2):
+            return False
+
+        return cls.check_mirror(csc_url, upstream_url, releases2)
\ No newline at end of file
diff --git a/test.py b/test.py
index 0f2c8ea..eeee7ce 100644
--- a/test.py
+++ b/test.py
@@ -7,7 +7,7 @@ from datetime import timedelta
 import time
 import pandas as pd
 import re # for salt stack specifically
-from projects import macports
+from projects import xiph
 import json # import json to read project info stored in json file
 
 # this function is brute force looping through the whole directory and checking dates
@@ -65,7 +65,7 @@ def get_latest_date(web_dir):
 if __name__ =="__main__":
     with open("data.json", "r", encoding="utf-8") as file:
         data = json.load(file)
-    print(macports.check(data, "macports"))
+    print(xiph.check(data, "xiph"))
 
     """# website to be scrape
     site="https://cdimage.ubuntu.com/releases/"
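For reference, the core comparison the new checker performs can be reproduced stand-alone. The sketch below is illustrative only, not code from this patch: it assumes the CSC mirror root https://mirror.csclub.uwaterloo.ca/ (the checker itself takes this from shared.CSC_MIRROR), and it swaps the datefinder dependency for datetime.strptime since only two timestamp formats are expected.

    import re
    import requests
    from datetime import datetime

    UPSTREAM = "https://ftp.osuosl.org/pub/xiph/releases/"
    CSC = "https://mirror.csclub.uwaterloo.ca/xiph/releases/"  # assumed mirror root

    # the two timestamp styles the checker's regex expects on autoindex pages
    DATE_RE = re.compile(r"\d{2}-\w{3}-\d{4} \d{2}:\d{2}|\d{4}-\d{2}-\d{2} \d{2}:\d{2}")
    FORMATS = ("%d-%b-%Y %H:%M", "%Y-%m-%d %H:%M")

    def newest_timestamp(listing_url):
        """Return the newest timestamp shown on a directory index page."""
        dates = []
        for stamp in DATE_RE.findall(requests.get(listing_url, timeout=30).text):
            for fmt in FORMATS:
                try:
                    dates.append(datetime.strptime(stamp, fmt))
                    break
                except ValueError:
                    continue
        # fall back to a ridiculously old date, mirroring xiph.get_latest_date
        return max(dates, default=datetime(1000, 1, 1))

    if __name__ == "__main__":
        release = "ogg/"  # any release directory present on both sides
        print(newest_timestamp(CSC + release) == newest_timestamp(UPSTREAM + release))

Comparing only the newest listed timestamp per release directory keeps the brute-force loop to one request per directory per side, which is why the patch scrapes the directory sets first and bails out early when they differ.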