diff --git a/README.md b/README.md index cfd0bcb..db02312 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,10 @@ even if the date relies on a specific file in their repo, we can still find the to find repos of the mirrored projects to check, just search "projectName mirrors" not done: -macPorts: only distfiles has public repo, no timestamp, too large to loop through +macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy -racket: https://mirror.racket-lang.org/installers/ no public repo, no timestamp, no mirror status tracker make sure that we have the latest version number under racket-installers x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker Xiph: no timestamp, too big to loop through, no status tracker @@ -71,6 +70,7 @@ openbsd parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/ pkgsrc qtproject: https://download.qt.io/ +racket: https://mirror.racket-lang.org/installers/ no public repo, no timestamp, no mirror status tracker make sure that we have the latest version number under racket-installers raspberry pi: https://archive.raspberrypi.org/ Checking the timestamp of either the Release file or the Packages file should suffice. raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, tho i'm not sure. 
also i think our mirror is completely outdated, it's not listed on official mirror list
 sagemath: same source tarballs as them (the sage-*.tar.gz files under 'Source Code')
diff --git a/data.json b/data.json
index df4f4b6..be92462 100644
--- a/data.json
+++ b/data.json
@@ -359,5 +359,12 @@
         "csc": "racket/racket-installers/",
         "upstream": "https://mirror.racket-lang.org/installers/",
         "file": ""
+    },
+    "macports": {
+        "out_of_sync_since": 1634339590,
+        "out_of_sync_interval": 86400,
+        "csc": "MacPorts/mpdistfiles/",
+        "upstream": "https://distfiles.macports.org/",
+        "file": "ports.tar.gz"
     }
 }
\ No newline at end of file
diff --git a/projects/macports.py b/projects/macports.py
new file mode 100644
index 0000000..fe1aaaa
--- /dev/null
+++ b/projects/macports.py
@@ -0,0 +1,73 @@
+import requests
+import re # import regular expressions to remove stray numbers in string that might interfere with date finding
+import json # import json to read project info stored in json file
+from project import Project
+from shared import CSC_MIRROR
+
+import datefinder # another date finding library
+
+class macports(Project):
+    """Check whether the CSC MacPorts distfiles mirror matches upstream.
+
+    The two pages are compared through the date each one shows after a
+    single reference file name (``ports.tar.gz`` in data.json).
+    """
+
+    @staticmethod
+    def checker(directory_URL, file_name):
+        """Return the first date that follows ``file_name`` on the page at ``directory_URL``.
+
+        Args:
+            directory_URL: URL of the page to download and search.
+            file_name: Name to look for in the page text; the search for a
+                date starts at its first occurrence.
+
+        Returns:
+            The first date yielded by ``datefinder.find_dates`` for the text
+            from ``file_name`` onwards.
+
+        Raises:
+            IndexError: If ``file_name`` does not occur in the page, or if no
+                date is found after it.
+        """
+        page = requests.get(directory_URL).text
+        file_index = page.find(file_name)
+        if file_index == -1:
+            # str.find() returns -1 on a miss; slicing from -1 would silently
+            # search only the last character of the page.
+            raise IndexError(
+                "{!r} not found in the page at {}".format(file_name, directory_URL)
+            )
+
+        # Drop standalone numeric tokens, with or without a unit suffix
+        # (e.g. "50" or "50kb"), so file sizes are not parsed as dates.
+        segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:])
+
+        # find_dates() returns a generator, so only the first hit is parsed.
+        first_date = next(datefinder.find_dates(segment_clean), None)
+        if first_date is None:
+            raise IndexError(
+                "no date found after {!r} in the page at {}".format(file_name, directory_URL)
+            )
+        return first_date
+
+    @classmethod
+    def check(cls, data, project):
+        """Check if project packages are up-to-date.
+
+        Args:
+            data: Mapping of project entries; ``data[project]`` must provide
+                the keys ``csc``, ``upstream`` and ``file``.
+            project: Key of the project entry in ``data``.
+
+        Returns:
+            True when the CSC mirror page and the upstream page yield the
+            same date for the reference file, False otherwise.
+        """
+        csc_url = CSC_MIRROR + data[project]["csc"]
+        upstream_url = data[project]["upstream"]
+        file_name = data[project]["file"]
+
+        return cls.checker(csc_url, file_name) == cls.checker(upstream_url, file_name)
diff --git a/test.py b/test.py
index c8b3949..0f2c8ea 100644
--- a/test.py
+++ b/test.py
@@ -7,7 +7,7 @@ from datetime import timedelta
 import time
 import pandas as pd
 import re # for salt stack specifically
-from projects import racket
+from projects import macports
 import json # import json to read project info stored in json file
 
 # this function is brute force looping through the whole directory and checking dates
@@ -65,7 +65,7 @@ def get_latest_date(web_dir):
 if __name__ =="__main__":
     with open("data.json", "r", encoding="utf-8") as file:
         data = json.load(file)
-    print(racket.check(data, "racket"))
+    print(macports.check(data, "macports"))
 
     """# website to be scrape
     site="https://cdimage.ubuntu.com/releases/"