forked from public/mirror-checker
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases
This commit is contained in:
parent
2399f32d39
commit
c76ae9c325
22
README.md
22
README.md
|
@ -19,24 +19,15 @@ even if the date relies on a specific file in their repo, we can still find the
|
||||||
to find repos of the mirrored projects to check, just search "projectName mirrors"
|
to find repos of the mirrored projects to check, just search "projectName mirrors"
|
||||||
|
|
||||||
not done:
|
not done:
|
||||||
damnsmalllinux: http://distro.ibiblio.org/damnsmall/ this project seems abandoned, candidate for brute force looping
|
|
||||||
debian-backports: no public repo, no timestamp, no mirror tracker
|
|
||||||
debian-volatile: no public repo, no timestamp, no mirror tracker
|
|
||||||
linuxmint: no public repo
|
|
||||||
linuxmint-packages pool: http://rsync-packages.linuxmint.com/pool/
|
|
||||||
macPorts: only distfiles has public repo, no timestamp, too large to loop through
|
macPorts: only distfiles has public repo, no timestamp, too large to loop through
|
||||||
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
|
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
|
||||||
opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
|
opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
|
||||||
puppylinux: https://distro.ibiblio.org/puppylinux/ has public repo, no timestamp, too hard to loop through, not likely to have a mirror tracker
|
puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
|
||||||
racket: no public repo, no timestamp, no mirror status tracker
|
racket: no public repo, no timestamp, no mirror status tracker
|
||||||
raspberry pi: https://archive.raspberrypi.org/ no timestamp, no mirror status tracker
|
|
||||||
sagemath: don't know how to deal with this, it's a website
|
sagemath: don't know how to deal with this, it's a website
|
||||||
salt stack: don't know how to deal with this, it's a website
|
salt stack: don't know how to deal with this, it's a website
|
||||||
scientific: https://scientificlinux.org/downloads/sl-mirrors/ would be easy to scrape the mirror status page, except that csc is not listed here
|
|
||||||
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker
|
|
||||||
x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
|
x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
|
||||||
Xiph: no timestamp, too big to loop through, no status tracker
|
Xiph: no timestamp, too big to loop through, no status tracker
|
||||||
xubuntu-releases
|
|
||||||
|
|
||||||
done:
|
done:
|
||||||
almalinux
|
almalinux
|
||||||
|
@ -50,11 +41,14 @@ CRAN: https://cran.r-project.org/mirmon_report.html has a mirror tracker
|
||||||
csclub: for now, this is the upstream itself, so it does not need to be checked
|
csclub: for now, this is the upstream itself, so it does not need to be checked
|
||||||
CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
|
CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
|
||||||
Cygwin
|
Cygwin
|
||||||
|
damnsmalllinux: http://distro.ibiblio.org/damnsmall/ not checking this, since it's abandoned
|
||||||
debian
|
debian
|
||||||
|
debian-backports: this is a legacy thing, no longer have to check
|
||||||
debian-cd
|
debian-cd
|
||||||
debian-multimedia
|
debian-multimedia
|
||||||
debian-ports
|
debian-ports
|
||||||
debian-security
|
debian-security
|
||||||
|
debian-volatile: this is a legacy thing, no longer have to check
|
||||||
eclipse
|
eclipse
|
||||||
emacsconf: for now, this is the upstream itself, so it does not need to be checked
|
emacsconf: for now, this is the upstream itself, so it does not need to be checked
|
||||||
fedora
|
fedora
|
||||||
|
@ -68,6 +62,8 @@ ipfire
|
||||||
kde
|
kde
|
||||||
kde-applicationdata
|
kde-applicationdata
|
||||||
kernel
|
kernel
|
||||||
|
linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
|
||||||
|
linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
|
||||||
manjaro
|
manjaro
|
||||||
mxlinux
|
mxlinux
|
||||||
mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
|
mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
|
||||||
|
@ -77,11 +73,15 @@ openbsd
|
||||||
parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
|
parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
|
||||||
pkgsrc
|
pkgsrc
|
||||||
qtproject: https://download.qt.io/
|
qtproject: https://download.qt.io/
|
||||||
|
raspberry pi: https://archive.raspberrypi.org/ Checking the timestamp of either the Release file or the Packages file should suffice.
|
||||||
raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, though I'm not sure. Also, I think our mirror is completely outdated; it's not listed on the official mirror list
|
raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, though I'm not sure. Also, I think our mirror is completely outdated; it's not listed on the official mirror list
|
||||||
|
scientific: https://scientificlinux.org/downloads/sl-mirrors/ not checking this one since it's abandoned
|
||||||
slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last updated date here, don't know if it's entirely accurate
|
slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last updated date here, don't know if it's entirely accurate
|
||||||
tdf: https://download.documentfoundation.org/
|
tdf: https://download.documentfoundation.org/
|
||||||
trisquel: https://trisquel.info/mirmon/index.html out of date website!? please recheck this!!!
|
trisquel: https://trisquel.info/mirmon/index.html out of date website!? please recheck this!!!
|
||||||
ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
|
ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
|
||||||
ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
|
ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
|
||||||
|
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
|
||||||
ubuntu-releases: https://releases.ubuntu.com/
|
ubuntu-releases: https://releases.ubuntu.com/
|
||||||
vlc: http://download.videolan.org/pub/videolan/
|
vlc: http://download.videolan.org/pub/videolan/
|
||||||
|
xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
|
41
data.json
41
data.json
|
@ -147,7 +147,7 @@
|
||||||
"file": "gutenberg.dcs"
|
"file": "gutenberg.dcs"
|
||||||
},
|
},
|
||||||
"IPFire": {
|
"IPFire": {
|
||||||
"out_of_sync_since": null,
|
"out_of_sync_since": 1634257890,
|
||||||
"out_of_sync_interval": 172800
|
"out_of_sync_interval": 172800
|
||||||
},
|
},
|
||||||
"KDE": {
|
"KDE": {
|
||||||
|
@ -228,7 +228,7 @@
|
||||||
"file": "last-updated.txt"
|
"file": "last-updated.txt"
|
||||||
},
|
},
|
||||||
"nongnu": {
|
"nongnu": {
|
||||||
"out_of_sync_since": 1633333607,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "nongnu/",
|
"csc": "nongnu/",
|
||||||
"upstream": "http://download-mirror.savannah.gnu.org/releases/",
|
"upstream": "http://download-mirror.savannah.gnu.org/releases/",
|
||||||
|
@ -249,7 +249,7 @@
|
||||||
"file": "MIRROR-TIMESTAMP"
|
"file": "MIRROR-TIMESTAMP"
|
||||||
},
|
},
|
||||||
"qtproject": {
|
"qtproject": {
|
||||||
"out_of_sync_since": null,
|
"out_of_sync_since": 1634247878,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "qtproject/",
|
"csc": "qtproject/",
|
||||||
"upstream": "https://download.qt.io/",
|
"upstream": "https://download.qt.io/",
|
||||||
|
@ -303,5 +303,40 @@
|
||||||
"csc": "",
|
"csc": "",
|
||||||
"upstream": "https://www.ctan.org/mirrors/mirmon",
|
"upstream": "https://www.ctan.org/mirrors/mirmon",
|
||||||
"file": ""
|
"file": ""
|
||||||
|
},
|
||||||
|
"linuxmint_packages": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "linuxmint-packages/",
|
||||||
|
"upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
|
||||||
|
"file": "dists/"
|
||||||
|
},
|
||||||
|
"raspberrypi": {
|
||||||
|
"out_of_sync_since": 1634249138,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "raspberrypi/debian/",
|
||||||
|
"upstream": "https://archive.raspberrypi.org/debian/",
|
||||||
|
"file": "dists/"
|
||||||
|
},
|
||||||
|
"ubuntu_ports_releases": {
|
||||||
|
"out_of_sync_since": 1634257890,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "ubuntu-ports-releases/",
|
||||||
|
"upstream": "https://cdimage.ubuntu.com/releases/",
|
||||||
|
"file": ""
|
||||||
|
},
|
||||||
|
"xubuntu_releases": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "xubuntu-releases/",
|
||||||
|
"upstream": "https://cdimage.ubuntu.com/xubuntu/releases/",
|
||||||
|
"file": ""
|
||||||
|
},
|
||||||
|
"linuxmint": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "linuxmint/",
|
||||||
|
"upstream": "https://mirrors.edge.kernel.org/linuxmint/",
|
||||||
|
"file": ""
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,85 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
# lists
|
||||||
|
urls=[]
|
||||||
|
|
||||||
|
class linuxmint(Project):
    """Check the CSC linuxmint mirror against its upstream.

    Brute-force strategy: recursively walk the whole directory tree on both
    the CSC mirror and the upstream, take the newest listing date seen
    anywhere in each tree, and report in-sync when the two dates match.
    """

    @classmethod
    def scrape(cls, urls, site):
        """Recursively append every subdirectory URL under `site` to `urls`."""
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # each <a href> is a directory-listing entry
            href = i.attrs['href']

            # Follow only relative subdirectory links.  Skipping the parent
            # link and absolute ("/..." or "http(s)://...") links avoids
            # building malformed URLs and recursing off-site — this matches
            # the guard the other brute-force checkers already use.
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # Recurse into the newly discovered directory.
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest 'DD-Mon-YYYY HH:MM' timestamp on one listing page."""
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # NOTE: with 2+ alternated groups the findall result would be tuples,
        # not strings — join the groups first if this pattern ever grows.
        if not str_dates:
            # No dates on this page: return a ridiculously old date so this
            # directory never wins max() over the others.
            return datetime(1000, 1, 1)
        return max(list(datefinder.find_dates(date))[0] for date in str_dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest date found across all directory pages in `urls`."""
        # Exactly one HTTP fetch per directory; the previous version fetched
        # urls[0] twice (once to seed the running maximum, once in the loop).
        return max(cls.get_latest_date(web_dir) for web_dir in urls)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []  # directories found on the CSC mirror
        urls2 = []  # directories found on the upstream mirror

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # Walk both trees.
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # In sync iff the newest file date matches on both sides.
        return cls.max_date(urls1) == cls.max_date(urls2)
|
|
@ -0,0 +1,55 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
class linuxmint_packages(Project):
    """Compare the linuxmint-packages mirror against upstream.

    Every suite directory under dists/ publishes a Release file; the mirror
    counts as up-to-date when each Release file's content matches upstream's.
    """

    @staticmethod
    def scrape(urls, site):
        """Append the Release-file URL of every suite directory listed at `site`."""
        listing = BeautifulSoup(requests.get(site).text, "html.parser")

        for anchor in listing.find_all("a"):
            link = anchor.attrs['href']
            # Only subdirectory entries matter; ignore the parent/root links.
            if not link.endswith("/") or link in ("../", "/"):
                continue
            release_url = site + link + "Release"
            if release_url not in urls:
                urls.append(release_url)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        info = data[project]
        csc_root = CSC_MIRROR + info["csc"] + info["file"]
        upstream_root = info["upstream"] + info["file"]

        csc_releases = []
        upstream_releases = []
        cls.scrape(csc_releases, csc_root)
        cls.scrape(upstream_releases, upstream_root)

        # Differing suite counts means the trees already disagree.
        if len(csc_releases) != len(upstream_releases):
            return False

        # Matching suffixes line up after sorting because each list shares a
        # single constant URL prefix.  Comparing file content is how the base
        # class does it, though comparing dates would be faster.
        pairs = zip(sorted(csc_releases), sorted(upstream_releases))
        return all(requests.get(ours).text == requests.get(theirs).text
                   for ours, theirs in pairs)
|
|
@ -0,0 +1,55 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
class raspberrypi(Project):
    """Compare the raspberrypi Debian-package mirror against upstream.

    Each suite directory under the apt tree publishes a Release file; the
    mirror is up-to-date when every Release file matches upstream's copy.
    """

    @staticmethod
    def scrape(urls, site):
        """Collect `site`+<suite>/Release URLs for every suite listed at `site`."""
        page = BeautifulSoup(requests.get(site).text, "html.parser")

        # Build the candidate Release URLs from the subdirectory anchors,
        # then append only the ones not already collected.
        candidates = [
            site + a.attrs['href'] + "Release"
            for a in page.find_all("a")
            if a.attrs['href'].endswith("/")
            and a.attrs['href'] not in ("../", "/")
        ]
        for url in candidates:
            if url not in urls:
                urls.append(url)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        entry = data[project]
        mirror_root = CSC_MIRROR + entry["csc"] + entry["file"]
        upstream_root = entry["upstream"] + entry["file"]

        mirror_files = []
        upstream_files = []
        cls.scrape(mirror_files, mirror_root)
        cls.scrape(upstream_files, upstream_root)

        # A different number of suites means the trees already diverge.
        if len(mirror_files) != len(upstream_files):
            return False

        mirror_files.sort()
        upstream_files.sort()

        # Pairwise content comparison; the sorted lists line up because each
        # list shares one constant URL prefix.  Content comparison mirrors
        # the base class, though date comparison would be faster.
        for ours, theirs in zip(mirror_files, upstream_files):
            if requests.get(ours).text != requests.get(theirs).text:
                return False
        return True
|
|
@ -0,0 +1,85 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
# lists
|
||||||
|
urls=[]
|
||||||
|
|
||||||
|
class ubuntu_ports_releases(Project):
    """Check the CSC ubuntu-ports-releases mirror against its upstream.

    Brute-force strategy: recursively walk the whole directory tree on both
    mirrors, take the newest listing date seen anywhere in each tree, and
    report in-sync when the two dates match.
    """

    @classmethod
    def scrape(cls, urls, site):
        """Recursively append every subdirectory URL under `site` to `urls`."""
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # each <a href> is a directory-listing entry
            href = i.attrs['href']

            # Follow only relative subdirectory links.  The https:// case was
            # previously unguarded, so absolute https links got concatenated
            # into malformed URLs.
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # Recurse into the newly discovered directory.
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest listing timestamp on one directory page."""
        page = requests.get(web_dir).text

        # With two alternated groups, findall yields (g1, g2) tuples in which
        # exactly one member is non-empty; "".join flattens each pair back to
        # the matched date string.
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        if not str_dates:
            # No dates on this page: return a ridiculously old date so this
            # directory never wins max() over the others.
            return datetime(1000, 1, 1)
        return max(list(datefinder.find_dates("".join(date)))[0]
                   for date in str_dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest date found across all directory pages in `urls`."""
        # Exactly one HTTP fetch per directory; the previous version fetched
        # urls[0] twice (once to seed the running maximum, once in the loop).
        return max(cls.get_latest_date(web_dir) for web_dir in urls)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []  # directories found on the CSC mirror
        urls2 = []  # directories found on the upstream mirror

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # Walk both trees.
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # In sync iff the newest file date matches on both sides.
        return cls.max_date(urls1) == cls.max_date(urls2)
|
|
@ -0,0 +1,85 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
# lists
|
||||||
|
urls=[]
|
||||||
|
|
||||||
|
class xubuntu_releases(Project):
    """Check the CSC xubuntu-releases mirror against its upstream.

    Brute-force strategy: recursively walk the whole directory tree on both
    mirrors, take the newest listing date seen anywhere in each tree, and
    report in-sync when the two dates match.
    """

    @classmethod
    def scrape(cls, urls, site):
        """Recursively append every subdirectory URL under `site` to `urls`."""
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # each <a href> is a directory-listing entry
            href = i.attrs['href']

            # Follow only relative subdirectory links.  The https:// case was
            # previously unguarded, so absolute https links got concatenated
            # into malformed URLs.
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # Recurse into the newly discovered directory.
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest listing timestamp on one directory page."""
        page = requests.get(web_dir).text

        # With two alternated groups, findall yields (g1, g2) tuples in which
        # exactly one member is non-empty; "".join flattens each pair back to
        # the matched date string.
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        if not str_dates:
            # No dates on this page: return a ridiculously old date so this
            # directory never wins max() over the others.
            return datetime(1000, 1, 1)
        return max(list(datefinder.find_dates("".join(date)))[0]
                   for date in str_dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest date found across all directory pages in `urls`."""
        # Exactly one HTTP fetch per directory; the previous version fetched
        # urls[0] twice (once to seed the running maximum, once in the loop).
        return max(cls.get_latest_date(web_dir) for web_dir in urls)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []  # directories found on the CSC mirror
        urls2 = []  # directories found on the upstream mirror

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # Walk both trees.
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # In sync iff the newest file date matches on both sides.
        return cls.max_date(urls1) == cls.max_date(urls2)
|
37
test.py
37
test.py
|
@ -6,6 +6,9 @@ from datetime import datetime
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
import time
|
import time
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import re # for salt stack specifically
|
||||||
|
from projects import linuxmint
|
||||||
|
import json # import json to read project info stored in json file
|
||||||
|
|
||||||
# this function is brute force looping through the whole directory and checking dates
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
@ -13,7 +16,7 @@ import pandas as pd
|
||||||
# lists
|
# lists
|
||||||
urls=[]
|
urls=[]
|
||||||
|
|
||||||
home_site = "http://ftp.netbsd.org/pub"
|
home_site = "https://cdimage.ubuntu.com"
|
||||||
|
|
||||||
# function created
|
# function created
|
||||||
def scrape(site):
|
def scrape(site):
|
||||||
|
@ -24,14 +27,18 @@ def scrape(site):
|
||||||
# converting the text
|
# converting the text
|
||||||
s = BeautifulSoup(r.text,"html.parser")
|
s = BeautifulSoup(r.text,"html.parser")
|
||||||
|
|
||||||
|
# salt stack specific code
|
||||||
|
# s = s.find("div", {"id": "listing"})
|
||||||
|
# print(s)
|
||||||
|
|
||||||
for i in s.find_all("a"): # for a href directories
|
for i in s.find_all("a"): # for a href directories
|
||||||
href = i.attrs['href']
|
href = i.attrs['href']
|
||||||
|
|
||||||
if href.endswith("/") and href != "../" and href != "/":
|
if href.endswith("/") and href != "../" and href != "/":
|
||||||
"""if home_site+href in urls: # avoids the link to parent directory
|
if home_site+href in urls: # avoids the link to parent directory
|
||||||
continue"""
|
|
||||||
if href == "//ftp.netbsd.org/": # netbsd specific code
|
|
||||||
continue
|
continue
|
||||||
|
"""if href == "//ftp.netbsd.org/": # netbsd specific code
|
||||||
|
continue"""
|
||||||
site_next = site+href
|
site_next = site+href
|
||||||
|
|
||||||
if site_next not in urls:
|
if site_next not in urls:
|
||||||
|
@ -41,24 +48,29 @@ def scrape(site):
|
||||||
scrape(site_next)
|
scrape(site_next)
|
||||||
|
|
||||||
def get_latest_date(web_dir):
|
def get_latest_date(web_dir):
|
||||||
page = requests.get(site).text
|
page = requests.get(web_dir).text
|
||||||
|
|
||||||
str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
|
str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
|
||||||
dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
|
# print(str_dates[0])
|
||||||
|
dates = [list(datefinder.find_dates(date[1]))[0] for date in str_dates]
|
||||||
|
|
||||||
# for date in dates:
|
# for date in dates:
|
||||||
# print(date)
|
# print(date)
|
||||||
|
|
||||||
|
if len(dates) == 0:
|
||||||
|
return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates
|
||||||
return(max(dates))
|
return(max(dates))
|
||||||
|
|
||||||
# main function
|
# main function
|
||||||
if __name__ =="__main__":
|
if __name__ =="__main__":
|
||||||
|
with open("data.json", "r", encoding="utf-8") as file:
|
||||||
|
data = json.load(file)
|
||||||
|
print(linuxmint.check(data, "linuxmint"))
|
||||||
|
|
||||||
"""# website to be scrape
|
"""# website to be scrape
|
||||||
site="http://ftp.netbsd.org/pub/NetBSD/"
|
site="https://cdimage.ubuntu.com/releases/"
|
||||||
# works on: https://www.x.org/releases/
|
# works on:
|
||||||
# https://mirror.csclub.uwaterloo.ca/linuxmint/ #works wonders for linuxmint
|
# https://www.x.org/releases/
|
||||||
# unfortunately, linuxmint does not have a public repo, the worldwide mirror LayerOnline on https://linuxmint.com/mirrors.php seems like the best choice
|
|
||||||
|
|
||||||
# calling function
|
# calling function
|
||||||
scrape(site)
|
scrape(site)
|
||||||
|
@ -71,9 +83,4 @@ if __name__ =="__main__":
|
||||||
latest_date = latest_date2
|
latest_date = latest_date2
|
||||||
|
|
||||||
print(latest_date)"""
|
print(latest_date)"""
|
||||||
|
|
||||||
csc_url = "https://mirror.csclub.uwaterloo.ca/ubuntu-ports/project/trace/anonster.canonical.com"
|
|
||||||
upstream_url = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
|
|
||||||
print(requests.get(upstream_url).text)
|
|
||||||
print(requests.get(csc_url).text == requests.get(upstream_url).text)
|
|
||||||
|
|
Loading…
Reference in New Issue