added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases

Tom 2021-10-14 18:22:39 -07:00
parent 2399f32d39
commit c76ae9c325
8 changed files with 436 additions and 29 deletions


@@ -19,24 +19,15 @@ even if the date relies on a specific file in their repo, we can still find the
to find repos of the mirrored projects to check, just search "projectName mirrors"
not done:
damnsmalllinux: http://distro.ibiblio.org/damnsmall/ this project seems abandoned; a candidate for brute-force looping
debian-backports: no public repo, no timestamp, no mirror tracker
debian-volatile: no public repo, no timestamp, no mirror tracker
linuxmint: no public repo
linuxmint-packages pool: http://rsync-packages.linuxmint.com/pool/
macPorts: only distfiles has a public repo, no timestamp, too large to loop through
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has a public repo, no timestamp, the web directory is hard to loop through, no mirror tracker
opensuse: http://download.opensuse.org/ has a public repo and a possible timestamp called latest in history, though our mirror doesn't have this file; no mirror tracker
puppylinux: https://distro.ibiblio.org/puppylinux/ has public repo, no timestamp, too hard to loop through, not likely to have a mirror tracker
puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
racket: no public repo, no timestamp, no mirror status tracker
raspberry pi: https://archive.raspberrypi.org/ no timestamp, no mirror status tracker
sagemath: don't know how to deal with this, it's a website
salt stack: don't know how to deal with this, it's a website
scientific: https://scientificlinux.org/downloads/sl-mirrors/ would be easy to scrape the mirror status page, except that csc is not listed here
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has a public repo, no timestamp, no status tracker
x.org: https://www.x.org/releases/ no timestamp, but a candidate for brute-force looping since it has few folders; no status tracker
Xiph: no timestamp, too big to loop through, no status tracker
xubuntu-releases
done:
almalinux
@@ -50,11 +41,14 @@ CRAN: https://cran.r-project.org/mirmon_report.html has a mirror tracker
csclub: for now, this is the upstream itself, so it need not be checked
CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
Cygwin
damnsmalllinux: http://distro.ibiblio.org/damnsmall/ not checking this, since it's abandoned
debian
debian-backports: this is legacy; it no longer needs to be checked
debian-cd
debian-multimedia
debian-ports
debian-security
debian-volatile: this is legacy; it no longer needs to be checked
eclipse
emacsconf: for now, this is the upstream itself, so it need not be checked
fedora
@@ -68,6 +62,8 @@ ipfire
kde
kde-applicationdata
kernel
linuxmint: https://mirrors.edge.kernel.org/linuxmint/ a candidate for brute-force looping
linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice (see the sketch after this list).
manjaro
mxlinux
mxlinux-iso: this one seems to have been out of sync on the official tracker for 134 days, which is weird
@@ -77,11 +73,15 @@ openbsd
parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
pkgsrc
qtproject: https://download.qt.io/
raspberry pi: https://archive.raspberrypi.org/ Checking the timestamp of either the Release file or the Packages file should suffice (same sketch after this list).
raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, though I'm not sure. Also, I think our mirror is completely outdated; it's not listed on the official mirror list.
scientific: https://scientificlinux.org/downloads/sl-mirrors/ not checking this one since it's abandoned
slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last-updated date here; not sure it's entirely accurate
tdf: https://download.documentfoundation.org/
trisquel: https://trisquel.info/mirmon/index.html out of date website!? please recheck this!!!
ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has a public repo, no timestamp, no status tracker; brute-force looped it
ubuntu-releases: https://releases.ubuntu.com/
vlc: http://download.videolan.org/pub/videolan/
xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ a candidate for brute-force looping since it has few folders
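For the Debian-style repos above (linuxmint-packages, raspberry pi), the Release-file check could look roughly like the sketch below. It assumes the standard RFC-2822 Date: field of a Debian Release file and the dists/<codename>/Release layout; the codename in the usage comment is illustrative.

import re
import requests
from email.utils import parsedate_to_datetime

def release_timestamp(base, dist):
    # fetch dists/<dist>/Release and parse its 'Date:' line
    text = requests.get(base + "dists/" + dist + "/Release").text
    match = re.search(r"^Date: (.+)$", text, re.MULTILINE)
    return parsedate_to_datetime(match.group(1)) if match else None

# compare mirror vs upstream for one dist ("ulyssa" is just an example codename):
# release_timestamp("https://mirror.csclub.uwaterloo.ca/linuxmint-packages/", "ulyssa")
# release_timestamp("https://mirrors.edge.kernel.org/linuxmint-packages/", "ulyssa")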

data.json

@@ -147,7 +147,7 @@
        "file": "gutenberg.dcs"
    },
    "IPFire": {
        "out_of_sync_since": null,
        "out_of_sync_since": 1634257890,
        "out_of_sync_interval": 172800
    },
    "KDE": {
@@ -228,7 +228,7 @@
        "file": "last-updated.txt"
    },
    "nongnu": {
        "out_of_sync_since": 1633333607,
        "out_of_sync_since": null,
        "out_of_sync_interval": 86400,
        "csc": "nongnu/",
        "upstream": "http://download-mirror.savannah.gnu.org/releases/",
@@ -249,7 +249,7 @@
        "file": "MIRROR-TIMESTAMP"
    },
    "qtproject": {
        "out_of_sync_since": null,
        "out_of_sync_since": 1634247878,
        "out_of_sync_interval": 86400,
        "csc": "qtproject/",
        "upstream": "https://download.qt.io/",
@@ -303,5 +303,40 @@
        "csc": "",
        "upstream": "https://www.ctan.org/mirrors/mirmon",
        "file": ""
    },
    "linuxmint_packages": {
        "out_of_sync_since": null,
        "out_of_sync_interval": 86400,
        "csc": "linuxmint-packages/",
        "upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
        "file": "dists/"
    },
    "raspberrypi": {
        "out_of_sync_since": 1634249138,
        "out_of_sync_interval": 86400,
        "csc": "raspberrypi/debian/",
        "upstream": "https://archive.raspberrypi.org/debian/",
        "file": "dists/"
    },
    "ubuntu_ports_releases": {
        "out_of_sync_since": 1634257890,
        "out_of_sync_interval": 86400,
        "csc": "ubuntu-ports-releases/",
        "upstream": "https://cdimage.ubuntu.com/releases/",
        "file": ""
    },
    "xubuntu_releases": {
        "out_of_sync_since": null,
        "out_of_sync_interval": 86400,
        "csc": "xubuntu-releases/",
        "upstream": "https://cdimage.ubuntu.com/xubuntu/releases/",
        "file": ""
    },
    "linuxmint": {
        "out_of_sync_since": null,
        "out_of_sync_interval": 86400,
        "csc": "linuxmint/",
        "upstream": "https://mirrors.edge.kernel.org/linuxmint/",
        "file": ""
    }
}
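Reading the schema from the diff: out_of_sync_since flips between null (in sync) and a Unix timestamp (when the first failing check was seen), and out_of_sync_interval is a grace period in seconds before a project is reported. The bookkeeping presumably looks something like this sketch; update_status is a hypothetical name, and the exact semantics should be confirmed against the checker's main loop.

import time

def update_status(entry, in_sync):
    # entry is one value from data.json, e.g. data["linuxmint_packages"]
    if in_sync:
        entry["out_of_sync_since"] = None
        return False  # nothing to report
    if entry["out_of_sync_since"] is None:
        entry["out_of_sync_since"] = int(time.time())  # first failure observed
    # report only once the project has been failing longer than its grace period
    return time.time() - entry["out_of_sync_since"] > entry["out_of_sync_interval"]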

projects/linuxmint.py Normal file

@@ -0,0 +1,85 @@
from bs4 import BeautifulSoup
import requests
import datefinder  # another date-finding library
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# this class brute-force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, I believe it's indeed the best solution


class linuxmint(Project):
    """linuxmint class"""

    @classmethod
    def scrape(cls, urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the html
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # for each <a href> that is a directory
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    # recurse into the subdirectory
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # if you want to match 1+ patterns, like
        # r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})',
        # note that findall will return a tuple of two groups!
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry, since it has no dates
            return datetime(1000, 1, 1)
        dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        # track the newest listing date seen across all crawled directories
        latest_date = cls.get_latest_date(urls[0])
        for web_dir in urls:
            latest_date2 = cls.get_latest_date(web_dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # crawl both trees, then compare their newest dates
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        return cls.max_date(urls1) == cls.max_date(urls2)
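The recursive scrape above works, but an explicit stack makes the crawl easier to bound and sidesteps deep recursion on large trees. A depth-first sketch; the max_depth cap is an assumption, not something the repo has:

import requests
from bs4 import BeautifulSoup

def scrape_iterative(root, max_depth=10):
    # collect subdirectory URLs depth-first; max_depth is a hypothetical cap
    urls, frontier = [], [(root, 0)]
    while frontier:
        site, depth = frontier.pop()
        if depth >= max_depth:
            continue
        soup = BeautifulSoup(requests.get(site).text, "html.parser")
        for a in soup.find_all("a"):
            href = a.attrs.get("href", "")
            if href.endswith("/") and href not in ("../", "/"):
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    frontier.append((site_next, depth + 1))
    return urls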

projects/linuxmint_packages.py Normal file

@@ -0,0 +1,55 @@
from bs4 import BeautifulSoup
import requests

from project import Project
from shared import CSC_MIRROR

# this class brute-force loops through dists/ and compares the Release files
# it may sound horrible, but for certain distros, I believe it's indeed the best solution


class linuxmint_packages(Project):
    """linuxmint_packages class"""

    @staticmethod
    def scrape(urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the html
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # for each <a href> that is a directory
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/":
                # collect the Release file of each dist
                site_next = site + href + "Release"
                if site_next not in urls:
                    urls.append(site_next)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        if len(urls1) != len(urls2):
            return False
        urls1.sort()
        urls2.sort()
        for index, f in enumerate(urls1):
            # comparing the file content because that's how the base class does it,
            # but we could speed this up by just comparing the dates (see the sketch below)
            if requests.get(f).text != requests.get(urls2[index]).text:
                return False
        return True
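On the speedup mentioned in check above: rather than downloading every Release file twice, one could compare cheap metadata. A sketch using the HTTP Last-Modified header via HEAD requests; this assumes both servers send the header and that the mirror preserves mtimes (rsync normally does), so verify before switching.

import requests

def last_modified(url):
    # ask the server for the file's modification time instead of its body
    return requests.head(url, allow_redirects=True).headers.get("Last-Modified")

# in check(), the body comparison could then become:
# if last_modified(f) != last_modified(urls2[index]):
#     return False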

projects/raspberrypi.py Normal file

@@ -0,0 +1,55 @@
from bs4 import BeautifulSoup
import requests

from project import Project
from shared import CSC_MIRROR

# this class brute-force loops through dists/ and compares the Release files
# it may sound horrible, but for certain distros, I believe it's indeed the best solution


class raspberrypi(Project):
    """raspberrypi class"""

    @staticmethod
    def scrape(urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the html
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # for each <a href> that is a directory
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/":
                # collect the Release file of each dist
                site_next = site + href + "Release"
                if site_next not in urls:
                    urls.append(site_next)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        if len(urls1) != len(urls2):
            return False
        urls1.sort()
        urls2.sort()
        for index, f in enumerate(urls1):
            # comparing the file content because that's how the base class does it,
            # but we could speed this up by just comparing the dates
            # (same idea as the sketch after linuxmint_packages)
            if requests.get(f).text != requests.get(urls2[index]).text:
                return False
        return True

projects/ubuntu_ports_releases.py Normal file

@@ -0,0 +1,85 @@
from bs4 import BeautifulSoup
import requests
import datefinder  # another date-finding library
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# this class brute-force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, I believe it's indeed the best solution


class ubuntu_ports_releases(Project):
    """ubuntu_ports_releases class"""

    @classmethod
    def scrape(cls, urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the html
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # for each <a href> that is a directory
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/" \
                    and not href.startswith("/") and not href.startswith("http://"):
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    # recurse into the subdirectory
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # with two alternated patterns, findall returns a tuple of two groups,
        # one of them empty -- hence the "".join below (demonstrated after this file)
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry, since it has no dates
            return datetime(1000, 1, 1)
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        # track the newest listing date seen across all crawled directories
        latest_date = cls.get_latest_date(urls[0])
        for web_dir in urls:
            latest_date2 = cls.get_latest_date(web_dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # crawl both trees, then compare their newest dates
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        return cls.max_date(urls1) == cls.max_date(urls2)
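A quick demonstration of the findall pitfall noted in get_latest_date: with two alternated capture groups, each match comes back as a 2-tuple whose non-matching side is empty, which is why the code rebuilds each date with "".join(date).

import re

pattern = r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})'
listing = "ubuntu-21.10/    2021-10-14 18:22    -"
print(re.findall(pattern, listing))
# [('', '2021-10-14 18:22')] -- one empty group per tuple, hence "".join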

projects/xubuntu_releases.py Normal file

@@ -0,0 +1,85 @@
from bs4 import BeautifulSoup
import requests
import datefinder  # another date-finding library
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# this class brute-force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, I believe it's indeed the best solution


class xubuntu_releases(Project):
    """xubuntu_releases class"""

    @classmethod
    def scrape(cls, urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the html
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # for each <a href> that is a directory
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/" \
                    and not href.startswith("/") and not href.startswith("http://"):
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    # recurse into the subdirectory
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # with two alternated patterns, findall returns a tuple of two groups,
        # one of them empty -- hence the "".join below
        # (demonstrated after ubuntu_ports_releases above)
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry, since it has no dates
            return datetime(1000, 1, 1)
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        # track the newest listing date seen across all crawled directories
        latest_date = cls.get_latest_date(urls[0])
        for web_dir in urls:
            latest_date2 = cls.get_latest_date(web_dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # crawl both trees, then compare their newest dates
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        return cls.max_date(urls1) == cls.max_date(urls2)
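xubuntu_releases is byte-for-byte identical to ubuntu_ports_releases, and raspberrypi to linuxmint_packages, apart from the class names. Since every method is a classmethod or staticmethod, the duplication could be collapsed by inheritance; a hypothetical refactor, not part of this commit (the projects import path assumes the package re-exports the classes, as test.py's "from projects import linuxmint" suggests):

from projects import linuxmint_packages, ubuntu_ports_releases

class raspberrypi(linuxmint_packages):
    """Identical Release-file comparison; only the data.json entry differs."""

class xubuntu_releases(ubuntu_ports_releases):
    """Identical brute-force crawl; only the data.json entry differs."""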

test.py

@@ -6,6 +6,9 @@ from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
import re # for salt stack specifically
from projects import linuxmint
import json  # to read project info stored in the json file
# this function brute-force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, I believe it's indeed the best solution
@@ -13,7 +16,7 @@ import pandas as pd
# lists
urls = []
home_site = "http://ftp.netbsd.org/pub"
home_site = "https://cdimage.ubuntu.com"
# function created
def scrape(site):
@@ -24,14 +27,18 @@ def scrape(site):
    # converting the text
    s = BeautifulSoup(r.text, "html.parser")
    # salt stack specific code
    # s = s.find("div", {"id": "listing"})
    # print(s)
    for i in s.find_all("a"):  # for a href directories
        href = i.attrs['href']
        if href.endswith("/") and href != "../" and href != "/":
            """if home_site+href in urls: # avoids the link to parent directory
                continue"""
            if home_site + href in urls:  # avoids the link to parent directory
                continue
            """if href == "//ftp.netbsd.org/": # netbsd specific code
                continue"""
            site_next = site + href
            if site_next not in urls:
@@ -41,24 +48,29 @@ def scrape(site):
                scrape(site_next)

def get_latest_date(web_dir):
    page = requests.get(site).text
    page = requests.get(web_dir).text
    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
    dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
    # print(str_dates[0])
    dates = [list(datefinder.find_dates(date[1]))[0] for date in str_dates]
    # for date in dates:
    #     print(date)
    if len(dates) == 0:
        return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
    return max(dates)
# main function
if __name__ == "__main__":
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    print(linuxmint.check(data, "linuxmint"))

    """# website to be scraped
    site = "http://ftp.netbsd.org/pub/NetBSD/"
    # works on: https://www.x.org/releases/
    # https://mirror.csclub.uwaterloo.ca/linuxmint/ # works wonders for linuxmint
    # unfortunately, linuxmint does not have a public repo; the worldwide mirror LayerOnline on https://linuxmint.com/mirrors.php seems like the best choice
    site = "https://cdimage.ubuntu.com/releases/"
    # works on:
    # https://www.x.org/releases/

    # calling function
    scrape(site)
@@ -71,9 +83,4 @@ if __name__ == "__main__":
            latest_date = latest_date2
        print(latest_date)"""

    csc_url = "https://mirror.csclub.uwaterloo.ca/ubuntu-ports/project/trace/anonster.canonical.com"
    upstream_url = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
    print(requests.get(upstream_url).text)
    print(requests.get(csc_url).text == requests.get(upstream_url).text)
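The removed lines above compared the two anonster.canonical.com trace files textually. Per the checklist note, the file appears to be a timestamp; Canonical-style mirror trace files typically hold one line of `date -u` output, so it could also be parsed and compared as a datetime. A sketch under that assumption; the format string is unverified, so check it against the real file first.

from datetime import datetime
import requests

def trace_time(url):
    # parse the first line of a mirror trace file, e.g. 'Thu Oct 14 18:22:39 UTC 2021'
    first_line = requests.get(url).text.splitlines()[0]
    return datetime.strptime(first_line, "%a %b %d %H:%M:%S UTC %Y")

# trace_time("http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com")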