forked from public/mirror-checker
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases
This commit is contained in:
parent
2399f32d39
commit
c76ae9c325
22
README.md
22
README.md
|
@ -19,24 +19,15 @@ even if the date relies on a specific file in their repo, we can still find the
|
||||||
to find repos of the mirrored projects to check, just search "projectName mirrors"
|
to find repos of the mirrored projects to check, just search "projectName mirrors"
|
||||||
|
|
||||||
not done:
|
not done:
|
||||||
damnsmalllinux: http://distro.ibiblio.org/damnsmall/ this project seems abandoned, candidate for brute force looping
|
|
||||||
debian-backports: no public repo, no timestamp, no mirror tracker
|
|
||||||
debian-volatile: no public repo, no timestamp, no mirror tracker
|
|
||||||
linuxmint: no public repo
|
|
||||||
linuxmint-packages pool: http://rsync-packages.linuxmint.com/pool/
|
|
||||||
macPorts: only distfiles has public repo, no timestamp, too large to loop through
|
macPorts: only distfiles has public repo, no timestamp, too large to loop through
|
||||||
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
|
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
|
||||||
opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
|
opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
|
||||||
puppylinux: https://distro.ibiblio.org/puppylinux/ has public repo, no timestamp, too hard to loop through, not likely to have a mirror tracker
|
puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
|
||||||
racket: no public repo, no timestamp, no mirror status tracker
|
racket: no public repo, no timestamp, no mirror status tracker
|
||||||
raspberry pi: https://archive.raspberrypi.org/ no timestamp, no mirror status tracker
|
|
||||||
sagemath: don't know how to deal with this, it's a website
|
sagemath: don't know how to deal with this, it's a website
|
||||||
salt stack: don't know how to deal with this, it's a website
|
salt stack: don't know how to deal with this, it's a website
|
||||||
scientific: https://scientificlinux.org/downloads/sl-mirrors/ would be easy to scrape the mirror status page, except that csc is not listed here
|
|
||||||
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker
|
|
||||||
x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
|
x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
|
||||||
Xiph: no timestamp, too big to loop through, no status tracker
|
Xiph: no timestamp, too big to loop through, no status tracker
|
||||||
xubuntu-releases
|
|
||||||
|
|
||||||
done:
|
done:
|
||||||
almalinux
|
almalinux
|
||||||
|
@ -50,11 +41,14 @@ CRAN: https://cran.r-project.org/mirmon_report.html has a mirror tracker
|
||||||
csclub: for now, this is the upstream itself, so it does not need to be checked
|
csclub: for now, this is the upstream itself, so it does not need to be checked
|
||||||
CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
|
CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
|
||||||
Cygwin
|
Cygwin
|
||||||
|
damnsmalllinux: http://distro.ibiblio.org/damnsmall/ not checking this, since it's abandoned
|
||||||
debian
|
debian
|
||||||
|
debian-backports: this is a legacy thing, no longer have to check
|
||||||
debian-cd
|
debian-cd
|
||||||
debian-multimedia
|
debian-multimedia
|
||||||
debian-ports
|
debian-ports
|
||||||
debian-security
|
debian-security
|
||||||
|
debian-volatile: this is a legacy thing, no longer have to check
|
||||||
eclipse
|
eclipse
|
||||||
emacsconf: for now, this is the upstream itself, so it does not need to be checked
|
emacsconf: for now, this is the upstream itself, so it does not need to be checked
|
||||||
fedora
|
fedora
|
||||||
|
@ -68,6 +62,8 @@ ipfire
|
||||||
kde
|
kde
|
||||||
kde-applicationdata
|
kde-applicationdata
|
||||||
kernel
|
kernel
|
||||||
|
linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
|
||||||
|
linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
|
||||||
manjaro
|
manjaro
|
||||||
mxlinux
|
mxlinux
|
||||||
mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
|
mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
|
||||||
|
@ -77,11 +73,15 @@ openbsd
|
||||||
parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
|
parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
|
||||||
pkgsrc
|
pkgsrc
|
||||||
qtproject: https://download.qt.io/
|
qtproject: https://download.qt.io/
|
||||||
|
raspberry pi: https://archive.raspberrypi.org/ Checking the timestamp of either the Release file or the Packages file should suffice.
|
||||||
raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, though I'm not sure. Also, I think our mirror is completely outdated; it's not listed on the official mirror list
|
raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, though I'm not sure. Also, I think our mirror is completely outdated; it's not listed on the official mirror list
|
||||||
|
scientific: https://scientificlinux.org/downloads/sl-mirrors/ not checking this one since it's abandoned
|
||||||
slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last updated date here, don't know if it's entirely accurate
|
slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last updated date here, don't know if it's entirely accurate
|
||||||
tdf: https://download.documentfoundation.org/
|
tdf: https://download.documentfoundation.org/
|
||||||
trisquel: https://trisquel.info/mirmon/index.html out of date website!? please recheck this!!!
|
trisquel: https://trisquel.info/mirmon/index.html out of date website!? please recheck this!!!
|
||||||
ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
|
ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
|
||||||
ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
|
ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
|
||||||
|
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
|
||||||
ubuntu-releases: https://releases.ubuntu.com/
|
ubuntu-releases: https://releases.ubuntu.com/
|
||||||
vlc: http://download.videolan.org/pub/videolan/
|
vlc: http://download.videolan.org/pub/videolan/
|
||||||
|
xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
|
41
data.json
41
data.json
|
@ -147,7 +147,7 @@
|
||||||
"file": "gutenberg.dcs"
|
"file": "gutenberg.dcs"
|
||||||
},
|
},
|
||||||
"IPFire": {
|
"IPFire": {
|
||||||
"out_of_sync_since": null,
|
"out_of_sync_since": 1634257890,
|
||||||
"out_of_sync_interval": 172800
|
"out_of_sync_interval": 172800
|
||||||
},
|
},
|
||||||
"KDE": {
|
"KDE": {
|
||||||
|
@ -228,7 +228,7 @@
|
||||||
"file": "last-updated.txt"
|
"file": "last-updated.txt"
|
||||||
},
|
},
|
||||||
"nongnu": {
|
"nongnu": {
|
||||||
"out_of_sync_since": 1633333607,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "nongnu/",
|
"csc": "nongnu/",
|
||||||
"upstream": "http://download-mirror.savannah.gnu.org/releases/",
|
"upstream": "http://download-mirror.savannah.gnu.org/releases/",
|
||||||
|
@ -249,7 +249,7 @@
|
||||||
"file": "MIRROR-TIMESTAMP"
|
"file": "MIRROR-TIMESTAMP"
|
||||||
},
|
},
|
||||||
"qtproject": {
|
"qtproject": {
|
||||||
"out_of_sync_since": null,
|
"out_of_sync_since": 1634247878,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "qtproject/",
|
"csc": "qtproject/",
|
||||||
"upstream": "https://download.qt.io/",
|
"upstream": "https://download.qt.io/",
|
||||||
|
@ -303,5 +303,40 @@
|
||||||
"csc": "",
|
"csc": "",
|
||||||
"upstream": "https://www.ctan.org/mirrors/mirmon",
|
"upstream": "https://www.ctan.org/mirrors/mirmon",
|
||||||
"file": ""
|
"file": ""
|
||||||
|
},
|
||||||
|
"linuxmint_packages": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "linuxmint-packages/",
|
||||||
|
"upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
|
||||||
|
"file": "dists/"
|
||||||
|
},
|
||||||
|
"raspberrypi": {
|
||||||
|
"out_of_sync_since": 1634249138,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "raspberrypi/debian/",
|
||||||
|
"upstream": "https://archive.raspberrypi.org/debian/",
|
||||||
|
"file": "dists/"
|
||||||
|
},
|
||||||
|
"ubuntu_ports_releases": {
|
||||||
|
"out_of_sync_since": 1634257890,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "ubuntu-ports-releases/",
|
||||||
|
"upstream": "https://cdimage.ubuntu.com/releases/",
|
||||||
|
"file": ""
|
||||||
|
},
|
||||||
|
"xubuntu_releases": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "xubuntu-releases/",
|
||||||
|
"upstream": "https://cdimage.ubuntu.com/xubuntu/releases/",
|
||||||
|
"file": ""
|
||||||
|
},
|
||||||
|
"linuxmint": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "linuxmint/",
|
||||||
|
"upstream": "https://mirrors.edge.kernel.org/linuxmint/",
|
||||||
|
"file": ""
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,85 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
# lists
|
||||||
|
urls=[]
|
||||||
|
|
||||||
|
class linuxmint(Project):
    """Check the CSC linuxmint mirror against its upstream.

    Brute-force strategy: recursively walk the whole directory tree on both
    the CSC mirror and the upstream, take the newest listing date seen
    anywhere in each tree, and report in-sync when the two dates match.
    """

    @classmethod
    def scrape(cls, urls, site):
        """Recursively append every subdirectory URL under `site` to `urls`."""
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # each <a href> is a directory-listing entry
            href = i.attrs['href']

            # Follow only relative subdirectory links.  Skipping the parent
            # link and absolute ("/..." or "http(s)://...") links avoids
            # building malformed URLs and recursing off-site — this matches
            # the guard the other brute-force checkers already use.
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # Recurse into the newly discovered directory.
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest 'DD-Mon-YYYY HH:MM' timestamp on one listing page."""
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # NOTE: with 2+ alternated groups the findall result would be tuples,
        # not strings — join the groups first if this pattern ever grows.
        if not str_dates:
            # No dates on this page: return a ridiculously old date so this
            # directory never wins max() over the others.
            return datetime(1000, 1, 1)
        return max(list(datefinder.find_dates(date))[0] for date in str_dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest date found across all directory pages in `urls`."""
        # Exactly one HTTP fetch per directory; the previous version fetched
        # urls[0] twice (once to seed the running maximum, once in the loop).
        return max(cls.get_latest_date(web_dir) for web_dir in urls)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []  # directories found on the CSC mirror
        urls2 = []  # directories found on the upstream mirror

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # Walk both trees.
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # In sync iff the newest file date matches on both sides.
        return cls.max_date(urls1) == cls.max_date(urls2)
|
|
@ -0,0 +1,55 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
class linuxmint_packages(Project):
    """Compare the linuxmint-packages mirror against upstream.

    Every suite directory under dists/ publishes a Release file; the mirror
    counts as up-to-date when each Release file's content matches upstream's.
    """

    @staticmethod
    def scrape(urls, site):
        """Append the Release-file URL of every suite directory listed at `site`."""
        listing = BeautifulSoup(requests.get(site).text, "html.parser")

        for anchor in listing.find_all("a"):
            link = anchor.attrs['href']
            # Only subdirectory entries matter; ignore the parent/root links.
            if not link.endswith("/") or link in ("../", "/"):
                continue
            release_url = site + link + "Release"
            if release_url not in urls:
                urls.append(release_url)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        info = data[project]
        csc_root = CSC_MIRROR + info["csc"] + info["file"]
        upstream_root = info["upstream"] + info["file"]

        csc_releases = []
        upstream_releases = []
        cls.scrape(csc_releases, csc_root)
        cls.scrape(upstream_releases, upstream_root)

        # Differing suite counts means the trees already disagree.
        if len(csc_releases) != len(upstream_releases):
            return False

        # Matching suffixes line up after sorting because each list shares a
        # single constant URL prefix.  Comparing file content is how the base
        # class does it, though comparing dates would be faster.
        pairs = zip(sorted(csc_releases), sorted(upstream_releases))
        return all(requests.get(ours).text == requests.get(theirs).text
                   for ours, theirs in pairs)
|
|
@ -0,0 +1,55 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
class raspberrypi(Project):
    """Compare the raspberrypi Debian-package mirror against upstream.

    Each suite directory under the apt tree publishes a Release file; the
    mirror is up-to-date when every Release file matches upstream's copy.
    """

    @staticmethod
    def scrape(urls, site):
        """Collect `site`+<suite>/Release URLs for every suite listed at `site`."""
        page = BeautifulSoup(requests.get(site).text, "html.parser")

        # Build the candidate Release URLs from the subdirectory anchors,
        # then append only the ones not already collected.
        candidates = [
            site + a.attrs['href'] + "Release"
            for a in page.find_all("a")
            if a.attrs['href'].endswith("/")
            and a.attrs['href'] not in ("../", "/")
        ]
        for url in candidates:
            if url not in urls:
                urls.append(url)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        entry = data[project]
        mirror_root = CSC_MIRROR + entry["csc"] + entry["file"]
        upstream_root = entry["upstream"] + entry["file"]

        mirror_files = []
        upstream_files = []
        cls.scrape(mirror_files, mirror_root)
        cls.scrape(upstream_files, upstream_root)

        # A different number of suites means the trees already diverge.
        if len(mirror_files) != len(upstream_files):
            return False

        mirror_files.sort()
        upstream_files.sort()

        # Pairwise content comparison; the sorted lists line up because each
        # list shares one constant URL prefix.  Content comparison mirrors
        # the base class, though date comparison would be faster.
        for ours, theirs in zip(mirror_files, upstream_files):
            if requests.get(ours).text != requests.get(theirs).text:
                return False
        return True
|
|
@ -0,0 +1,85 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
# lists
|
||||||
|
urls=[]
|
||||||
|
|
||||||
|
class ubuntu_ports_releases(Project):
    """Check the CSC ubuntu-ports-releases mirror against its upstream.

    Brute-force strategy: recursively walk the whole directory tree on both
    mirrors, take the newest listing date seen anywhere in each tree, and
    report in-sync when the two dates match.
    """

    @classmethod
    def scrape(cls, urls, site):
        """Recursively append every subdirectory URL under `site` to `urls`."""
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # each <a href> is a directory-listing entry
            href = i.attrs['href']

            # Follow only relative subdirectory links.  The https:// case was
            # previously unguarded, so absolute https links got concatenated
            # into malformed URLs.
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # Recurse into the newly discovered directory.
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest listing timestamp on one directory page."""
        page = requests.get(web_dir).text

        # With two alternated groups, findall yields (g1, g2) tuples in which
        # exactly one member is non-empty; "".join flattens each pair back to
        # the matched date string.
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        if not str_dates:
            # No dates on this page: return a ridiculously old date so this
            # directory never wins max() over the others.
            return datetime(1000, 1, 1)
        return max(list(datefinder.find_dates("".join(date)))[0]
                   for date in str_dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest date found across all directory pages in `urls`."""
        # Exactly one HTTP fetch per directory; the previous version fetched
        # urls[0] twice (once to seed the running maximum, once in the loop).
        return max(cls.get_latest_date(web_dir) for web_dir in urls)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []  # directories found on the CSC mirror
        urls2 = []  # directories found on the upstream mirror

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # Walk both trees.
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # In sync iff the newest file date matches on both sides.
        return cls.max_date(urls1) == cls.max_date(urls2)
|
|
@ -0,0 +1,85 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
|
||||||
|
# lists
|
||||||
|
urls=[]
|
||||||
|
|
||||||
|
class xubuntu_releases(Project):
    """Check the CSC xubuntu-releases mirror against its upstream.

    Brute-force strategy: recursively walk the whole directory tree on both
    mirrors, take the newest listing date seen anywhere in each tree, and
    report in-sync when the two dates match.
    """

    @classmethod
    def scrape(cls, urls, site):
        """Recursively append every subdirectory URL under `site` to `urls`."""
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # each <a href> is a directory-listing entry
            href = i.attrs['href']

            # Follow only relative subdirectory links.  The https:// case was
            # previously unguarded, so absolute https links got concatenated
            # into malformed URLs.
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # Recurse into the newly discovered directory.
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest listing timestamp on one directory page."""
        page = requests.get(web_dir).text

        # With two alternated groups, findall yields (g1, g2) tuples in which
        # exactly one member is non-empty; "".join flattens each pair back to
        # the matched date string.
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        if not str_dates:
            # No dates on this page: return a ridiculously old date so this
            # directory never wins max() over the others.
            return datetime(1000, 1, 1)
        return max(list(datefinder.find_dates("".join(date)))[0]
                   for date in str_dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest date found across all directory pages in `urls`."""
        # Exactly one HTTP fetch per directory; the previous version fetched
        # urls[0] twice (once to seed the running maximum, once in the loop).
        return max(cls.get_latest_date(web_dir) for web_dir in urls)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []  # directories found on the CSC mirror
        urls2 = []  # directories found on the upstream mirror

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # Walk both trees.
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # In sync iff the newest file date matches on both sides.
        return cls.max_date(urls1) == cls.max_date(urls2)
|
37
test.py
37
test.py
|
@ -6,6 +6,9 @@ from datetime import datetime
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
import time
|
import time
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import re # for salt stack specifically
|
||||||
|
from projects import linuxmint
|
||||||
|
import json # import json to read project info stored in json file
|
||||||
|
|
||||||
# this function is brute force looping through the whole directory and checking dates
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
|
||||||
|
@ -13,7 +16,7 @@ import pandas as pd
|
||||||
# lists
|
# lists
|
||||||
urls=[]
|
urls=[]
|
||||||
|
|
||||||
home_site = "http://ftp.netbsd.org/pub"
|
home_site = "https://cdimage.ubuntu.com"
|
||||||
|
|
||||||
# function created
|
# function created
|
||||||
def scrape(site):
|
def scrape(site):
|
||||||
|
@ -24,14 +27,18 @@ def scrape(site):
|
||||||
# converting the text
|
# converting the text
|
||||||
s = BeautifulSoup(r.text,"html.parser")
|
s = BeautifulSoup(r.text,"html.parser")
|
||||||
|
|
||||||
|
# salt stack specific code
|
||||||
|
# s = s.find("div", {"id": "listing"})
|
||||||
|
# print(s)
|
||||||
|
|
||||||
for i in s.find_all("a"): # for a href directories
|
for i in s.find_all("a"): # for a href directories
|
||||||
href = i.attrs['href']
|
href = i.attrs['href']
|
||||||
|
|
||||||
if href.endswith("/") and href != "../" and href != "/":
|
if href.endswith("/") and href != "../" and href != "/":
|
||||||
"""if home_site+href in urls: # avoids the link to parent directory
|
if home_site+href in urls: # avoids the link to parent directory
|
||||||
continue"""
|
|
||||||
if href == "//ftp.netbsd.org/": # netbsd specific code
|
|
||||||
continue
|
continue
|
||||||
|
"""if href == "//ftp.netbsd.org/": # netbsd specific code
|
||||||
|
continue"""
|
||||||
site_next = site+href
|
site_next = site+href
|
||||||
|
|
||||||
if site_next not in urls:
|
if site_next not in urls:
|
||||||
|
@ -41,24 +48,29 @@ def scrape(site):
|
||||||
scrape(site_next)
|
scrape(site_next)
|
||||||
|
|
||||||
def get_latest_date(web_dir):
|
def get_latest_date(web_dir):
|
||||||
page = requests.get(site).text
|
page = requests.get(web_dir).text
|
||||||
|
|
||||||
str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
|
str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
|
||||||
dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
|
# print(str_dates[0])
|
||||||
|
dates = [list(datefinder.find_dates(date[1]))[0] for date in str_dates]
|
||||||
|
|
||||||
# for date in dates:
|
# for date in dates:
|
||||||
# print(date)
|
# print(date)
|
||||||
|
|
||||||
|
if len(dates) == 0:
|
||||||
|
return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates
|
||||||
return(max(dates))
|
return(max(dates))
|
||||||
|
|
||||||
# main function
|
# main function
|
||||||
if __name__ =="__main__":
|
if __name__ =="__main__":
|
||||||
|
with open("data.json", "r", encoding="utf-8") as file:
|
||||||
|
data = json.load(file)
|
||||||
|
print(linuxmint.check(data, "linuxmint"))
|
||||||
|
|
||||||
"""# website to be scrape
|
"""# website to be scrape
|
||||||
site="http://ftp.netbsd.org/pub/NetBSD/"
|
site="https://cdimage.ubuntu.com/releases/"
|
||||||
# works on: https://www.x.org/releases/
|
# works on:
|
||||||
# https://mirror.csclub.uwaterloo.ca/linuxmint/ #works wonders for linuxmint
|
# https://www.x.org/releases/
|
||||||
# unfortunately, linuxmint does not have a public repo, the worldwide mirror LayerOnline on https://linuxmint.com/mirrors.php seems like the best choice
|
|
||||||
|
|
||||||
# calling function
|
# calling function
|
||||||
scrape(site)
|
scrape(site)
|
||||||
|
@ -71,9 +83,4 @@ if __name__ =="__main__":
|
||||||
latest_date = latest_date2
|
latest_date = latest_date2
|
||||||
|
|
||||||
print(latest_date)"""
|
print(latest_date)"""
|
||||||
|
|
||||||
csc_url = "https://mirror.csclub.uwaterloo.ca/ubuntu-ports/project/trace/anonster.canonical.com"
|
|
||||||
upstream_url = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
|
|
||||||
print(requests.get(upstream_url).text)
|
|
||||||
print(requests.get(csc_url).text == requests.get(upstream_url).text)
|
|
||||||
|
|
Loading…
Reference in New Issue