added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases
parent 2399f32d39
commit c76ae9c325

README.md (22 lines changed)
@ -19,24 +19,15 @@ even if the date relies on a specific file in their repo, we can still find the

to find repos of the mirrored projects to check, just search "projectName mirrors"

not done:
-damnsmalllinux: http://distro.ibiblio.org/damnsmall/ this project seems abandoned, candidate for brute force looping
-debian-backports: no public repo, no timestamp, no mirror tracker
-debian-volatile: no public repo, no timestamp, no mirror tracker
-linuxmint: no public repo
-linuxmint-packages pool: http://rsync-packages.linuxmint.com/pool/
macPorts: only distfiles has a public repo, no timestamp, too large to loop through
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has a public repo, no timestamp, web directory is hard to loop through, no mirror tracker
opensuse: http://download.opensuse.org/ has a public repo and a possible timestamp called "latest" in history, though our mirror doesn't have that file; no mirror tracker
-puppylinux: https://distro.ibiblio.org/puppylinux/ has a public repo, no timestamp, too hard to loop through, not likely to have a mirror tracker
+puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with "puppy"
racket: no public repo, no timestamp, no mirror status tracker
-raspberry pi: https://archive.raspberrypi.org/ no timestamp, no mirror status tracker
sagemath: don't know how to deal with this, it's a website
salt stack: don't know how to deal with this, it's a website
-scientific: https://scientificlinux.org/downloads/sl-mirrors/ would be easy to scrape the mirror status page, except that csc is not listed there
-ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has a public repo, no timestamp, no status tracker
x.org: https://www.x.org/releases/ no timestamp, but a candidate for brute force looping since it has few folders; no status tracker
Xiph: no timestamp, too big to loop through, no status tracker
-xubuntu-releases

done:
almalinux
@ -50,11 +41,14 @@ CRAN: https://cran.r-project.org/mirmon_report.html has a mirror tracker

csclub: for now, this is the upstream itself, so it does not need to be checked
CTAN: https://www.ctan.org/mirrors/mirmon has a mirror tracker
Cygwin
+damnsmalllinux: http://distro.ibiblio.org/damnsmall/ not checking this, since it's abandoned
debian
+debian-backports: this is a legacy archive, it no longer has to be checked
debian-cd
debian-multimedia
debian-ports
debian-security
+debian-volatile: this is a legacy archive, it no longer has to be checked
eclipse
emacsconf: for now, this is the upstream itself, so it does not need to be checked
fedora
@ -68,6 +62,8 @@ ipfire

kde
kde-applicationdata
kernel
+linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
+linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ checking the timestamp of either the Release file or the Packages file should suffice (see the sketch after this list)
manjaro
mxlinux
mxlinux-iso: this one has been out of sync on the official tracker for 134 days, which is weird
@ -77,11 +73,15 @@ openbsd

parabola: https://repo.parabola.nu/ https://www.parabola.nu/mirrors/status/
pkgsrc
qtproject: https://download.qt.io/
+raspberry pi: https://archive.raspberrypi.org/ checking the timestamp of either the Release file or the Packages file should suffice (see the sketch after this list)
raspbian: http://archive.raspbian.org/raspbian/ snapshotindex.txt is most likely a timestamp, though I'm not sure; also, I think our mirror is completely outdated, since it's not listed on the official mirror list
+scientific: https://scientificlinux.org/downloads/sl-mirrors/ not checking this one since it's abandoned
slackware: https://mirrors.slackware.com/mirrorlist/ https://mirrors.slackware.com/slackware/ checking using the last-updated date here, don't know if it's entirely accurate
tdf: https://download.documentfoundation.org/
trisquel: https://trisquel.info/mirmon/index.html out-of-date website!? please recheck this!!!
ubuntu: https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.canonical.com, which appears to be a timestamp (check it to make sure!!!)
+ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has a public repo, no timestamp, no status tracker; brute force looped it
ubuntu-releases: https://releases.ubuntu.com/
vlc: http://download.videolan.org/pub/videolan/
+xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
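A minimal sketch of the Release-file timestamp check mentioned for linuxmint-packages and raspberry pi in the list above (not part of this commit): fetch dists/<suite>/Release from both our mirror and upstream and compare the Date: field. The suite name and helper names here are placeholders.

# sketch only: compare the "Date:" field of a Debian-style Release file
# on the csc mirror against upstream; the suite name is a placeholder
import requests
from email.utils import parsedate_to_datetime

def release_date(base_url, suite):
    text = requests.get(f"{base_url}dists/{suite}/Release").text
    for line in text.splitlines():
        if line.startswith("Date:"):
            return parsedate_to_datetime(line.split(":", 1)[1].strip())
    return None  # no Date field found

def release_up_to_date(csc_base, upstream_base, suite):
    return release_date(csc_base, suite) == release_date(upstream_base, suite)

# hypothetical usage:
# release_up_to_date("https://mirror.csclub.uwaterloo.ca/linuxmint-packages/",
#                    "https://mirrors.edge.kernel.org/linuxmint-packages/", "<suite>")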
data.json (41 lines changed)
@ -147,7 +147,7 @@

    "file": "gutenberg.dcs"
  },
  "IPFire": {
-    "out_of_sync_since": null,
+    "out_of_sync_since": 1634257890,
    "out_of_sync_interval": 172800
  },
  "KDE": {
@ -228,7 +228,7 @@

    "file": "last-updated.txt"
  },
  "nongnu": {
-    "out_of_sync_since": 1633333607,
+    "out_of_sync_since": null,
    "out_of_sync_interval": 86400,
    "csc": "nongnu/",
    "upstream": "http://download-mirror.savannah.gnu.org/releases/",
@ -249,7 +249,7 @@

    "file": "MIRROR-TIMESTAMP"
  },
  "qtproject": {
-    "out_of_sync_since": null,
+    "out_of_sync_since": 1634247878,
    "out_of_sync_interval": 86400,
    "csc": "qtproject/",
    "upstream": "https://download.qt.io/",
@ -303,5 +303,40 @@

    "csc": "",
    "upstream": "https://www.ctan.org/mirrors/mirmon",
    "file": ""
  },
+  "linuxmint_packages": {
+    "out_of_sync_since": null,
+    "out_of_sync_interval": 86400,
+    "csc": "linuxmint-packages/",
+    "upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
+    "file": "dists/"
+  },
+  "raspberrypi": {
+    "out_of_sync_since": 1634249138,
+    "out_of_sync_interval": 86400,
+    "csc": "raspberrypi/debian/",
+    "upstream": "https://archive.raspberrypi.org/debian/",
+    "file": "dists/"
+  },
+  "ubuntu_ports_releases": {
+    "out_of_sync_since": 1634257890,
+    "out_of_sync_interval": 86400,
+    "csc": "ubuntu-ports-releases/",
+    "upstream": "https://cdimage.ubuntu.com/releases/",
+    "file": ""
+  },
+  "xubuntu_releases": {
+    "out_of_sync_since": null,
+    "out_of_sync_interval": 86400,
+    "csc": "xubuntu-releases/",
+    "upstream": "https://cdimage.ubuntu.com/xubuntu/releases/",
+    "file": ""
+  },
+  "linuxmint": {
+    "out_of_sync_since": null,
+    "out_of_sync_interval": 86400,
+    "csc": "linuxmint/",
+    "upstream": "https://mirrors.edge.kernel.org/linuxmint/",
+    "file": ""
+  }
 }
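For reference, one way these data.json fields could be consumed (a sketch, not code from this commit; it assumes out_of_sync_since is the Unix time the mirror was first seen out of sync and out_of_sync_interval is the allowed lag in seconds):

# sketch only: list projects that have been out of sync longer than allowed;
# field semantics are assumed as described above
import json
import time

def is_stale(entry):
    since = entry["out_of_sync_since"]
    if since is None:  # currently in sync
        return False
    return time.time() - since > entry["out_of_sync_interval"]

with open("data.json", encoding="utf-8") as f:
    data = json.load(f)

print([name for name, entry in data.items()
       if isinstance(entry, dict) and "out_of_sync_since" in entry and is_stale(entry)])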
@ -0,0 +1,85 @@

from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
from project import Project
from shared import CSC_MIRROR

# this class brute force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution

# lists
urls = []


class linuxmint(Project):
    """linuxmint class"""
    @classmethod
    def scrape(cls, urls, site):
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # for a href directories
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)
                    # calling itself recursively
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # if you want to match more than one pattern, like r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', note that findall will return a tuple of two groups!!!
        # print(str_dates[0])
        if len(str_dates) == 0:
            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
        # for date in str_dates:
        #     print(date)
        dates = [list(datefinder.find_dates(date))[0] for date in str_dates]

        # for date in dates:
        #     print(date)
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        latest_date = cls.get_latest_date(urls[0])
        # get_latest_date(urls[0])
        for dir in urls:
            latest_date2 = cls.get_latest_date(dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        # print(latest_date)
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists
        urls1 = []
        urls2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # calling the scraper on both the csc mirror and upstream
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # print(len(urls1), len(urls2))

        return cls.max_date(urls1) == cls.max_date(urls2)
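A quick standalone illustration (not part of the commit) of the findall gotcha noted in get_latest_date above: with two alternated groups, each match comes back as a tuple with one empty element, which is why the ubuntu_ports_releases class below joins the tuple before handing it to datefinder.

# standalone illustration of the findall-with-alternation behaviour
import re

page = "index.html  05-Oct-2021 14:02  other.html  2021-10-05 14:02"
matches = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
print(matches)                        # [('05-Oct-2021 14:02', ''), ('', '2021-10-05 14:02')]
print(["".join(m) for m in matches])  # ['05-Oct-2021 14:02', '2021-10-05 14:02']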
@ -0,0 +1,55 @@

from bs4 import BeautifulSoup
import requests
from project import Project
from shared import CSC_MIRROR

# this class brute force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution


class linuxmint_packages(Project):
    """linuxmint_packages class"""
    @staticmethod
    def scrape(urls, site):
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        # salt stack specific code
        # s = s.find("div", {"id": "listing"})
        # print(s)

        for i in s.find_all("a"):  # for a href directories
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href + "Release"

                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists
        urls1 = []
        urls2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # calling the scraper on both the csc mirror and upstream
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        if len(urls1) != len(urls2):
            return False
        urls1.sort()
        urls2.sort()
        for index, f in enumerate(urls1):
            if requests.get(f).text != requests.get(urls2[index]).text:
                # comparing the file content because that's how the base class does it,
                # but we could speed it up by just comparing the dates (see the sketch after this class)
                return False
        return True
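On the "speed it up by just comparing the dates" note above, one possible shortcut (a sketch, not part of this commit) is to compare Last-Modified headers from a HEAD request instead of downloading every Release file; this assumes both servers actually send that header.

# sketch only: compare Last-Modified headers instead of full file contents;
# assumes both the csc mirror and upstream send this header
import requests

def same_mtime(csc_url, upstream_url):
    a = requests.head(csc_url, allow_redirects=True).headers.get("Last-Modified")
    b = requests.head(upstream_url, allow_redirects=True).headers.get("Last-Modified")
    return a is not None and a == b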
@ -0,0 +1,55 @@

from bs4 import BeautifulSoup
import requests
from project import Project
from shared import CSC_MIRROR

# this class brute force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution


class raspberrypi(Project):
    """raspberrypi class"""
    @staticmethod
    def scrape(urls, site):
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        # salt stack specific code
        # s = s.find("div", {"id": "listing"})
        # print(s)

        for i in s.find_all("a"):  # for a href directories
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href + "Release"

                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists
        urls1 = []
        urls2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # calling the scraper on both the csc mirror and upstream
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        if len(urls1) != len(urls2):
            return False
        urls1.sort()
        urls2.sort()
        for index, f in enumerate(urls1):
            if requests.get(f).text != requests.get(urls2[index]).text:
                # comparing the file content because that's how the base class does it,
                # but we could speed it up by just comparing the dates (see the sketch after the linuxmint_packages class above)
                return False
        return True
@ -0,0 +1,85 @@

from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
from project import Project
from shared import CSC_MIRROR

# this class brute force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution

# lists
urls = []


class ubuntu_ports_releases(Project):
    """ubuntu_ports_releases class"""
    @classmethod
    def scrape(cls, urls, site):
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # for a href directories
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and not href.startswith("http://"):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)
                    # calling itself recursively
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # since this matches more than one pattern, findall returns a tuple of two groups!!!
        # print(str_dates[0])
        if len(str_dates) == 0:
            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
        # for date in str_dates:
        #     print(date)
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]

        # for date in dates:
        #     print(date)
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        latest_date = cls.get_latest_date(urls[0])
        # get_latest_date(urls[0])
        for dir in urls:
            latest_date2 = cls.get_latest_date(dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        # print(latest_date)
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists
        urls1 = []
        urls2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # calling the scraper on both the csc mirror and upstream
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # print(len(urls1), len(urls2))

        return cls.max_date(urls1) == cls.max_date(urls2)
@ -0,0 +1,85 @@

from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
from project import Project
from shared import CSC_MIRROR

# this class brute force loops through the whole directory and checks dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution

# lists
urls = []


class xubuntu_releases(Project):
    """xubuntu_releases class"""
    @classmethod
    def scrape(cls, urls, site):
        # getting the request from url
        r = requests.get(site)

        # converting the text
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # for a href directories
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and not href.startswith("http://"):
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)
                    # calling itself recursively
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # since this matches more than one pattern, findall returns a tuple of two groups!!!
        # print(str_dates[0])
        if len(str_dates) == 0:
            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
        # for date in str_dates:
        #     print(date)
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]

        # for date in dates:
        #     print(date)
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        latest_date = cls.get_latest_date(urls[0])
        # get_latest_date(urls[0])
        for dir in urls:
            latest_date2 = cls.get_latest_date(dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        # print(latest_date)
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists
        urls1 = []
        urls2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # calling the scraper on both the csc mirror and upstream
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # print(len(urls1), len(urls2))

        return cls.max_date(urls1) == cls.max_date(urls2)
test.py (37 lines changed)
@ -6,6 +6,9 @@ from datetime import datetime

from datetime import timedelta
import time
import pandas as pd
+import re # for salt stack specifically
+from projects import linuxmint
+import json # import json to read project info stored in json file

# this function is brute force looping through the whole directory and checking dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
@ -13,7 +16,7 @@ import pandas as pd

# lists
urls = []

-home_site = "http://ftp.netbsd.org/pub"
+home_site = "https://cdimage.ubuntu.com"

# function created
def scrape(site):
@ -24,14 +27,18 @@ def scrape(site):

    # converting the text
    s = BeautifulSoup(r.text, "html.parser")

    # salt stack specific code
    # s = s.find("div", {"id": "listing"})
    # print(s)

    for i in s.find_all("a"):  # for a href directories
        href = i.attrs['href']

        if href.endswith("/") and href != "../" and href != "/":
-           """if home_site+href in urls: # avoids the link to parent directory
-               continue"""
-           if href == "//ftp.netbsd.org/": # netbsd specific code
+           if home_site + href in urls:  # avoids the link to parent directory
                continue
+           """if href == "//ftp.netbsd.org/": # netbsd specific code
+               continue"""
            site_next = site + href

            if site_next not in urls:
@ -41,24 +48,29 @@ def scrape(site):

                scrape(site_next)

def get_latest_date(web_dir):
-   page = requests.get(site).text
+   page = requests.get(web_dir).text

    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
-   dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
+   # print(str_dates[0])
+   dates = [list(datefinder.find_dates(date[1]))[0] for date in str_dates]

    # for date in dates:
    #     print(date)

    if len(dates) == 0:
        return datetime(1000, 1, 1)  # return ridiculously old date to discard this entry, since it has no dates
    return max(dates)

# main function
if __name__ == "__main__":
+   with open("data.json", "r", encoding="utf-8") as file:
+       data = json.load(file)
+   print(linuxmint.check(data, "linuxmint"))

    """# website to be scrape
-   site="http://ftp.netbsd.org/pub/NetBSD/"
-   # works on: https://www.x.org/releases/
    # https://mirror.csclub.uwaterloo.ca/linuxmint/ # works wonders for linuxmint
    # unfortunately, linuxmint does not have a public repo, the worldwide mirror LayerOnline on https://linuxmint.com/mirrors.php seems like the best choice
+   site="https://cdimage.ubuntu.com/releases/"
+   # works on:
+   # https://www.x.org/releases/

    # calling function
    scrape(site)
@ -71,9 +83,4 @@ if __name__ =="__main__":

            latest_date = latest_date2

    print(latest_date)"""

-   csc_url = "https://mirror.csclub.uwaterloo.ca/ubuntu-ports/project/trace/anonster.canonical.com"
-   upstream_url = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
-   print(requests.get(upstream_url).text)
-   print(requests.get(csc_url).text == requests.get(upstream_url).text)
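The anonster.canonical.com trace file used by the ubuntu-ports check (see the README note above) appears to hold a timestamp; a sketch of turning it into a datetime, assuming the file body contains a single recognizable date that datefinder (already used in this repo) can locate:

# sketch only: pull a datetime out of the ubuntu-ports trace file
import datefinder
import requests

trace = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
dates = list(datefinder.find_dates(requests.get(trace).text))
print(dates[0] if dates else "no date found -- inspect the file by hand")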