forked from public/mirror-checker
xiph added
This commit is contained in:
parent
26e57b4d4d
commit
c974d49ffc
10
README.md
10
README.md
|
@ -18,13 +18,13 @@ even if the date relies on a specific file in their repo, we can still find the
|
||||||
|
|
||||||
to find repos of the mirrored projects to check, just search "projectName mirrors"
|
to find repos of the mirrored projects to check, just search "projectName mirrors"
|
||||||
|
|
||||||
|
## checker information
|
||||||
|
|
||||||
not done:
|
not done:
|
||||||
macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles
|
|
||||||
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
|
NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
|
||||||
opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
|
opensuse: http://download.opensuse.org/ check Update.repo files in folders inside the update folder
|
||||||
puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
|
puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
|
||||||
x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
|
x.org: https://www.x.org/releases/ check all of the files under each directory under /x.org/individual/, and make sure that we have all of the files which the upstream has
|
||||||
Xiph: no timestamp, too big to loop through, no status tracker
|
|
||||||
|
|
||||||
done:
|
done:
|
||||||
almalinux
|
almalinux
|
||||||
|
@ -61,6 +61,7 @@ kde-applicationdata
|
||||||
kernel
|
kernel
|
||||||
linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
|
linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
|
||||||
linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
|
linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
|
||||||
|
macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles
|
||||||
manjaro
|
manjaro
|
||||||
mxlinux
|
mxlinux
|
||||||
mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
|
mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
|
||||||
|
@ -84,4 +85,5 @@ ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.can
|
||||||
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
|
ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
|
||||||
ubuntu-releases: https://releases.ubuntu.com/
|
ubuntu-releases: https://releases.ubuntu.com/
|
||||||
vlc: http://download.videolan.org/pub/videolan/
|
vlc: http://download.videolan.org/pub/videolan/
|
||||||
|
Xiph: https://ftp.osuosl.org/pub/xiph/releases/ loop through each directory in xiph/releases/ and compare the timestamps of the checksum files
|
||||||
xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
|
xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders
|
17
data.json
17
data.json
|
@ -21,7 +21,7 @@
|
||||||
"file": "zzz/time.txt"
|
"file": "zzz/time.txt"
|
||||||
},
|
},
|
||||||
"Arch": {
|
"Arch": {
|
||||||
"out_of_sync_since": 1634334754,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "archlinux/",
|
"csc": "archlinux/",
|
||||||
"upstream": "http://arch.mirror.constant.com/",
|
"upstream": "http://arch.mirror.constant.com/",
|
||||||
|
@ -52,7 +52,7 @@
|
||||||
"file": "x86/sha512.sum"
|
"file": "x86/sha512.sum"
|
||||||
},
|
},
|
||||||
"Debian": {
|
"Debian": {
|
||||||
"out_of_sync_since": 1634334754,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "",
|
"csc": "",
|
||||||
"upstream": "https://ftp-master.debian.org/",
|
"upstream": "https://ftp-master.debian.org/",
|
||||||
|
@ -66,7 +66,7 @@
|
||||||
"file": "debian-cd/project/trace/cdimage.debian.org"
|
"file": "debian-cd/project/trace/cdimage.debian.org"
|
||||||
},
|
},
|
||||||
"DebianMultimedia": {
|
"DebianMultimedia": {
|
||||||
"out_of_sync_since": 1634334754,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "debian-multimedia/",
|
"csc": "debian-multimedia/",
|
||||||
"upstream": "http://debian-mirrors.sdinet.de/deb-multimedia/",
|
"upstream": "http://debian-mirrors.sdinet.de/deb-multimedia/",
|
||||||
|
@ -140,7 +140,7 @@
|
||||||
"file": "gnu/mirror-updated-timestamp.txt"
|
"file": "gnu/mirror-updated-timestamp.txt"
|
||||||
},
|
},
|
||||||
"Gutenberg": {
|
"Gutenberg": {
|
||||||
"out_of_sync_since": 1633294718,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 172800,
|
"out_of_sync_interval": 172800,
|
||||||
"csc": "gutenberg/",
|
"csc": "gutenberg/",
|
||||||
"upstream": "https://gutenberg.pglaf.org/",
|
"upstream": "https://gutenberg.pglaf.org/",
|
||||||
|
@ -305,7 +305,7 @@
|
||||||
"file": ""
|
"file": ""
|
||||||
},
|
},
|
||||||
"linuxmint_packages": {
|
"linuxmint_packages": {
|
||||||
"out_of_sync_since": 1634334754,
|
"out_of_sync_since": null,
|
||||||
"out_of_sync_interval": 86400,
|
"out_of_sync_interval": 86400,
|
||||||
"csc": "linuxmint-packages/",
|
"csc": "linuxmint-packages/",
|
||||||
"upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
|
"upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
|
||||||
|
@ -366,5 +366,12 @@
|
||||||
"csc": "MacPorts/mpdistfiles/",
|
"csc": "MacPorts/mpdistfiles/",
|
||||||
"upstream": "https://distfiles.macports.org/",
|
"upstream": "https://distfiles.macports.org/",
|
||||||
"file": "ports.tar.gz"
|
"file": "ports.tar.gz"
|
||||||
|
},
|
||||||
|
"xiph": {
|
||||||
|
"out_of_sync_since": null,
|
||||||
|
"out_of_sync_interval": 86400,
|
||||||
|
"csc": "xiph/releases/",
|
||||||
|
"upstream": "https://ftp.osuosl.org/pub/xiph/releases/",
|
||||||
|
"file": ""
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,99 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import datefinder # another date finding library
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from project import Project
|
||||||
|
from shared import CSC_MIRROR
|
||||||
|
|
||||||
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
# it may sound horrible, but for certain distros, I believe it is indeed the best solution
|
||||||
|
|
||||||
|
class xiph(Project):
    """Checker for the xiph releases mirror.

    There is no single timestamp file to compare, so the checker scrapes the
    directory index of both the CSC mirror and the upstream, requires both to
    list the same sub-directories, and then compares every sub-directory:
    by the date shown next to its ``SUMS.txt`` checksum file when upstream
    lists one, otherwise by the newest date found in the directory listing.
    """

    # Seconds to wait on a single HTTP request; without a timeout `requests`
    # blocks forever on a stalled server and the whole checker hangs.
    _REQUEST_TIMEOUT = 30

    # Timestamps as they appear in directory listings, e.g.
    # "15-Oct-2021 12:34" or "2021-10-15 12:34". No capture groups are used,
    # so `findall` returns plain strings rather than tuples of groups.
    _LISTING_DATE_RE = re.compile(
        r'\d{2}-\w{3}-\d{4} \d{2}:\d{2}|\d{4}-\d{2}-\d{2} \d{2}:\d{2}'
    )

    @staticmethod
    def _fetch(url):
        """Return the response body of ``url`` as text."""
        return requests.get(url, timeout=xiph._REQUEST_TIMEOUT).text

    @staticmethod
    def _latest_date_from_page(page):
        """Return the newest listing date found in ``page``.

        Returns ``datetime(1000, 1, 1)`` when the page has no date that can be
        parsed, so that such a page compares as "ridiculously old".
        """
        dates = []
        for str_date in xiph._LISTING_DATE_RE.findall(page):
            parsed = list(datefinder.find_dates(str_date))
            # A string can match the regex without being a real date
            # (e.g. a bogus month abbreviation); skip it instead of crashing.
            if parsed:
                dates.append(parsed[0])
        if not dates:
            return datetime(1000, 1, 1)
        return max(dates)

    @staticmethod
    def _checksum_date_from_page(page):
        """Return the first date following ``SUMS.txt`` in ``page``.

        Returns ``None`` when the page does not mention ``SUMS.txt`` or when
        no date can be found after it.
        """
        file_index = page.find("SUMS.txt")
        if file_index == -1:
            return None

        # Remove stray numbers, optionally followed by a size unit (e.g. 50kb),
        # that might interfere with date finding.
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:])

        # Only the first date is needed, so stop at the first result instead
        # of materialising every date in the remainder of the page.
        return next(iter(datefinder.find_dates(segment_clean)), None)

    @staticmethod
    def scrape(releases, site):
        """Collect the sub-directory links of the index page ``site``.

        Every ``href`` that ends with ``/`` is appended to ``releases`` (in
        place, without duplicates), except for parent/root links and absolute
        URLs. Nothing is returned.
        """
        s = BeautifulSoup(xiph._fetch(site), "html.parser")

        for anchor in s.find_all("a"):
            href = anchor.get("href")
            if href is None:
                # <a> tags without an href (e.g. named anchors) are not links.
                continue
            if not href.endswith("/"):
                continue
            if href in ("../", "/", "/pub/xiph/"):
                continue
            # Absolute URLs point outside the listing and cannot be appended
            # to the base URL as a relative directory name.
            if href.startswith(("http://", "https://")):
                continue
            if href not in releases:
                releases.append(href)

    @staticmethod
    def get_latest_date(web_dir):
        """Fetch ``web_dir`` and return the newest date in its listing.

        Returns ``datetime(1000, 1, 1)`` when the listing has no usable date.
        """
        return xiph._latest_date_from_page(xiph._fetch(web_dir))

    @staticmethod
    def get_checksum_date(directory_URL):
        """Fetch ``directory_URL`` and return the date listed after ``SUMS.txt``.

        Returns ``None`` when the page has no ``SUMS.txt`` entry or no date
        can be found after it.
        """
        return xiph._checksum_date_from_page(xiph._fetch(directory_URL))

    @classmethod
    def compare_release(cls, csc_dir, upstream_dir):
        """Return True when ``csc_dir`` and ``upstream_dir`` show the same date.

        The date next to ``SUMS.txt`` is compared when upstream provides one;
        otherwise the newest date in each directory listing is compared.
        A mirror directory that lacks the checksum date upstream has is
        reported as not matching.
        """
        # Each page is downloaded once and reused for whichever comparison
        # applies.
        upstream_page = cls._fetch(upstream_dir)
        csc_page = cls._fetch(csc_dir)

        upstream_date = cls._checksum_date_from_page(upstream_page)
        if upstream_date is None:
            return (cls._latest_date_from_page(csc_page)
                    == cls._latest_date_from_page(upstream_page))
        # If the mirror has no checksum date this is `None == datetime`,
        # i.e. False.
        return cls._checksum_date_from_page(csc_page) == upstream_date

    @classmethod
    def check_mirror(cls, csc_url, upstream_url, releases):
        """Return True when every directory in ``releases`` matches upstream.

        ``releases`` holds directory names relative to both base URLs.
        Stops at the first mismatch to avoid needless requests.
        """
        return all(
            cls.compare_release(csc_url + release, upstream_url + release)
            for release in releases
        )

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        csc_releases = []
        upstream_releases = []
        cls.scrape(csc_releases, csc_url)
        cls.scrape(upstream_releases, upstream_url)

        # A missing or extra directory on either side already means the
        # mirror differs from upstream.
        if set(csc_releases) != set(upstream_releases):
            return False

        return cls.check_mirror(csc_url, upstream_url, upstream_releases)
|
4
test.py
4
test.py
|
@ -7,7 +7,7 @@ from datetime import timedelta
|
||||||
import time
|
import time
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re # for salt stack specifically
|
import re # for salt stack specifically
|
||||||
from projects import macports
|
from projects import xiph
|
||||||
import json # import json to read project info stored in json file
|
import json # import json to read project info stored in json file
|
||||||
|
|
||||||
# this function is brute force looping through the whole directory and checking dates
|
# this function is brute force looping through the whole directory and checking dates
|
||||||
|
@ -65,7 +65,7 @@ def get_latest_date(web_dir):
|
||||||
if __name__ =="__main__":
|
if __name__ =="__main__":
|
||||||
with open("data.json", "r", encoding="utf-8") as file:
|
with open("data.json", "r", encoding="utf-8") as file:
|
||||||
data = json.load(file)
|
data = json.load(file)
|
||||||
print(macports.check(data, "macports"))
|
print(xiph.check(data, "xiph"))
|
||||||
|
|
||||||
"""# website to be scrape
|
"""# website to be scrape
|
||||||
site="https://cdimage.ubuntu.com/releases/"
|
site="https://cdimage.ubuntu.com/releases/"
|
||||||
|
|
Loading…
Reference in New Issue