xiph added

Tom 2021-10-16 00:40:14 -07:00
parent 26e57b4d4d
commit c974d49ffc
4 changed files with 119 additions and 11 deletions

View File

@@ -18,13 +18,13 @@ even if the date relies on a specific file in their repo, we can still find the
 to find repos of the mirrored projects to check, just search "projectName mirrors"
 ## checker information
 not done:
-macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles
 NetBSD: http://ftp.netbsd.org/pub/NetBSD/ has public repo, no timestamp, web directory hard to loop through, no mirror tracker
-opensuse: http://download.opensuse.org/ has public repo, a possible timestamp called latest in history, our mirror doesn't have this file tho, no mirror tracker
+opensuse: http://download.opensuse.org/ check Update.repo files in folders inside the update folder
 puppylinux: https://distro.ibiblio.org/puppylinux/ check the ISO files in the folders starting with puppy
-x.org: https://www.x.org/releases/ no timestamp, but candidate for brute force looping since it has few folders, no status tracker
+x.org: https://www.x.org/releases/ check all of the files under each directory under /x.org/individual/, and make sure that we have all of the files which the upstream has
-Xiph: no timestamp, too big to loop through, no status tracker
 done:
 almalinux
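A minimal sketch of the x.org idea above (list every file on both sides and confirm the mirror has everything the upstream has), assuming requests and BeautifulSoup as the checkers in this repo already use; list_files and mirror_has_upstream_files are hypothetical names, not code from this commit:

from bs4 import BeautifulSoup
import requests

def list_files(directory_url):
    """Return the set of non-directory hrefs linked from an index page."""
    soup = BeautifulSoup(requests.get(directory_url).text, "html.parser")
    return {a.attrs["href"] for a in soup.find_all("a")
            if not a.attrs["href"].endswith("/")}

def mirror_has_upstream_files(csc_dir, upstream_dir):
    # in sync when every upstream file is present on our side
    return list_files(upstream_dir).issubset(list_files(csc_dir))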
@@ -61,6 +61,7 @@ kde-applicationdata
 kernel
 linuxmint: https://mirrors.edge.kernel.org/linuxmint/ candidate for brute force looping
 linuxmint-packages: https://mirrors.edge.kernel.org/linuxmint-packages/ Checking the timestamp of either the Release file or the Packages file should suffice.
+macPorts: only distfiles has public repo, no timestamp, too large to loop through, comparing ports.tar.gz in distfiles
 manjaro
 mxlinux
 mxlinux-iso: this one seems out of sync on the official tracker for 134 days, which is weird
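The macPorts entry above settles for comparing ports.tar.gz in distfiles; a hedged sketch of one way to do that without downloading the archive, using the Last-Modified header from a HEAD request (ports_tarball_in_sync is a hypothetical name, and the real checker may compare differently):

import requests

def ports_tarball_in_sync(csc_url, upstream_url):
    # HEAD avoids transferring the (large) tarball itself
    ours = requests.head(csc_url + "ports.tar.gz", allow_redirects=True)
    theirs = requests.head(upstream_url + "ports.tar.gz", allow_redirects=True)
    return ours.headers.get("Last-Modified") == theirs.headers.get("Last-Modified")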
@@ -84,4 +85,5 @@ ubuntu-ports: http://ports.ubuntu.com/ubuntu-ports/ checks the file anonster.can
 ubuntu-ports-releases: https://cdimage.ubuntu.com/releases/ has public repo, no timestamp, no status tracker, brute force looped it
 ubuntu-releases: https://releases.ubuntu.com/
 vlc: http://download.videolan.org/pub/videolan/
+Xiph: https://ftp.osuosl.org/pub/xiph/releases/ loop through each directory in xiph/releases/ and compare the timestamps of the checksum files
 xubuntu-releases: https://cdimage.ubuntu.com/xubuntu/releases/ candidate for brute force looping since it has few folders

View File

@@ -21,7 +21,7 @@
         "file": "zzz/time.txt"
     },
     "Arch": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "archlinux/",
         "upstream": "http://arch.mirror.constant.com/",
@@ -52,7 +52,7 @@
         "file": "x86/sha512.sum"
     },
     "Debian": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "",
         "upstream": "https://ftp-master.debian.org/",
@@ -66,7 +66,7 @@
         "file": "debian-cd/project/trace/cdimage.debian.org"
     },
     "DebianMultimedia": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "debian-multimedia/",
         "upstream": "http://debian-mirrors.sdinet.de/deb-multimedia/",
@@ -140,7 +140,7 @@
         "file": "gnu/mirror-updated-timestamp.txt"
     },
     "Gutenberg": {
-        "out_of_sync_since": 1633294718,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 172800,
         "csc": "gutenberg/",
         "upstream": "https://gutenberg.pglaf.org/",
@@ -305,7 +305,7 @@
         "file": ""
     },
     "linuxmint_packages": {
-        "out_of_sync_since": 1634334754,
+        "out_of_sync_since": null,
         "out_of_sync_interval": 86400,
         "csc": "linuxmint-packages/",
         "upstream": "https://mirrors.edge.kernel.org/linuxmint-packages/",
@@ -366,5 +366,12 @@
         "csc": "MacPorts/mpdistfiles/",
         "upstream": "https://distfiles.macports.org/",
         "file": "ports.tar.gz"
-    }
+    },
+    "xiph": {
+        "out_of_sync_since": null,
+        "out_of_sync_interval": 86400,
+        "csc": "xiph/releases/",
+        "upstream": "https://ftp.osuosl.org/pub/xiph/releases/",
+        "file": ""
+    }
 }
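For context, a hedged sketch of how these fields plausibly interact (update_status is a hypothetical helper, not code from this repo): out_of_sync_since appears to hold the epoch time of the first failed check and is reset to null while in sync, with out_of_sync_interval as the grace period in seconds before a project is reported:

import time

def update_status(entry, in_sync):
    # hypothetical reading of the data.json fields
    if in_sync:
        entry["out_of_sync_since"] = None  # serialized as null
        return "in sync"
    if entry["out_of_sync_since"] is None:
        entry["out_of_sync_since"] = int(time.time())  # first failure observed
    elapsed = int(time.time()) - entry["out_of_sync_since"]
    return "out of sync" if elapsed > entry["out_of_sync_interval"] else "lagging"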

projects/xiph.py Normal file
View File

@@ -0,0 +1,99 @@
from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from project import Project
from shared import CSC_MIRROR

# this checker brute force loops through the whole directory and compares dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
class xiph(Project):
    """xiph class"""

    @staticmethod
    def scrape(releases, site):
        # fetch the index page and collect the release subdirectories it links to
        r = requests.get(site)
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # <a href> entries ending in "/" are directories
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/" and href != "/pub/xiph/" and not href.startswith("http://"):
                if href not in releases:
                    releases.append(href)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        # match both listing date formats; with two alternated groups, findall
        # returns a tuple per match, hence the "".join below
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        if len(str_dates) == 0:
            return datetime(1000, 1, 1)  # ridiculously old date to discard this entry, since it has no dates
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
        return max(dates)

    @staticmethod
    def get_checksum_date(directory_URL):
        page = requests.get(directory_URL).text
        file_index = page.find("SUMS.txt")
        # remove stray numbers (file sizes in particular) that might interfere with date finding
        segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:])  # plain size numbers
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', segment_clean)  # sizes with a unit, e.g. 50kb
        # find the dates in the segment after the checksum file name;
        # datefinder returns a generator, so it is typecast to a list here
        matches = list(datefinder.find_dates(segment_clean))
        return matches[0]

    @classmethod
    def compare_release(cls, csc_dir, upstream_dir):
        page = requests.get(upstream_dir).text
        file_index = page.find("SUMS.txt")
        if file_index == -1:
            # no checksum file upstream: fall back to the newest date in the listing
            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)
        else:
            return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)

    @classmethod
    def check_mirror(cls, csc_url, upstream_url, releases):
        compare = []
        for release in releases:
            compare.append(cls.compare_release(csc_url + release, upstream_url + release))
        return all(compare)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        releases1 = []
        releases2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # collect the release directories on our mirror and upstream
        cls.scrape(releases1, csc_url)
        cls.scrape(releases2, upstream_url)
        if set(releases1) != set(releases2):
            return False
        return cls.check_mirror(csc_url, upstream_url, releases2)
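A quick demonstration of the findall caveat noted in get_latest_date, on made-up input:

import re

sample = "16-Oct-2021 00:40 ... 2021-10-16 00:40"
pattern = r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})'
print(re.findall(pattern, sample))
# [('16-Oct-2021 00:40', ''), ('', '2021-10-16 00:40')]
# the group that did not participate in a match comes back as an empty string,
# which is why get_latest_date does "".join(date) before calling datefinder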

View File

@@ -7,7 +7,7 @@ from datetime import timedelta
 import time
 import pandas as pd
 import re # for salt stack specifically
-from projects import macports
+from projects import xiph
 import json # import json to read project info stored in json file
 # this function is brute force looping through the whole directory and checking dates
@@ -65,7 +65,7 @@ def get_latest_date(web_dir):
 if __name__ =="__main__":
     with open("data.json", "r", encoding="utf-8") as file:
         data = json.load(file)
-    print(macports.check(data, "macports"))
+    print(xiph.check(data, "xiph"))
     """# website to be scrape
     site="https://cdimage.ubuntu.com/releases/"