from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# this check is brute force: loop through the whole directory listing and compare dates
# it may sound horrible, but for certain distros, I believe it's indeed the best solution


class xiph(Project):
    """xiph class"""

    @staticmethod
    def scrape(releases, site):
        # fetch the page at the given url
        r = requests.get(site)

        # parse the returned HTML
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # for each <a href> directory link
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/" and href != "/pub/xiph/" and not href.startswith("http://"):
                if href not in releases:
                    releases.append(href)
                    # print(href)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # note: when the pattern alternates two groups, as above, findall returns a (group1, group2) tuple per match
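        # e.g. for "01-Jan-2020 10:00" the match is ("01-Jan-2020 10:00", ""), so each tuple
        # is joined back into a single string below before it is handed to datefinder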
        # print(str_dates[0])
        if len(str_dates) == 0:
            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates
        # for date in str_dates:
        #     print(date)
        # print("")
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]

        # for date in dates:
        #     print(date)
        return max(dates)

    @staticmethod
    def get_checksum_date(directory_URL):
        page = requests.get(directory_URL).text
        file_index = page.find("SUMS.txt")
        # print(page)

        # remove stray numbers (file size numbers in particular) that might interfere with date finding
        segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:])  # removes bare size numbers
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', segment_clean)  # removes numbers with a size unit, e.g. 50kb
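        # e.g. a size column such as " 524288 " or " 50kb " next to SUMS.txt gets stripped
        # here so that it cannot confuse the date parsing below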
        # print(segment_clean)

        # find the dates in the segment after the file name
        # note: datefinder returns a generator, so it is typecast to a list here. Please read the note of caution provided at the bottom.
        matches = list(datefinder.find_dates(segment_clean))
        # print(matches[0])

        return matches[0]
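    # if the upstream listing ships a SUMS.txt checksum file, compare its timestamp on both
    # mirrors; otherwise fall back to comparing the newest date visible in each listing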
    @classmethod
    def compare_release(cls, csc_dir, upstream_dir):
        page = requests.get(upstream_dir).text
        file_index = page.find("SUMS.txt")
        if file_index == -1:
            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)
        else:
            return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)

    @classmethod
    def check_mirror(cls, csc_url, upstream_url, releases):
        compare = []
        for release in releases:
            compare.append(cls.compare_release(csc_url + release, upstream_url + release))
        return all(compare)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists of release subdirectories found on each mirror
        releases1 = []
        releases2 = []
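        # data[project] is expected to provide a "csc" path (relative to CSC_MIRROR), an
        # "upstream" base URL, and a "file" subpath that is appended to both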
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # collect the release directories visible on each mirror
        cls.scrape(releases1, csc_url)
        cls.scrape(releases2, upstream_url)

        if set(releases1) != set(releases2):
            return False

        return cls.check_mirror(csc_url, upstream_url, releases2)
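

# a minimal, hypothetical usage sketch (not part of the original module): the entry below only
# illustrates the config shape that check() reads; the real paths and URLs live in the project's
# own data file, so treat every value here as a placeholder
if __name__ == "__main__":
    sample_data = {
        "xiph": {
            "csc": "xiph",                             # assumed path under CSC_MIRROR
            "upstream": "https://downloads.xiph.org",  # assumed upstream base URL
            "file": "/releases/",                      # assumed subpath shared by both mirrors
        }
    }
    print(xiph.check(sample_data, "xiph"))  # True when both mirrors list the same, equally fresh releases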