from bs4 import BeautifulSoup
import requests
import datefinder  # another date-finding library
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR


# This class brute-force loops through the whole directory listing and checks dates.
# It may sound horrible, but for certain distros I believe it is indeed the best solution.
class xiph(Project):
    """xiph class"""

    @staticmethod
    def scrape(releases, site):
        # fetch the directory listing from the given URL
        r = requests.get(site)

        # parse the returned HTML
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # for each <a href> directory link
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/" and href != "/pub/xiph/" and not href.startswith("http://"):
                if href not in releases:
                    releases.append(href)
                    # print(href)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # Note: because the pattern alternates between two capture groups, findall returns a
        # tuple of two groups per match; the group that did not match is an empty string.
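        # Illustrative example of that behaviour (the listing text below is made up,
        # not taken from a real mirror page):
        #   re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})',
        #              "audio/  01-Jan-2020 12:00  -")
        #   -> [('01-Jan-2020 12:00', '')]
        # Joining each tuple with "".join() (as done below) recovers the matched date string,
        # since exactly one of the two groups is non-empty.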
        # print(str_dates[0])
        if len(str_dates) == 0:
            return datetime(1000, 1, 1)  # return a ridiculously old date to discard this entry, since it has no dates

        # for date in str_dates:
        #     print(date)
        # print("")
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]

        # for date in dates:
        #     print(date)
        return max(dates)

    @staticmethod
    def get_checksum_date(directory_URL):
        page = requests.get(directory_URL).text
        file_index = page.find("SUMS.txt")
        # print(page)

        # remove stray numbers (file-size numbers in particular) that might interfere with date finding
        segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:])  # removes bare size numbers
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', segment_clean)   # removes numbers with a size unit, e.g. 50kb
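        # Illustrative example of the second substitution above (the listing text is made up,
        # not taken from a real mirror page):
        #   re.sub(r'\s\d+\w*\s', ' ', "SUMS.txt 03-Sep-2017 21:48 50kb vorbis/")
        #   -> "SUMS.txt 03-Sep-2017 21:48 vorbis/"
        # so the text around the file name keeps its dates but loses stray size numbers.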
        # print(segment_clean)

        # find the dates in the segment after the file name
        # note: datefinder.find_dates returns a generator, so it is typecast to a list here
        matches = list(datefinder.find_dates(segment_clean))
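        # Illustrative example of the datefinder call above (the input string is made up and
        # the exact result may vary with the datefinder version):
        #   list(datefinder.find_dates("SUMS.txt 03-Sep-2017 21:48"))
        #   -> roughly [datetime.datetime(2017, 9, 3, 21, 48)]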
        # print(matches[0])

        return matches[0]

    @classmethod
    def compare_release(cls, csc_dir, upstream_dir):
        page = requests.get(upstream_dir).text
        file_index = page.find("SUMS.txt")
        if file_index == -1:
            # no SUMS.txt upstream: fall back to comparing the newest dates in the listings
            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)
        else:
            # otherwise compare the dates attached to the SUMS.txt checksum files
            return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)

    @classmethod
    def check_mirror(cls, csc_url, upstream_url, releases):
        compare = []
        for release in releases:
            compare.append(cls.compare_release(csc_url + release, upstream_url + release))
        return all(compare)

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # release directories found on each mirror
        releases1 = []
        releases2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # collect the release directories on both mirrors
        cls.scrape(releases1, csc_url)
        cls.scrape(releases2, upstream_url)

        if set(releases1) != set(releases2):
            return False

        return cls.check_mirror(csc_url, upstream_url, releases2)
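

# A minimal usage sketch, assuming a config structure like the one read in check() above
# (data[project]["csc"], data[project]["upstream"], data[project]["file"]); the values
# below are hypothetical and not taken from the real mirror-checker configuration.
if __name__ == "__main__":
    sample_data = {
        "xiph": {
            "csc": "xiph/",                             # hypothetical path on the CSC mirror
            "upstream": "http://downloads.xiph.org/",   # hypothetical upstream base URL
            "file": "releases/",                        # hypothetical subdirectory to compare
        }
    }
    print(xiph.check(sample_data, "xiph"))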