from datetime import datetime
import re

import requests
from bs4 import BeautifulSoup
import datefinder  # a date-finding library

from project import Project
from shared import CSC_MIRROR


# This checker brute-forces its way through the whole directory tree and
# compares dates. That may sound horrible, but for certain distros it is
# the most reliable approach available.
class xiph(Project):
    """xiph class"""

    @staticmethod
    def scrape(releases, site):
        # fetch the page and parse it
        r = requests.get(site)
        s = BeautifulSoup(r.text, "html.parser")
        # collect hrefs that point at subdirectories, skipping the parent
        # and root links as well as absolute URLs
        for i in s.find_all("a"):
            href = i.attrs['href']
            if (href.endswith("/") and href != "../" and href != "/"
                    and href != "/pub/xiph/" and not href.startswith("http://")):
                if href not in releases:
                    releases.append(href)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        # Two date formats appear in directory listings: "DD-Mon-YYYY HH:MM"
        # and "YYYY-MM-DD HH:MM". Note that with alternation groups like this,
        # re.findall returns a tuple per match (one entry per group, the
        # unmatched one empty), hence the "".join() below.
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry, since it has no dates
            return datetime(1000, 1, 1)
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
        return max(dates)

    @staticmethod
    def get_checksum_date(directory_URL):
        page = requests.get(directory_URL).text
        file_index = page.find("SUMS.txt")
        # remove stray numbers (file sizes in particular) that might interfere
        # with date finding: first bare numbers, then numbers with a size
        # unit, e.g. 50kb
        segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:])
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', segment_clean)
        # find the dates in the segment after the file name; note that
        # datefinder returns a generator, so typecast it to a list
        matches = list(datefinder.find_dates(segment_clean))
        return matches[0]

    @classmethod
    def compare_release(cls, csc_dir, upstream_dir):
        # prefer the SUMS.txt timestamp when upstream provides one;
        # otherwise fall back to the newest date in the listing
        page = requests.get(upstream_dir).text
        file_index = page.find("SUMS.txt")
        if file_index == -1:
            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)
        else:
            return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)

    @classmethod
    def check_mirror(cls, csc_url, upstream_url, releases):
        compare = []
        for release in releases:
            compare.append(cls.compare_release(csc_url + release, upstream_url + release))
        return all(compare)

    @classmethod
    def check(cls, data, project, current_time):
        """Check if project packages are up-to-date"""
        releases1 = []
        releases2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # scrape the release subdirectories on both mirrors
        cls.scrape(releases1, csc_url)
        cls.scrape(releases2, upstream_url)
        if set(releases1) != set(releases2):
            return False
        return cls.check_mirror(csc_url, upstream_url, releases2)
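

# --- Usage sketch ---
# A minimal, illustrative example of driving this checker by hand. The shape
# of `data` is an assumption inferred from how check() indexes it; the real
# mirror-checker config and the paths below ("xiph/", "releases/", the
# upstream URL) are hypothetical and may differ.
if __name__ == "__main__":
    data = {
        "xiph": {
            "csc": "xiph/",                             # hypothetical CSC path
            "upstream": "https://downloads.xiph.org/",  # hypothetical upstream root
            "file": "releases/",                        # hypothetical subdirectory
        }
    }
    # current_time is accepted for interface consistency with other
    # Project checkers; this one does not use it
    print(xiph.check(data, "xiph", datetime.now()))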