mirror-checker/projects/xiph.py

from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from project import Project
from shared import CSC_MIRROR

# This checker brute-forces the comparison: it loops through every release directory and checks the dates.
# It may sound horrible, but for certain distros I believe it is indeed the best solution.

class xiph(Project):
    """Mirror checker for the xiph project.

    The release directories linked from the upstream index page are compared
    with the ones linked from the CSC mirror. Each release directory is then
    compared by date: the date listed next to the ``SUMS.txt`` entry when
    upstream lists one, otherwise the newest date found in the listing.
    """

    # Substring identifying the checksum file entry in a directory listing.
    _CHECKSUM_MARKER = "SUMS.txt"

    @staticmethod
    def scrape(releases: list, site: str) -> None:
        """Collect the sub-directory links found on ``site`` into ``releases``.

        Args:
            releases: List that is extended in place with every new relative
                directory href (a link ending in ``/``). Existing entries are
                kept and never duplicated.
            site: URL of the index page to download and parse.
        """
        response = requests.get(site)
        soup = BeautifulSoup(response.text, "html.parser")

        # href=True restricts the search to anchors that actually carry an
        # href attribute; a bare <a name="..."> would otherwise raise KeyError.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]

            # Only directories are releases.
            if not href.endswith("/"):
                continue
            # Parent/root navigation links of the listing itself.
            if href in ("../", "/", "/pub/xiph/"):
                continue
            # Absolute links point outside the listing and cannot be appended
            # to the base URL later on.
            if href.startswith(("http://", "https://")):
                continue

            if href not in releases:
                releases.append(href)

    @staticmethod
    def get_latest_date(web_dir: str) -> datetime:
        """Return the most recent listing date found on the page at ``web_dir``.

        Two timestamp layouts are recognised: ``16-Oct-2021 03:40`` and
        ``2021-10-16 03:40``.

        Args:
            web_dir: URL of the directory listing to download.

        Returns:
            The newest parsed date, or ``datetime(1000, 1, 1)`` when the page
            contains no usable date, so that the entry compares as very old.
        """
        page = requests.get(web_dir).text

        # No capturing groups here, so findall yields plain strings instead of
        # one tuple per match.
        stamps = re.findall(
            r'\d{2}-\w{3}-\d{4} \d{2}:\d{2}|\d{4}-\d{2}-\d{2} \d{2}:\d{2}',
            page,
        )

        dates = []
        for stamp in stamps:
            # find_dates returns a generator; only its first hit is needed.
            # A stamp that matches the regex but cannot be parsed is skipped.
            parsed = next(iter(datefinder.find_dates(stamp)), None)
            if parsed is not None:
                dates.append(parsed)

        if not dates:
            # Ridiculously old date to discard this entry, since it has no dates.
            return datetime(1000, 1, 1)
        return max(dates)

    @staticmethod
    def get_checksum_date(directory_URL: str) -> datetime:
        """Return the first date listed after the ``SUMS.txt`` entry.

        Args:
            directory_URL: URL of the directory listing to download.

        Returns:
            The first date found in the page text following the checksum
            file name.

        Raises:
            IndexError: If the page has no ``SUMS.txt`` entry or no date can
                be found after it.
        """
        page = requests.get(directory_URL).text
        file_index = page.find(xiph._CHECKSUM_MARKER)
        if file_index == -1:
            # Without this guard the slice below would silently degrade to
            # the last character of the page.
            raise IndexError(
                "no {} entry listed at {}".format(xiph._CHECKSUM_MARKER, directory_URL)
            )

        # Remove stray numbers (file sizes in particular, with or without a
        # unit suffix such as "50kb") that might interfere with date finding.
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:])

        # find_dates returns a generator; the first date after the file name
        # is the one belonging to the checksum entry.
        first_date = next(iter(datefinder.find_dates(segment_clean)), None)
        if first_date is None:
            raise IndexError(
                "no date found after {} at {}".format(xiph._CHECKSUM_MARKER, directory_URL)
            )
        return first_date

    @classmethod
    def compare_release(cls, csc_dir: str, upstream_dir: str) -> bool:
        """Tell whether one release directory matches between mirror and upstream.

        Args:
            csc_dir: URL of the release directory on the CSC mirror.
            upstream_dir: URL of the same release directory upstream.

        Returns:
            ``True`` when the compared dates are equal. The checksum-file date
            is used when upstream lists ``SUMS.txt``; otherwise the newest
            listing date is used. ``False`` when the dates differ or when the
            mirror has no checksum date although upstream lists the file.
        """
        upstream_page = requests.get(upstream_dir).text
        if upstream_page.find(cls._CHECKSUM_MARKER) == -1:
            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)

        upstream_date = cls.get_checksum_date(upstream_dir)
        try:
            csc_date = cls.get_checksum_date(csc_dir)
        except IndexError:
            # Upstream publishes the checksum file but the mirror listing has
            # no dated entry for it: the mirror is out of sync.
            return False
        return csc_date == upstream_date

    @classmethod
    def check_mirror(cls, csc_url: str, upstream_url: str, releases: list) -> bool:
        """Compare every release directory of the mirror against upstream.

        Args:
            csc_url: Base URL of the project on the CSC mirror.
            upstream_url: Base URL of the project upstream.
            releases: Relative directory names appended to both base URLs.

        Returns:
            ``True`` only if every release compares equal (also ``True`` for
            an empty ``releases``).
        """
        # Generator form stops at the first mismatch instead of downloading
        # the remaining release listings.
        return all(
            cls.compare_release(csc_url + release, upstream_url + release)
            for release in releases
        )

    @classmethod
    def check(cls, data, project, current_time):
        """Check if project packages are up-to-date.

        Args:
            data: Mapping that provides ``data[project]["csc"]``,
                ``data[project]["upstream"]`` and ``data[project]["file"]``.
            project: Key of this project inside ``data``.
            current_time: Not used by this checker.

        Returns:
            ``False`` when the mirror and upstream list different release
            directories; otherwise the result of comparing every release.
        """
        entry = data[project]
        csc_url = CSC_MIRROR + entry["csc"] + entry["file"]
        upstream_url = entry["upstream"] + entry["file"]

        csc_releases = []
        upstream_releases = []
        cls.scrape(csc_releases, csc_url)
        cls.scrape(upstream_releases, upstream_url)

        # A release present on only one side means the mirror is out of sync.
        if set(csc_releases) != set(upstream_releases):
            return False

        return cls.check_mirror(csc_url, upstream_url, upstream_releases)
xiph added 2021-10-16 03:40:14 -04:00			`from bs4 import BeautifulSoup`
			`import requests`
			`import datefinder # another date finding library`
			`import re`
			`from datetime import datetime`
			`from project import Project`
			`from shared import CSC_MIRROR`

			`# this function is brute force looping through the whole directory and checking dates`
			`# it may sound horrible, but for certain distros, i believe it's indeed the best solution`

			`class xiph(Project):`
			`"""xiph class"""`
			`@staticmethod`
			`def scrape(releases, site):`
			`# getting the request from url`
			`r = requests.get(site)`

			`# converting the text`
			`s = BeautifulSoup(r.text,"html.parser")`

			`for i in s.find_all("a"): # for a href directories`
			`href = i.attrs['href']`

			`if href.endswith("/") and href != "../" and href != "/" and href != "/pub/xiph/" and not href.startswith("http://"):`
			`if href not in releases:`
			`releases.append(href)`
			`# print(href)`

			`@staticmethod`
			`def get_latest_date(web_dir):`
			`page = requests.get(web_dir).text`

			`str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})\|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)`
			`# if you want to match 1+ patterns, like r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})\|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', note that findall will return a tuple of two groups!!!`
			`# print(str_dates[0])`
			`if len(str_dates) == 0:`
			`return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates`
			`# for date in str_dates:`
			`# print(date)`
			`# print("")`
			`dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]`

			`# for date in dates:`
			`# print(date)`
			`return(max(dates))`

			`def get_checksum_date(directory_URL):`
			`page = requests.get(directory_URL).text`
			`file_index = page.find("SUMS.txt")`
			`# print(page)`

			`# remove stray numbers (file size numbers in particular) that might interfere with date finding`
			`segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:]) # removes numbers for size`
			`segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:]) # removes numbers + size unit. e.x. 50kb`
			`# print(segment_clean)`

			`# finds the dates in the segment after the file name`
			`# notes: a generator will be returned by the datefinder module. I'm typecasting it to a list. Please read the note of caution provided at the bottom.`
			`matches = list(datefinder.find_dates(segment_clean))`
			`# print(matches[0])`

			`return matches[0]`

			`@classmethod`
			`def compare_release(cls, csc_dir, upstream_dir):`
			`page = requests.get(upstream_dir).text`
			`file_index = page.find("SUMS.txt")`
			`if file_index == -1:`
			`return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)`
			`else:`
			`return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)`

			`@classmethod`
			`def check_mirror(cls, csc_url, upstream_url, releases):`
			`compare = []`
			`for release in releases:`
			`compare.append(cls.compare_release(csc_url+release, upstream_url+release))`
			`return all(compare)`


			`@classmethod`
Fix parameter mismatch 2022-01-01 14:54:28 -05:00			`def check(cls, data, project, current_time):`
xiph added 2021-10-16 03:40:14 -04:00			`"""Check if project packages are up-to-date"""`
			`# lists`
			`releases1=[]`
			`releases2=[]`

			`csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]`
			`upstream_url = data[project]["upstream"] + data[project]["file"]`

			`# calling function`
			`cls.scrape(releases1, csc_url)`
			`cls.scrape(releases2, upstream_url)`

			`if set(releases1) != set(releases2):`
			`return False`

Fix parameter mismatch 2022-01-01 14:54:28 -05:00			`return cls.check_mirror(csc_url, upstream_url, releases2)`