mirror-checker/projects/linuxmint.py

from bs4 import BeautifulSoup
import requests
import datefinder  # date-parsing library used to normalize the matched strings
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# This checker brute-forces its way through the whole directory tree and
# compares the dates it finds. It may sound horrible, but for certain
# distros I believe it is indeed the best solution.
class linuxmint(Project):
    """linuxmint class"""

    @classmethod
    def scrape(cls, urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the HTML
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # follow the <a href> directory links
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    # recurse into the newly discovered subdirectory
                    cls.scrape(urls, site_next)
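
    # Iterative variant of scrape (a sketch, not used by the checker): the same
    # crawl without recursion, in case a very deep mirror tree ever hits
    # Python's recursion limit. The name scrape_iterative is ours, not the
    # project's.
    @classmethod
    def scrape_iterative(cls, urls, site):
        stack = [site]
        while stack:
            current = stack.pop()
            r = requests.get(current)
            s = BeautifulSoup(r.text, "html.parser")
            for i in s.find_all("a"):
                href = i.attrs['href']
                if href.endswith("/") and href != "../" and href != "/":
                    site_next = current + href
                    if site_next not in urls:
                        urls.append(site_next)
                        stack.append(site_next)
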
    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # NOTE: to match more than one pattern, e.g.
        # r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})',
        # remember that findall returns one tuple per match, with one element
        # per group!
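        # A quick illustration of that pitfall (hypothetical, not checker code):
        #   re.findall(r'(a)|(b)', 'ab')  ->  [('a', ''), ('', 'b')]
        # so each tuple would need flattening before being fed to datefinder.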
        if not str_dates:
            # return a ridiculously old date to discard this entry,
            # since it has no dates
            return datetime(1000, 1, 1)
        # normalize each matched string into a datetime object
        dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        # seed with the first directory, then keep the newest timestamp seen
        latest_date = cls.get_latest_date(urls[0])
        for web_dir in urls:
            latest_date2 = cls.get_latest_date(web_dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        return latest_date
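
    # Equivalent one-liner, shown for comparison (a sketch, not project code):
    #   return max(cls.get_latest_date(d) for d in urls)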

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # crawl both mirrors, then compare their newest timestamps
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        return cls.max_date(urls1) == cls.max_date(urls2)
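

# A minimal usage sketch (not part of the original module). The shape of
# `data` is inferred from how check() indexes it; the keys and URLs below
# are hypothetical placeholders, and the real config is loaded elsewhere.
if __name__ == "__main__":
    data = {
        "linuxmint": {
            "csc": "linuxmint/packages/",                   # hypothetical CSC path
            "upstream": "http://packages.linuxmint.com/",   # hypothetical upstream
            "file": "dists/",                               # hypothetical subpath
        }
    }
    print(linuxmint.check(data, "linuxmint"))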