mirror-checker/projects/linuxmint.py

from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
from project import Project
from shared import CSC_MIRROR

# This checker brute-forces its way through the whole directory tree and checks the dates on every page.
# It may sound horrible, but for certain distros I believe it is indeed the best solution.

# Module-level list of URLs; the class below builds its own lists in check().
urls = []

class linuxmint(Project):
    """Mirror checker for linuxmint.

    Walks the directory listings under the CSC mirror and under upstream,
    collects the newest timestamp shown on any visited page of each tree,
    and reports the mirror as up to date when the two timestamps are equal.
    """

    # Ridiculously old date used for pages (or URL lists) that carry no
    # timestamps, so that they never win a "latest date" comparison.
    _NO_DATE = datetime(1000, 1, 1)

    # Timestamps as they appear in the listings, e.g. "14-Oct-2021 21:22".
    # A single capture group keeps re.findall returning plain strings; with
    # several alternated groups it would return tuples instead.
    _DATE_PATTERN = re.compile(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})')

    @classmethod
    def scrape(cls, urls, site):
        """Recursively collect the sub-directory URLs found below ``site``.

        Args:
            urls: list that is extended in place with every directory URL
                discovered; it also serves as the "already visited" record.
            site: URL of the directory listing to start from. Links are
                appended to it verbatim, so it is expected to end with "/".
        """
        response = requests.get(site)
        soup = BeautifulSoup(response.text, "html.parser")

        for anchor in soup.find_all("a"):
            # Anchors without an href (e.g. named anchors) are not links;
            # indexing attrs['href'] directly would raise KeyError on them.
            href = anchor.attrs.get("href")
            if not href:
                continue

            # Only follow directory links, and never the parent or the root.
            if not href.endswith("/") or href in ("../", "/"):
                continue

            site_next = site + href
            if site_next in urls:
                continue

            urls.append(site_next)
            cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        """Return the newest timestamp shown on the page at ``web_dir``.

        Args:
            web_dir: URL of the page to download and scan.

        Returns:
            The maximum of all timestamps matching ``DD-Mon-YYYY HH:MM`` on
            the page, or ``datetime(1000, 1, 1)`` when the page shows no
            parseable timestamp, so the entry is effectively discarded.
        """
        page = requests.get(web_dir).text

        dates = []
        for text in linuxmint._DATE_PATTERN.findall(page):
            # The regex is looser than a real date (\w{3} accepts any three
            # word characters), so datefinder may find nothing; skip those
            # instead of failing on an empty result.
            parsed = next(iter(datefinder.find_dates(text)), None)
            if parsed is not None:
                dates.append(parsed)

        if not dates:
            return linuxmint._NO_DATE
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        """Return the newest timestamp found across all pages in ``urls``.

        Each URL is fetched exactly once. An empty ``urls`` yields the same
        ``datetime(1000, 1, 1)`` sentinel that a page without dates does.
        """
        return max(
            (cls.get_latest_date(url) for url in urls),
            default=cls._NO_DATE,
        )

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date.

        Args:
            data: mapping that holds, under ``data[project]``, the keys
                "csc", "upstream" and "file" used to build the two URLs.
            project: key of the project entry inside ``data``.

        Returns:
            True when the newest timestamp found under the CSC mirror equals
            the newest timestamp found under upstream, False otherwise.
        """
        csc_urls = []
        upstream_urls = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        cls.scrape(csc_urls, csc_url)
        cls.scrape(upstream_urls, upstream_url)

        return cls.max_date(csc_urls) == cls.max_date(upstream_urls)
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`from bs4 import BeautifulSoup`
			`import requests`
			`import datefinder # another date finding library`
			`import re`
			`from datetime import datetime`
			`from datetime import timedelta`
			`import time`
			`import pandas as pd`
			`from project import Project`
			`from shared import CSC_MIRROR`

			`# this function is brute force looping through the whole directory and checking dates`
			`# it may sound horrible, but for certain distros, i believe it's indeed the best solution`

			`# lists`
			`urls=[]`

			`class linuxmint(Project):`
			`"""linuxmint class"""`
			`@classmethod`
			`def scrape(cls, urls, site):`
			`# getting the request from url`
			`r = requests.get(site)`

			`# converting the text`
			`s = BeautifulSoup(r.text,"html.parser")`

			`for i in s.find_all("a"): # for a href directories`
			`href = i.attrs['href']`

			`if href.endswith("/") and href != "../" and href != "/":`
			`site_next = site+href`

			`if site_next not in urls:`
			`urls.append(site_next)`
			`# print(site_next)`
			`# calling it self`
			`cls.scrape(urls, site_next)`

			`@staticmethod`
			`def get_latest_date(web_dir):`
			`page = requests.get(web_dir).text`

			`str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)`
			`# if you want to match 1+ patterns, like r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})\|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', note that findall will return a tuple of two groups!!!`
			`# print(str_dates[0])`
			`if len(str_dates) == 0:`
			`return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates`
			`# for date in str_dates:`
			`# print(date)`
			`dates = [list(datefinder.find_dates(date))[0] for date in str_dates]`

			`# for date in dates:`
			`# print(date)`
			`return(max(dates))`

			`@classmethod`
			`def max_date(cls, urls):`
			`latest_date = cls.get_latest_date(urls[0])`
			`# get_latest_date(urls[0])`
			`for dir in urls:`
			`latest_date2 = cls.get_latest_date(dir)`
			`if (latest_date2 >= latest_date):`
			`latest_date = latest_date2`
			`# print(latest_date)`
			`return latest_date`


			`@classmethod`
			`def check(cls, data, project):`
			`"""Check if project packages are up-to-date"""`
			`# lists`
			`urls1=[]`
			`urls2=[]`

			`csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]`
			`upstream_url = data[project]["upstream"] + data[project]["file"]`

			`# calling function`
			`cls.scrape(urls1, csc_url)`
			`cls.scrape(urls2, upstream_url)`

			`# print(len(urls1), len(urls2))`

			`return cls.max_date(urls1) == cls.max_date(urls2)`