from datetime import datetime
import re

import requests
from bs4 import BeautifulSoup
import datefinder  # date-parsing library; turns matched strings into datetime objects

from project import Project
from shared import CSC_MIRROR

# This class brute-forces its way through the whole directory tree and checks dates.
# That may sound horrible, but for certain distros I believe it is indeed the best solution.


class ubuntu_ports_releases(Project):
    """ubuntu_ports_releases class"""

    @classmethod
    def scrape(cls, urls, site):
        # fetch the directory listing for this URL
        r = requests.get(site)
        # parse the returned HTML
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):
            # follow only relative subdirectory links; skip the parent link,
            # the root, and absolute URLs that point off-site
            href = i.attrs['href']
            if (href.endswith("/") and href != "../" and href != "/"
                    and not href.startswith("/")
                    and not href.startswith(("http://", "https://"))):
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)
                    # recurse into the subdirectory
                    cls.scrape(urls, site_next)

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
        # When matching alternative patterns like the one above, note that findall
        # returns a tuple of two groups per match, only one of which is non-empty;
        # the "".join() below collapses each tuple back into a single string.
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry, since it has no dates
            return datetime(1000, 1, 1)
        dates = [list(datefinder.find_dates("".join(date)))[0] for date in str_dates]
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        latest_date = cls.get_latest_date(urls[0])
        for url in urls:
            latest_date2 = cls.get_latest_date(url)
            if latest_date2 > latest_date:
                latest_date = latest_date2
        # print(latest_date)
        return latest_date

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # crawl both directory trees, then compare their newest timestamps
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        # print(len(urls1), len(urls2))

        return cls.max_date(urls1) == cls.max_date(urls2)
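
# --- Usage sketch (assumptions flagged below, not part of the checker itself) ---
# A minimal example of how check() might be driven. The config dict here is a
# hypothetical placeholder; in the real project the "csc", "upstream", and "file"
# values come from whatever config the mirror checker loads at runtime.
if __name__ == "__main__":
    # quick illustration of the findall gotcha noted in get_latest_date():
    sample = "dists/    2024-01-15 12:30    -"
    print(re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', sample))
    # -> [('', '2024-01-15 12:30')]  each match is a tuple with one empty group

    # hypothetical config entry matching the data[project][...] keys used in check()
    data = {
        "ubuntu_ports_releases": {
            "csc": "ubuntu-ports-releases/",            # path under CSC_MIRROR (assumed)
            "upstream": "http://cdimage.ubuntu.com/",   # upstream base URL (assumed)
            "file": "releases/",                        # subdirectory to crawl (assumed)
        }
    }
    up_to_date = ubuntu_ports_releases.check(data, "ubuntu_ports_releases")
    print("ubuntu_ports_releases up to date:", up_to_date)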