mirror-checker/projects/linuxmint.py

from bs4 import BeautifulSoup
import requests
import datefinder  # date-parsing library used to normalize the matched strings
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# This checker brute-forces its way through the whole directory tree and
# compares the dates it finds. It may sound horrible, but for certain
# distros I believe it is indeed the best solution.
class linuxmint(Project):
    """linuxmint class"""

    @classmethod
    def scrape(cls, urls, site):
        # fetch the directory listing
        r = requests.get(site)
        # parse the HTML
        s = BeautifulSoup(r.text, "html.parser")
        for i in s.find_all("a"):  # follow the <a href> directory links
            href = i.attrs['href']
            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href
                if site_next not in urls:
                    urls.append(site_next)
                    # recurse into the newly discovered subdirectory
                    cls.scrape(urls, site_next)
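
    # Iterative variant of scrape (a sketch, not used by the checker): the same
    # crawl without recursion, in case a very deep mirror tree ever hits
    # Python's recursion limit. The name scrape_iterative is ours, not the
    # project's.
    @classmethod
    def scrape_iterative(cls, urls, site):
        stack = [site]
        while stack:
            current = stack.pop()
            r = requests.get(current)
            s = BeautifulSoup(r.text, "html.parser")
            for i in s.find_all("a"):
                href = i.attrs['href']
                if href.endswith("/") and href != "../" and href != "/":
                    site_next = current + href
                    if site_next not in urls:
                        urls.append(site_next)
                        stack.append(site_next)
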
    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # NOTE: to match more than one pattern, e.g.
        # r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})',
        # remember that findall returns one tuple per match, with one element
        # per group!
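        # A quick illustration of that pitfall (hypothetical, not checker code):
        #   re.findall(r'(a)|(b)', 'ab')  ->  [('a', ''), ('', 'b')]
        # so each tuple would need flattening before being fed to datefinder.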
        if not str_dates:
            # return a ridiculously old date to discard this entry,
            # since it has no dates
            return datetime(1000, 1, 1)
        # normalize each matched string into a datetime object
        dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
        return max(dates)

    @classmethod
    def max_date(cls, urls):
        # seed with the first directory, then keep the newest timestamp seen
        latest_date = cls.get_latest_date(urls[0])
        for web_dir in urls:
            latest_date2 = cls.get_latest_date(web_dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        return latest_date
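
    # Equivalent one-liner, shown for comparison (a sketch, not project code):
    #   return max(cls.get_latest_date(d) for d in urls)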

    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        urls1 = []
        urls2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # crawl both mirrors, then compare their newest timestamps
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)
        return cls.max_date(urls1) == cls.max_date(urls2)
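

# A minimal usage sketch (not part of the original module). The shape of
# `data` is inferred from how check() indexes it; the keys and URLs below
# are hypothetical placeholders, and the real config is loaded elsewhere.
if __name__ == "__main__":
    data = {
        "linuxmint": {
            "csc": "linuxmint/packages/",                   # hypothetical CSC path
            "upstream": "http://packages.linuxmint.com/",   # hypothetical upstream
            "file": "dists/",                               # hypothetical subpath
        }
    }
    print(linuxmint.check(data, "linuxmint"))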