from bs4 import BeautifulSoup
import requests
import datefinder  # another date-finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd

from project import Project
from shared import CSC_MIRROR

# This checker brute-forces its way through the whole directory tree and checks dates.
# It may sound horrible, but for certain distros I believe it is indeed the best solution.

# lists
urls = []

class linuxmint(Project):
    """linuxmint class"""

    @classmethod
    def scrape(cls, urls, site):
        # request the directory listing at this URL
        r = requests.get(site)

        # parse the returned HTML
        s = BeautifulSoup(r.text, "html.parser")

        for i in s.find_all("a"):  # every <a href="..."> entry in the listing
            href = i.attrs['href']

            if href.endswith("/") and href != "../" and href != "/":
                site_next = site + href

                if site_next not in urls:
                    urls.append(site_next)
                    # print(site_next)
                    # calling itself recursively on the subdirectory
                    cls.scrape(urls, site_next)

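    # Usage sketch (an illustration added here, not part of the original file):
    # starting from a hypothetical mirror root, scrape() fills the given list with
    # every reachable subdirectory URL, for example:
    #   found = []
    #   linuxmint.scrape(found, "http://mirror.example.org/linuxmint/")
    #   # found -> ["http://mirror.example.org/linuxmint/isos/", ...]
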
    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text

        str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})', page)
        # If you want to match more than one pattern, e.g.
        # r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})',
        # note that findall will then return a tuple of two groups!
        # print(str_dates[0])
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry, since it has no dates
            return datetime(1000, 1, 1)
        # for date in str_dates:
        #     print(date)
        dates = [list(datefinder.find_dates(date))[0] for date in str_dates]

        # for date in dates:
        #     print(date)
        return max(dates)

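    # Note added for illustration (an assumption based on the regex above, not taken
    # from the original file): Apache/nginx-style index pages print timestamps such as
    # "07-Jul-2021 13:46", which is what the pattern matches and datefinder then parses:
    #   list(datefinder.find_dates("07-Jul-2021 13:46"))[0]
    #   # -> datetime.datetime(2021, 7, 7, 13, 46)
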
    @classmethod
    def max_date(cls, urls):
        latest_date = cls.get_latest_date(urls[0])
        # get_latest_date(urls[0])
        for dir in urls:
            latest_date2 = cls.get_latest_date(dir)
            if latest_date2 >= latest_date:
                latest_date = latest_date2
        # print(latest_date)
        return latest_date

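    # Equivalent one-liner, shown only as a sketch of what max_date computes:
    #   max(linuxmint.get_latest_date(d) for d in urls)
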
    @classmethod
    def check(cls, data, project):
        """Check if project packages are up-to-date"""
        # lists of directory URLs for the CSC mirror and the upstream mirror
        urls1 = []
        urls2 = []

        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        # scrape both mirrors
        cls.scrape(urls1, csc_url)
        cls.scrape(urls2, upstream_url)

        # print(len(urls1), len(urls2))

        return cls.max_date(urls1) == cls.max_date(urls2)
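

# A minimal usage sketch added for illustration: the key names "csc", "upstream" and
# "file" come from check() above, but the values below are placeholders, not part of
# the original file; replace them with a real mirror configuration before running.
if __name__ == "__main__":
    data = {
        "linuxmint": {
            "csc": "linuxmint/",                        # path under CSC_MIRROR (placeholder)
            "upstream": "http://mirrors.example.org/",  # hypothetical upstream root
            "file": "packages/",                        # subtree whose dates are compared
        }
    }
    # True when the newest timestamp on the CSC mirror matches the upstream one
    print(linuxmint.check(data, "linuxmint"))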