"""Check whether our mirror packages are up to date."""
from bs4 import BeautifulSoup
import requests
import datefinder # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
# this function is brute force looping through the whole directory and checking dates
# it may sound horrible, but for certain distros, i believe it's indeed the best solution
# lists
# URLs of every directory discovered by scrape(); consumed in __main__.
# NOTE(review): this definition was missing although `urls` is used below —
# restored here so the module is importable.
urls = []
# Root of the mirror site to crawl — fill in before running.
home_site = ""
# function created
def scrape(site):
    """Recursively collect every subdirectory URL under *site* into ``urls``.

    This brute-forces the whole directory tree; for certain distros that is
    the only reliable approach (see module comment above).

    :param site: directory URL to crawl, expected to end with "/".
    """
    # getting the request from url
    r = requests.get(site)
    # converting the text
    s = BeautifulSoup(r.text, "html.parser")
    for anchor in s.find_all("a"):  # for a href directories
        href = anchor.attrs['href']
        # only descend into subdirectories, never the parent/root links
        if href.endswith("/") and href != "../" and href != "/":
            if href == "//":  # netbsd specific code — "//" is not a real subdir
                continue
            site_next = site + href
            if site_next not in urls:  # avoids re-visiting directories
                urls.append(site_next)
                # calling itself to descend one level deeper
                scrape(site_next)
def get_latest_date(web_dir):
    """Return the most recent modification timestamp listed in *web_dir*.

    Parses the HTML directory index for timestamps in either the
    "dd-Mon-yyyy hh:mm" or the "yyyy-mm-dd hh:mm" listing format.

    :param web_dir: URL of a directory index page.
    :return: the latest ``datetime`` found, or ``None`` if the page
             contains no recognizable timestamps.
    """
    # bug fix: previously fetched the global `site` instead of the parameter
    page = requests.get(web_dir).text
    # non-capturing alternation — capturing groups made findall() return
    # tuples, which datefinder cannot parse
    str_dates = re.findall(
        r'\d{2}-\w{3}-\d{4} \d{2}:\d{2}|\d{4}-\d{2}-\d{2} \d{2}:\d{2}', page)
    dates = [list(datefinder.find_dates(d))[0] for d in str_dates]
    # bug fix: the caller compares results with >=, so return the maximum
    # instead of returning None implicitly
    return max(dates) if dates else None
# main function
if __name__ == "__main__":
    # website to be scraped — works wonders for linuxmint; linuxmint has no
    # public repo, so the worldwide mirror (LayerOnline) seems the best choice
    urls.append(home_site)
    # crawl the whole tree, populating `urls`
    scrape(home_site)

    # find the newest timestamp across every directory we discovered
    latest_date = get_latest_date(urls[0])
    for directory in urls:
        latest_date2 = get_latest_date(directory)
        # skip pages with no parseable timestamps
        if latest_date2 is not None and (
                latest_date is None or latest_date2 >= latest_date):
            latest_date = latest_date2

    # compare our mirror's page against upstream to see whether they match
    csc_url = ""
    upstream_url = ""
    print(requests.get(csc_url).text == requests.get(upstream_url).text)