"""Crawl an HTTP directory index and report the newest timestamp found.

The crawl is a brute-force walk over every sub-directory of a mirror,
reading the timestamps printed in each auto-generated index page.  That is
expensive, but for some distributions there is no cheaper signal of when a
mirror was last updated.
"""
import re
import time
from datetime import datetime
from datetime import timedelta
from typing import Optional

import datefinder  # another date finding library
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Every directory URL discovered by scrape(), in discovery order.
urls = []

home_site = "http://ftp.netbsd.org/pub"

# Seconds to wait on a single HTTP request; without a timeout requests.get()
# can block forever on a stalled mirror.
REQUEST_TIMEOUT = 30

# Timestamp formats matched in index pages: "01-Jan-2020 12:34" and
# "2020-01-01 12:34".  The alternation deliberately has no capture groups so
# that findall() yields the matched strings rather than tuples of groups.
DATE_PATTERN = re.compile(
    r'\d{2}-\w{3}-\d{4} \d{2}:\d{2}|\d{4}-\d{2}-\d{2} \d{2}:\d{2}'
)


def scrape(site):
    """Recursively collect every sub-directory URL reachable from *site*.

    Each newly discovered directory URL is appended to the module-level
    ``urls`` list and printed, then crawled in turn (depth-first).

    Args:
        site: URL of a directory index page; expected to end with "/",
            because child links are appended to it verbatim.
    """
    response = requests.get(site, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "html.parser")

    # href=True skips anchors that carry no href attribute at all.
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if not href.endswith("/"):
            continue  # not a directory link
        # Only relative child links can be appended to the current URL.
        # Parent links ("../"), root-relative links ("/", "/pub/"),
        # protocol-relative links ("//ftp.netbsd.org/") and absolute URLs
        # would all produce bogus paths, so they are skipped.
        if href.startswith(("/", "../")) or "://" in href:
            continue

        site_next = site + href
        if site_next not in urls:
            urls.append(site_next)
            print(site_next)
            scrape(site_next)


def get_latest_date(web_dir):
    """Return the most recent timestamp shown on the index page *web_dir*.

    Args:
        web_dir: URL of the directory index page to inspect.

    Returns:
        The latest ``datetime`` parsed from the page, or ``None`` when the
        page contains no recognisable timestamp.
    """
    page = requests.get(web_dir, timeout=REQUEST_TIMEOUT).text

    dates = []
    for text in DATE_PATTERN.findall(page):
        # datefinder may fail to parse something the loose regex matched
        # (e.g. a three-letter token that is not a month); skip those.
        parsed = next(iter(datefinder.find_dates(text)), None)
        if parsed is not None:
            dates.append(parsed)

    return max(dates) if dates else None


def main():
    """Crawl the configured mirror and print its newest timestamp."""
    # Website to be scraped.
    site = "http://ftp.netbsd.org/pub/NetBSD/"
    # Also works on: https://www.x.org/releases/
    # https://mirror.csclub.uwaterloo.ca/linuxmint/  (works wonders for linuxmint)
    # Unfortunately, linuxmint does not have a public repo; the worldwide
    # mirror LayerOnline listed on https://linuxmint.com/mirrors.php seems
    # like the best choice.
    scrape(site)

    latest_date: Optional[datetime] = None
    for directory in urls:
        candidate = get_latest_date(directory)
        if candidate is None:
            continue  # directory listing had no timestamps
        if latest_date is None or candidate >= latest_date:
            latest_date = candidate
    print(latest_date)

    # Alternative approach kept for reference (mirmon status page):
    #
    # page = requests.get("http://rsync-mxlinux.org/mirmon/index.html").text
    # indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
    # m = re.search(r'(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:])
    # # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
    # duration = pd.to_timedelta(m.group(0))
    # print (duration <= pd.to_timedelta(86400, unit='s'))


if __name__ == "__main__":
    main()