mirror-checker/test.py

from bs4 import BeautifulSoup
import requests
import datefinder  # date-parsing library used to turn listing timestamps into datetime objects
import re
from datetime import datetime, timedelta
import time
import pandas as pd
# This scraper brute-forces its way through the whole directory tree and checks dates.
# It may sound horrible, but for certain distros I believe it's indeed the best solution.

# directory URLs discovered so far, filled in by scrape()
urls = []
home_site = "http://ftp.netbsd.org/pub"
# recursively walk an HTTP directory listing, collecting every subdirectory URL
def scrape(site):
    # fetch the directory listing
    r = requests.get(site)
    # parse the HTML
    s = BeautifulSoup(r.text, "html.parser")
    for i in s.find_all("a"):  # every <a href> in the listing
        href = i.attrs['href']
        # only follow subdirectories, never the parent or root link
        if href.endswith("/") and href != "../" and href != "/":
            # if home_site + href in urls:  # avoids the link to parent directory
            #     continue
            if href == "//ftp.netbsd.org/":  # netbsd specific code
                continue
            site_next = site + href
            if site_next not in urls:
                urls.append(site_next)
                print(site_next)
                # recurse into the subdirectory
                scrape(site_next)
# scan one directory listing page and return the newest timestamp it contains
def get_latest_date(web_dir):
    page = requests.get(web_dir).text
    # match both "14-Mar-2021 12:00" and "2021-03-14 12:00" style listing dates
    str_dates = re.findall(r'(?:\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(?:\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
    dates = [list(datefinder.find_dates(d))[0] for d in str_dates]
    # for date in dates:
    #     print(date)
    # return None when the page contains no recognizable dates
    return max(dates, default=None)
# entry point
if __name__ == "__main__":
    # website to be scraped
    site = "http://ftp.netbsd.org/pub/NetBSD/"
    # also works on: https://www.x.org/releases/
    # and on https://mirror.csclub.uwaterloo.ca/linuxmint/ (works wonders for linuxmint)
    # unfortunately, linuxmint does not have a public repo; the worldwide mirror LayerOnline on
    # https://linuxmint.com/mirrors.php seems like the best choice
    # walk the whole tree, filling the global urls list
    scrape(site)
    # keep the newest timestamp seen across every directory
    latest_date = get_latest_date(urls[0])
    for dir in urls:
        latest_date2 = get_latest_date(dir)
        if latest_date2 is not None and (latest_date is None or latest_date2 >= latest_date):
            latest_date = latest_date2
    print(latest_date)
"""page = requests.get("http://rsync-mxlinux.org/mirmon/index.html").text
indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
m = re.search(r'(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:]) # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
duration = pd.to_timedelta(m.group(0))
print (duration <= pd.to_timedelta(86400, unit='s'))"""
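
# A minimal sketch (not part of the original script) of how the commented-out
# mirmon check above could be wrapped into a reusable helper. The status URL,
# mirror hostname, and 24-hour threshold come from that dead code; the function
# name and parameters are assumptions for illustration only.
def mirror_is_fresh(status_url="http://rsync-mxlinux.org/mirmon/index.html",
                    mirror_host="mirror.csclub.uwaterloo.ca",
                    max_age=pd.to_timedelta(86400, unit='s')):
    # fetch the mirmon status page and locate the entry for the mirror of interest
    page = requests.get(status_url).text
    index_of_mirror = page.find(mirror_host)
    if index_of_mirror == -1:
        return False
    # mirmon reports the mirror's age as e.g. "5 hours" or "1.5 days"
    m = re.search(r'(\d+ hours)|(\d+(\.)?\d+ days)', page[index_of_mirror:])
    if m is None:
        return False
    # the mirror counts as fresh while its reported age stays within max_age
    return pd.to_timedelta(m.group(0)) <= max_age

# e.g. print(mirror_is_fresh()) would report whether the csclub mirror synced
# within the last 24 hours, independently of the directory crawl above.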