mirror-checker/test.py

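# Overview (descriptive comments added; behaviour inferred from the code below):
#   - scrape() recursively crawls a directory listing (e.g. http://ports.ubuntu.com)
#     and collects every subdirectory URL; get_latest_date() returns the newest
#     timestamp found on a listing page.
#   - The __main__ block fetches https://repo.manjaro.org/ and checks that the
#     sync age reported for mirror.csclub.uwaterloo.ca/manjaro is at most 24 hours.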
from bs4 import BeautifulSoup
import requests
import datefinder # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
# list of directory URLs collected while crawling
urls = []
home_site = "http://ports.ubuntu.com"
def scrape(site):
    """Recursively walk a directory listing and collect every subdirectory URL."""
    # getting the request from url
    r = requests.get(site)
    # converting the text
    s = BeautifulSoup(r.text, "html.parser")
    for i in s.find_all("a"):  # for a href directories
        href = i.attrs['href']
        if href.endswith("/") and href != "../" and href != "/":
            if home_site + href in urls:  # avoids the link to the parent directory
                continue
            site_next = site + href
            if site_next not in urls:
                urls.append(site_next)
                print(site_next)
                # calling itself on the subdirectory
                scrape(site_next)
def get_latest_date(web_dir):
    """Return the most recent 'YYYY-MM-DD HH:MM' timestamp found in a directory listing."""
    page = requests.get(web_dir).text
    str_dates = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', page)
    dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
    # for date in dates:
    #     print(date)
    return max(dates)
# main entry point
if __name__ == "__main__":
    # website to be scraped
    # site = "http://ports.ubuntu.com/ubuntu-ports/"
    # works on: https://www.x.org/releases/
    # calling function
    # scrape(site)
    # latest_date = get_latest_date(urls[0])
    # get_latest_date(urls[0])
    # for dir in urls:
    #     latest_date2 = get_latest_date(dir)
    #     if latest_date2 >= latest_date:
    #         latest_date = latest_date2
    # print(latest_date)

    # fetch the Manjaro mirror status page and locate our mirror's entry
    page = requests.get("https://repo.manjaro.org/").text
    indexOfFile = page.find("mirror.csclub.uwaterloo.ca/manjaro")
    # parse the "HH:MM" sync age into a timedelta
    # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
    m = re.search(r'(?P<hours>\d+):(?P<minutes>\d+)', page[indexOfFile:])
    duration = timedelta(**{key: float(val) for key, val in m.groupdict().items()})
    print(duration)
    # the mirror should be no more than 24 hours (86400 s) out of date
    print(duration <= pd.to_timedelta(86400, unit='s'))