76 lines
2.1 KiB
Python
76 lines
2.1 KiB
Python
|
from bs4 import BeautifulSoup
|
||
|
import requests
|
||
|
import datefinder # another date finding library
|
||
|
import re
|
||
|
from datetime import datetime
|
||
|
from datetime import timedelta
|
||
|
import time
|
||
|
import pandas as pd
|
||
|
|
||
|
# Module-level state used by scrape().

# Directory URLs collected so far, in the order scrape() discovered them.
urls = []

# Scheme and host that scrape() prefixes to an href when checking whether
# the link points at a URL that has already been collected.
home_site = "http://ports.ubuntu.com"
|
||
|
|
||
|
# function created
|
||
|
def scrape(site):
    """Recursively collect the sub-directory URLs reachable from *site*.

    Fetches *site*, parses the response as HTML and looks at every anchor
    that has an ``href``.  An href is followed when it ends with "/" and is
    neither "../" nor "/".  Each newly found URL is appended to the
    module-level ``urls`` list, printed, and then scraped in turn.

    Args:
        site: URL of the page to start from.  Hrefs are appended to it
            verbatim, so it is expected to end with "/".

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    response = requests.get(site, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # href=True skips anchors without an href attribute (e.g. <a name="...">),
    # which would otherwise raise KeyError on lookup.
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]

        # Only follow directory-style links; never "../" or the server root.
        if not href.endswith("/") or href in ("../", "/"):
            continue

        # Avoids links back to a parent directory: a server-absolute href
        # that resolves to an already collected URL is not followed again.
        if home_site + href in urls:
            continue

        site_next = site + href
        if site_next in urls:
            continue

        urls.append(site_next)
        print(site_next)
        # Descend into the directory that was just discovered.
        scrape(site_next)
|
||
|
|
||
|
def get_latest_date(web_dir):
    """Return the most recent "YYYY-MM-DD HH:MM" timestamp found on a page.

    Args:
        web_dir: URL of the page to download and scan for timestamps.

    Returns:
        The largest of the datetimes that datefinder parsed from the
        timestamp strings on the page.

    Raises:
        ValueError: if the page contains no timestamp that could be parsed.
        requests.RequestException: if the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # Fetch the page named by the argument rather than relying on a global.
    page = requests.get(web_dir, timeout=30).text

    str_dates = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', page)

    dates = []
    for text in str_dates:
        # find_dates() is lazy; take its first hit and skip strings that the
        # regex matched but datefinder could not turn into a date.
        parsed = next(iter(datefinder.find_dates(text)), None)
        if parsed is not None:
            dates.append(parsed)

    if not dates:
        # max() on an empty list would raise a ValueError with no context.
        raise ValueError("no timestamps found at {}".format(web_dir))

    return max(dates)
|
||
|
|
||
|
# main function
|
||
|
if __name__ == "__main__":

    # Example of the directory-crawl workflow, kept for reference:
    #
    # website to be scraped
    # site="http://ports.ubuntu.com/ubuntu-ports/"
    # works on: https://www.x.org/releases/
    #
    # calling function
    # scrape(site)
    #
    # latest_date = get_latest_date(urls[0])
    # get_latest_date(urls[0])
    # for dir in urls:
    #     latest_date2 = get_latest_date(dir)
    #     if (latest_date2 >= latest_date):
    #         latest_date = latest_date2
    #
    # print(latest_date)

    mirror = "mirror.csclub.uwaterloo.ca/manjaro"

    # A timeout keeps the script from hanging on an unresponsive server.
    page = requests.get("https://repo.manjaro.org/", timeout=30).text

    index_of_file = page.find(mirror)
    if index_of_file == -1:
        # str.find() signals "not found" with -1; slicing from -1 would
        # silently search only the last character of the page.
        raise SystemExit("{} is not listed on https://repo.manjaro.org/".format(mirror))

    # First "H:M" value after the mirror's entry -- presumably the mirror's
    # sync delay; confirm against the page layout.
    # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
    m = re.search(r'(?P<hours>\d+):(?P<minutes>\d+)', page[index_of_file:])
    if m is None:
        raise SystemExit("no hours:minutes value found after {}".format(mirror))

    # The named groups map directly onto timedelta's keyword arguments.
    duration = timedelta(**{key: float(val) for key, val in m.groupdict().items()})
    print(duration)

    # True when the duration is at most one day (86400 seconds).
    print(duration <= pd.to_timedelta(86400, unit='s'))
|
||
|
|