from bs4 import BeautifulSoup
import requests
import datefinder # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
# This function brute-forces its way through the whole directory tree and checks the dates.
# It may sound horrible, but for certain distros I believe it really is the best option.
# list of directory URLs discovered while crawling
urls = []

home_site = "http://ykf.ca.distfiles.macports.org"

# recursively collects every subdirectory URL reachable from `site`
def scrape(site):
    # getting the page at the given url
    r = requests.get(site)
    # parsing the html
    s = BeautifulSoup(r.text, "html.parser")
    for i in s.find_all("a"):  # for every <a href> entry in the listing
        href = i.attrs['href']
        if href.endswith("/") and href != "../" and href != "/":
            """ if home_site+href in urls:  # avoids the link to the parent directory
                continue """
            site_next = site + href
            if site_next not in urls:
                urls.append(site_next)
                print(site_next)
                # calling itself to descend into the subdirectory
                scrape(site_next)
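
# Minimal usage sketch (assumes the MacPorts mirror above is reachable; the crawl fetches
# every subdirectory once, so it can take a while on a large tree):
#
#   urls.clear()
#   scrape(home_site + "/MacPorts/mpdistfiles/")
#   print(len(urls), "directories found")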

# returns the most recent "last modified" timestamp listed in a directory page
def get_latest_date(web_dir):
    page = requests.get(web_dir).text

    # matches listing timestamps such as "24-Sep-2021 13:37" or "2021-09-24 13:37"
    # (non-capturing groups so findall returns plain strings)
    str_dates = re.findall(r'(?:\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(?:\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)

    dates = [list(datefinder.find_dates(date))[0] for date in str_dates]
    # for date in dates:
    #     print(date)
    return max(dates)
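
# Quick illustration on a hypothetical directory-listing row (not fetched from the network):
#
#   >>> row = '<a href="base.tar.gz">base.tar.gz</a>  24-Sep-2021 13:37  1.2M'
#   >>> re.findall(r'(?:\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(?:\d{4}-\d{2}-\d{2} \d{2}:\d{2})', row)
#   ['24-Sep-2021 13:37']
#   >>> list(datefinder.find_dates('24-Sep-2021 13:37'))[0]
#   datetime.datetime(2021, 9, 24, 13, 37)    # expected; exact output depends on datefinder's parsing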

# main function
if __name__ == "__main__":
    # website to be scraped
    # site = "http://ykf.ca.distfiles.macports.org/MacPorts/mpdistfiles/"

    # works on: https://www.x.org/releases/
    #           https://mirror.csclub.uwaterloo.ca/linuxmint/  (works wonders for linuxmint)
    # unfortunately, linuxmint does not have a public repo; the worldwide mirror LayerOnline
    # on https://linuxmint.com/mirrors.php seems like the best choice
    # calling the crawler
    # scrape(site)

    # latest_date = get_latest_date(urls[0])
    # for dir in urls:
    #     latest_date2 = get_latest_date(dir)
    #     if latest_date2 >= latest_date:
    #         latest_date = latest_date2
    # print(latest_date)

    # check the mirmon status page of the MX Linux mirror network and locate our mirror's entry
    page = requests.get("http://rsync-mxlinux.org/mirmon/index.html").text
    indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
2021-10-04 02:47:55 -04:00
m = re . search ( r ' ( \ d+ hours)|( \ d+( \ .)? \ d+ days) ' , page [ indexOfFile : ] ) # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
2021-10-04 01:18:07 -04:00
2021-10-04 02:47:55 -04:00
duration = pd . to_timedelta ( m . group ( 0 ) )
2021-10-04 01:18:07 -04:00
print ( duration < = pd . to_timedelta ( 86400 , unit = ' s ' ) )
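
    # Illustration with a made-up value (not taken from the live status page):
    #
    #   >>> pd.to_timedelta("1.5 days")
    #   Timedelta('1 days 12:00:00')
    #
    # so a mirror reporting "1.5 days" fails the 24-hour freshness check above,
    # while one reporting e.g. "3 hours" passes.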

    # https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive