@ -13,7 +13,7 @@ import pandas as pd
# Module-level state shared by the crawler.

# Directory URLs already recorded; scrape() checks candidates against this
# list before following them.
urls = []

# Base URL of the mirror being crawled. In the visible part of the file it is
# only referenced from disabled code inside scrape().
# Previous target: http://ykf.ca.distfiles.macports.org
home_site = "http://ftp.netbsd.org/pub"
# function created
def scrape ( site ) :
@ -30,6 +30,8 @@ def scrape(site):
if href . endswith ( " / " ) and href != " ../ " and href != " / " :
""" if home_site+href in urls: # avoids the link to parent directory
continue """
if href == " //ftp.netbsd.org/ " : # netbsd specific code
continue
site_next = site + href
if site_next not in urls :
@ -53,31 +55,29 @@ def get_latest_date(web_dir):
if __name__ == "__main__":
    # Entry point: crawl one mirror's directory listing, then print the
    # largest value get_latest_date() reports over the recorded directories.

    # Root of the directory tree to crawl.
    # Previous target:
    #   http://ykf.ca.distfiles.macports.org/MacPorts/mpdistfiles/
    site = "http://ftp.netbsd.org/pub/NetBSD/"
    # Also works on: https://www.x.org/releases/
    # https://mirror.csclub.uwaterloo.ca/linuxmint/ works wonders for
    # linuxmint. Unfortunately, linuxmint does not have a public repo; the
    # worldwide mirror LayerOnline listed on https://linuxmint.com/mirrors.php
    # seems like the best choice.

    # NOTE(review): scrape() is assumed to record the directories it visits
    # in the module-level `urls` list -- its body is not fully visible from
    # here; confirm.
    scrape(site)

    # Fail with a readable message instead of an IndexError when the crawl
    # recorded nothing.
    if not urls:
        raise SystemExit("no directories found under " + site)

    # Keep the largest value seen; `>=` means a later directory wins ties,
    # as in the original loop. Each directory is queried exactly once.
    latest_date = None
    for web_dir in urls:
        candidate = get_latest_date(web_dir)
        if latest_date is None or candidate >= latest_date:
            latest_date = candidate
    print(latest_date)

    # Disabled: mirror-freshness check against the mirmon status page. Kept
    # for reference; it reports whether the mirror's listed age is at most
    # one day (86400 seconds).
    #
    # page = requests.get("http://rsync-mxlinux.org/mirmon/index.html").text
    # indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
    # # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
    # m = re.search(r'(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:])
    # duration = pd.to_timedelta(m.group(0))
    # # https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive
    # print(duration <= pd.to_timedelta(86400, unit='s'))