from bs4 import BeautifulSoup
import requests
import datefinder # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
# This module brute-forces its way through a whole mirror directory tree and
# checks modification dates. It may sound horrible, but for certain distros
# it is arguably the best available approach.

# Accumulator of every directory URL discovered by scrape().
urls = []
# Root of the mirror to crawl (referenced by the commented-out driver code).
home_site = "http://ftp.netbsd.org/pub"
def scrape(site):
    """Recursively collect every subdirectory URL under *site*.

    Fetches the directory listing at *site*, follows each anchor that looks
    like a subdirectory (href ending in "/"), appends the absolute URL to the
    module-level ``urls`` list, prints it, and recurses into it.  URLs already
    present in ``urls`` are skipped, so each directory is visited once.

    Args:
        site: Directory-listing URL, expected to end with "/" so that
            relative hrefs can be appended directly.
    """
    r = requests.get(site)
    s = BeautifulSoup(r.text, "html.parser")
    for anchor in s.find_all("a"):
        # .get avoids a KeyError on anchors that carry no href attribute.
        href = anchor.attrs.get("href", "")
        # Only descend into subdirectories; skip the parent ("../") and
        # root ("/") links that listings usually include.
        if href.endswith("/") and href != "../" and href != "/":
            if href == "//ftp.netbsd.org/":  # netbsd-specific: link back to host root
                continue
            site_next = site + href
            if site_next not in urls:
                urls.append(site_next)
                print(site_next)
                # Depth-first recursion into the subdirectory.
                scrape(site_next)
def get_latest_date(web_dir):
    """Return the newest modification timestamp found in the listing at *web_dir*.

    Scans the page for Apache-style ("02-Jan-2020 13:45") or ISO-style
    ("2020-01-02 13:45") timestamps and returns the maximum as a datetime.

    Args:
        web_dir: URL of the directory-listing page to inspect.

    Raises:
        ValueError: if the page contains no recognizable timestamps.
    """
    # BUG FIX: the original fetched the (undefined here) global `site`
    # instead of the `web_dir` parameter.
    page = requests.get(web_dir).text
    # Group-free alternation so findall() yields plain strings; the original
    # used two capturing groups, which makes findall() return tuples and
    # would crash datefinder.find_dates().
    str_dates = re.findall(
        r"\d{2}-\w{3}-\d{4} \d{2}:\d{2}|\d{4}-\d{2}-\d{2} \d{2}:\d{2}", page)
    dates = [list(datefinder.find_dates(s))[0] for s in str_dates]
    return max(dates)
# Script entry point.
if __name__ == "__main__":
    # NOTE: the recursive scrape()/get_latest_date() crawl of a full mirror
    # (e.g. http://ftp.netbsd.org/pub/NetBSD/) was the earlier approach; the
    # trace-file comparison below is a far cheaper freshness check.
    csc_url = "https://mirror.csclub.uwaterloo.ca/ubuntu-ports/project/trace/anonster.canonical.com"
    upstream_url = "http://ports.ubuntu.com/ubuntu-ports/project/trace/anonster.canonical.com"
    # Fetch upstream once instead of twice (the original issued the same
    # request for the print and again for the comparison).
    upstream_text = requests.get(upstream_url).text
    print(upstream_text)
    # True iff the mirror's trace file matches upstream, i.e. the mirror
    # is in sync for this archive.
    print(requests.get(csc_url).text == upstream_text)