mirror-checker/test.py

86 lines
2.5 KiB
Python
Raw Normal View History

2021-10-04 01:18:07 -04:00
from bs4 import BeautifulSoup
import requests
import datefinder # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
import re # for salt stack specifically
2021-10-17 20:59:35 -04:00
from projects import trisquel
import json # import json to read project info stored in json file
2021-10-04 02:47:55 -04:00
# The crawler in this file brute-forces its way through a whole directory
# tree, checking the dates on every page. That may sound horrible, but for
# certain distros it is believed to be the best available approach.

# Root of the mirror being crawled.
home_site = "https://cdimage.ubuntu.com"

# lists
# Every directory URL discovered so far.
urls = []
2021-10-04 01:18:07 -04:00
# function created
def scrape(site):
    """Recursively crawl the directory listing at *site*.

    Every sub-directory link found on the page is turned into a URL by
    appending its href to *site*. URLs not seen before are appended to the
    module-level ``urls`` list, printed, and crawled in turn.

    Args:
        site: URL of a directory-listing page. Child hrefs are appended to it
            verbatim, so it should end with "/".
    """
    # getting the request from url
    response = requests.get(site)
    # converting the text
    soup = BeautifulSoup(response.text, "html.parser")
    # salt stack specific code
    # soup = soup.find("div", {"id": "listing"})
    # print(soup)

    for anchor in soup.find_all("a"):  # for a href directories
        # Anchors without an href (e.g. <a name="...">) have nothing to
        # follow; indexing attrs['href'] directly raised KeyError on them.
        href = anchor.attrs.get("href")
        if not href:
            continue
        # Only follow directory links, and never the parent/root links.
        if not href.endswith("/") or href in ("../", "/"):
            continue
        if home_site + href in urls:  # avoids the link to parent directory
            continue
        # netbsd specific code (disabled):
        # if href == "//ftp.netbsd.org/":
        #     continue
        site_next = site + href
        if site_next not in urls:
            urls.append(site_next)
            print(site_next)
            # calling itself
            scrape(site_next)
def get_latest_date(web_dir):
    """Return the most recent timestamp shown on the page at *web_dir*.

    The page text is searched for timestamps written either as
    ``dd-Mon-yyyy hh:mm`` or as ``yyyy-mm-dd hh:mm``; each hit is parsed with
    ``datefinder`` and the maximum is returned.

    Args:
        web_dir: URL of the page to download and scan.

    Returns:
        The latest ``datetime`` found, or ``datetime(1000, 1, 1)`` when the
        page contains no parseable timestamp.
    """
    page = requests.get(web_dir).text

    # The pattern has two capture groups, so findall yields one
    # (first_format, second_format) tuple per hit with exactly one non-empty
    # member. Reading only index 1 dropped every first-format hit and crashed
    # on the resulting empty string.
    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)

    dates = []
    for first_format, second_format in str_dates:
        stamp = first_format or second_format
        # datefinder may fail to recognise a matched string; skip it rather
        # than indexing into an empty result.
        parsed = next(iter(datefinder.find_dates(stamp)), None)
        if parsed is not None:
            dates.append(parsed)

    if not dates:
        # return ridiculously old date to discard this entry, since it has no dates
        return datetime(1000, 1, 1)
    return max(dates)
# main function
if __name__ == "__main__":
    # Read the project info stored in data.json and run the trisquel check
    # against it, printing whatever the check returns.
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    print(trisquel.check(data, "trisquel"))

    # Disabled brute-force crawl driver, kept for reference. It was previously
    # parked in a bare string literal; comments make the intent explicit.
    #
    # # website to be scrape
    # site = "https://cdimage.ubuntu.com/releases/"
    # # works on:
    # # https://www.x.org/releases/
    #
    # # calling function
    # scrape(site)
    #
    # latest_date = get_latest_date(urls[0])
    # # get_latest_date(urls[0])
    # for dir in urls:
    #     latest_date2 = get_latest_date(dir)
    #     if (latest_date2 >= latest_date):
    #         latest_date = latest_date2
    #
    # print(latest_date)
2021-10-04 01:18:07 -04:00