forked from public/mirror-checker
parent c27fc9b502
commit 48ac2b71e5

from bs4 import BeautifulSoup
import requests
import datefinder  # another date-finding library
import re  # also needed for the SaltStack-specific parsing
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd

"""
Test client for individual classes in projects
"""

from projects import mxlinux_iso
import json  # read the project info stored in a JSON file

# The scrape function below brute-forces its way through the whole directory tree, checking dates.
# That may sound horrible, but for certain distros I believe it is indeed the best solution.

# list of directory URLs discovered by scrape()
urls = []

home_site = "https://cdimage.ubuntu.com"

def scrape(site):
    # request the directory listing page
    r = requests.get(site)

    # parse the returned HTML
    s = BeautifulSoup(r.text, "html.parser")

    # SaltStack-specific code
    # s = s.find("div", {"id": "listing"})
    # print(s)

    for i in s.find_all("a"):  # walk the <a href> entries in the listing
        href = i.attrs['href']

        if href.endswith("/") and href != "../" and href != "/":
            if home_site + href in urls:  # avoids the link back to the parent directory
                continue
            """if href == "//ftp.netbsd.org/":  # NetBSD-specific code
                continue"""
            site_next = site + href

            if site_next not in urls:
                urls.append(site_next)
                print(site_next)
                # recurse into the subdirectory
                scrape(site_next)

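# A minimal offline illustration of the href filter above, run against a
# hand-written autoindex fragment (the HTML and version names here are
# hypothetical, not fetched from a real mirror); call _demo_href_walk() to try it:
def _demo_href_walk():
    sample = """
    <a href="../">Parent Directory</a>
    <a href="20.04/">20.04/</a>
    <a href="22.04/">22.04/</a>
    <a href="MD5SUMS">MD5SUMS</a>
    """
    for a in BeautifulSoup(sample, "html.parser").find_all("a"):
        href = a.attrs['href']
        # same filter as scrape(): keep subdirectory links, skip parent/root links and plain files
        if href.endswith("/") and href != "../" and href != "/":
            print(href)  # prints "20.04/" then "22.04/"
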
def get_latest_date(web_dir):
    page = requests.get(web_dir).text

    # match both "dd-Mon-yyyy hh:mm" and "yyyy-mm-dd hh:mm" listing timestamps
    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
    # findall() returns a tuple per match with one empty group, so parse whichever group matched
    dates = [list(datefinder.find_dates(m[0] or m[1]))[0] for m in str_dates]

    if len(dates) == 0:
        return datetime(1000, 1, 1)  # a ridiculously old date, so entries with no dates are discarded
    return max(dates)

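# A quick self-contained check of the timestamp regex above, against two made-up
# listing lines covering both accepted formats (the sample text is hypothetical);
# call _demo_date_regex() to try it:
def _demo_date_regex():
    sample = "foo.iso  14-Mar-2023 04:12  1.2G\nbar.iso  2023-03-15 09:30  900M"
    matches = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', sample)
    # one capture group per alternative; the group that did not match is an empty string
    print(matches)  # [('14-Mar-2023 04:12', ''), ('', '2023-03-15 09:30')]
    latest = max(list(datefinder.find_dates(m[0] or m[1]))[0] for m in matches)
    print(latest)  # expected: 2023-03-15 09:30:00
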
# main entry point
if __name__ == "__main__":
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    print(mxlinux_iso.check(data, "mxlinux_iso"))

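# Note: the shape of data.json is defined elsewhere in the project; the sketch
# below is purely hypothetical and only shows the kind of structure check()
# presumably receives, keyed by project name (not confirmed by this file):
#
# {
#     "mxlinux_iso": { ... project-specific fields ... }
# }
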
"""# website to be scraped
site = "https://cdimage.ubuntu.com/releases/"
# also works on:
# https://www.x.org/releases/

# call the scraper
scrape(site)

latest_date = get_latest_date(urls[0])
for web_dir in urls:
    latest_date2 = get_latest_date(web_dir)
    if latest_date2 >= latest_date:
        latest_date = latest_date2

print(latest_date)"""