made a few refinements

Tom 2021-10-17 20:50:21 -07:00
parent c27fc9b502
commit 48ac2b71e5
3 changed files with 14 additions and 79 deletions

README.md

@@ -6,6 +6,14 @@ This mirror status checker determines whether CSC mirror is up-to-date with upstream
A configuration file may be provided through standard input. Without a configuration file, execute `python main.py`; by default, all the available distributions will be checked. With a configuration file, execute `python main.py < name_of_config_file.in`, for example `python main.py < example.in`; in this case, only the distributions listed in the configuration file will be checked.
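For illustration, a config file might simply list distribution names, one per line; the exact format is whatever main.py expects, and of the names below only `mxlinux_iso` actually appears in this repo:

```
mxlinux_iso
debian
```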
## dev notes
How the program works: a general mirror checker class, defined in project.py, checks whether the timestamp in the mirror's directory is in sync with the upstream. Then, for each CSC mirror, a class is built that inherits from the general project.py class but often overrides the original check function with a check function specific to that mirror. A few big themes: some checkers consult a mirror status tracker provided by the project being mirrored; some check all the Release files for each version in a distro; and so on. Website information that all the mirror checker classes need is stored in the data.json file.
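A minimal sketch of that pattern (the `check(data, project)` signature matches how test.py calls it; the base class name and the data.json fields here are assumptions for illustration, not the actual code):

```python
import requests


class Project:
    """General checker (project.py): is the mirror's timestamp in sync with upstream?"""

    @staticmethod
    def check(data, project):
        # "csc" and "upstream" are assumed data.json fields, for illustration only
        mirror = requests.get(data[project]["csc"]).text.strip()
        upstream = requests.get(data[project]["upstream"]).text.strip()
        return mirror == upstream


class mxlinux_iso(Project):
    """Mirror-specific checker: overrides check() with logic for this one mirror."""

    @staticmethod
    def check(data, project):
        # e.g. consult the project's own status tracker, or compare the
        # Release files for every version of the distro
        ...
```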
Future notes: many of the mirror checkers are built to be very specific to one mirror, so a slight change in the way a project manages its mirror-related websites, public repos, etc. can drastically affect whether its checker still works correctly. These problems are unfortunately also very hard to detect, so it is important that CSC actively maintain the mirror checker for it to keep working as intended in the long term.
Extra notes: a test client for individual mirror checker classes is provided as test.py. To use it, simply change all occurrences of the imported project class (currently `mxlinux_iso`) to the class you want to test, as in the sketch below.
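For example, to point the test client at a hypothetical `debian` checker class instead:

```python
from projects import debian  # hypothetical class name, swapped in for mxlinux_iso
import json

with open("data.json", "r", encoding="utf-8") as file:
    data = json.load(file)
print(debian.check(data, "debian"))
```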
## Resources
- [CSC Mirror](http://mirror.csclub.uwaterloo.ca/)

main.py

@@ -8,8 +8,8 @@ import time
 import sys
 import requests
-from projects import *  # noqa
-import json  # import json to read project info stored in json file
+from projects import *
+import json
 if __name__ == "__main__":
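For context, a sketch of how an entry point like this might consume the optional stdin config described in the README; everything below other than the imports is an assumption, not the actual main.py:

```python
import sys
import json
from projects import *  # noqa

if __name__ == "__main__":
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    if sys.stdin.isatty():
        names = list(data)                 # no config file: check everything
    else:
        names = sys.stdin.read().split()   # config file: only the listed distros
    for name in names:
        print(name, globals()[name].check(data, name))
```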

test.py

@@ -1,86 +1,13 @@
"""
Test Client for individual classes in projects
"""
from bs4 import BeautifulSoup
import requests
import datefinder  # date-parsing library
import re
from datetime import datetime
from projects import mxlinux_iso
import json  # to read the project info stored in data.json
# scrape() brute-forces its way through the whole directory tree so the dates can
# be checked; that may sound horrible, but for certain distros it is arguably the
# best solution
urls = []  # directory URLs discovered so far
home_site = "https://cdimage.ubuntu.com"
def scrape(site):
    # fetch the directory listing and parse it
    r = requests.get(site)
    s = BeautifulSoup(r.text, "html.parser")
    # SaltStack-specific code:
    # s = s.find("div", {"id": "listing"})
    for i in s.find_all("a"):  # every <a href> in the listing
        href = i.attrs['href']
        if href.endswith("/") and href != "../" and href != "/":
            if home_site + href in urls:  # avoids the link to the parent directory
                continue
            # NetBSD-specific code:
            # if href == "//ftp.netbsd.org/":
            #     continue
            site_next = site + href
            if site_next not in urls:
                urls.append(site_next)
                print(site_next)
                scrape(site_next)  # recurse into the subdirectory
def get_latest_date(web_dir):
    page = requests.get(web_dir).text
    # matches either "DD-Mon-YYYY HH:MM" or "YYYY-MM-DD HH:MM"; findall returns a
    # (group1, group2) tuple per match, one of which is always the empty string
    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
    dates = [list(datefinder.find_dates(date[0] or date[1]))[0] for date in str_dates]
    if len(dates) == 0:
        # return a ridiculously old date to discard this entry, since it has no dates
        return datetime(1000, 1, 1)
    return max(dates)
# main function
if __name__ == "__main__":
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    print(mxlinux_iso.check(data, "mxlinux_iso"))
"""# website to be scrape
site="https://cdimage.ubuntu.com/releases/"
# works on:
# https://www.x.org/releases/
# calling function
scrape(site)
latest_date = get_latest_date(urls[0])
# get_latest_date(urls[0])
for dir in urls:
latest_date2 = get_latest_date(dir)
if (latest_date2 >= latest_date):
latest_date = latest_date2
print(latest_date)"""