diff --git a/README.md b/README.md index 5a92676..0b6ffe3 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,14 @@ This mirror status checker determines whether CSC mirror is up-to-date with upst A configuration file may be provided through standard input. Without a configuration file, execute `python main.py`. By default, all the available distributions will be checked. With a configuration file, execute `python main.py < name_of_config_file.in`, for example, `python main.py < example.in`. In this case, only the distributions listed in the configuration file will be checked. +## dev notes + +How the program works: We first have a general mirror check class called project.py which checks whether the timestamp in the directory of the mirror is in sync with the upstream. Then, for each CSC mirror, a class is built which inherits from the general project.py class but often overrides the original check function with a check function specific to the mirror. A few big themes are: some check a mirror status tracker provided by the mirrored project; some check all the Release files for each version in a distro, etc. Website information which all the mirror checker classes need is stored in the data.json file. + +Future notes: Because many of the mirror checkers are built very specifically for each mirror, a slight change in the way a project manages its mirror-related websites, public repos, etc. can drastically affect whether the mirror checker works correctly. These problems are also unfortunately very hard to detect, so it's important that CSC actively maintain the mirror checker so that it works as intended in the long term. + +Extra notes: A test client for individual mirror checker classes is provided as test.py. 
To use it, simply change all occurrences of the imported project class to the class you want to test + ## Resources - [CSC Mirror](http://mirror.csclub.uwaterloo.ca/) diff --git a/main.py b/main.py index 1ca8603..b9ddd8b 100644 --- a/main.py +++ b/main.py @@ -8,8 +8,8 @@ import time import sys import requests -from projects import * # noqa -import json # import json to read project info stored in json file +from projects import * +import json if __name__ == "__main__": diff --git a/test.py b/test.py index b6f01bc..f9a25ca 100644 --- a/test.py +++ b/test.py @@ -1,86 +1,13 @@ -from bs4 import BeautifulSoup -import requests -import datefinder # another date finding library -import re -from datetime import datetime -from datetime import timedelta -import time -import pandas as pd -import re # for salt stack specifically +""" +Test Client for individual classes in projects +""" + from projects import mxlinux_iso import json # import json to read project info stored in json file - -# this function is brute force looping through the whole directory and checking dates -# it may sound horrible, but for certain distros, i believe it's indeed the best solution - -# lists -urls=[] - -home_site = "https://cdimage.ubuntu.com" - -# function created -def scrape(site): - - # getting the request from url - r = requests.get(site) - - # converting the text - s = BeautifulSoup(r.text,"html.parser") - - # salt stack specific code - # s = s.find("div", {"id": "listing"}) - # print(s) - - for i in s.find_all("a"): # for a href directories - href = i.attrs['href'] - - if href.endswith("/") and href != "../" and href != "/": - if home_site+href in urls: # avoids the link to parent directory - continue - """if href == "//ftp.netbsd.org/": # netbsd specific code - continue""" - site_next = site+href - - if site_next not in urls: - urls.append(site_next) - print(site_next) - # calling it self - scrape(site_next) - -def get_latest_date(web_dir): - page = requests.get(web_dir).text - - str_dates = 
re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page) - # print(str_dates[0]) - dates = [list(datefinder.find_dates(date[1]))[0] for date in str_dates] - - # for date in dates: - # print(date) - - if len(dates) == 0: - return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates - return(max(dates)) # main function if __name__ =="__main__": with open("data.json", "r", encoding="utf-8") as file: data = json.load(file) print(mxlinux_iso.check(data, "mxlinux_iso")) - - """# website to be scrape - site="https://cdimage.ubuntu.com/releases/" - # works on: - # https://www.x.org/releases/ - - # calling function - scrape(site) - - latest_date = get_latest_date(urls[0]) - # get_latest_date(urls[0]) - for dir in urls: - latest_date2 = get_latest_date(dir) - if (latest_date2 >= latest_date): - latest_date = latest_date2 - - print(latest_date)""" \ No newline at end of file