forked from public/mirror-checker
made a few refinements
parent c27fc9b502
commit 48ac2b71e5
@@ -6,6 +6,14 @@ This mirror status checker determines whether CSC mirror is up-to-date with upstream
A configuration file may be provided through standard input. Without a configuration file, execute `python main.py`; by default, all available distributions will be checked. With a configuration file, execute `python main.py < name_of_config_file.in` (for example, `python main.py < example.in`); in this case, only the distributions listed in the configuration file will be checked.
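In short:

```sh
python main.py               # check every available distribution
python main.py < example.in  # check only the distributions listed in example.in
```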
## dev notes
How the program works: a general mirror-check class, `project.py`, checks whether the timestamp in the mirror's directory is in sync with upstream. For each CSC mirror, a class is built that inherits from the general `project.py` class but often overrides the original check function with one specific to that mirror. A few recurring patterns: some checkers consult a mirror status tracker provided by the mirrored project; some check all the Release files for each version in a distro; and so on. Website information that all the mirror checker classes need is stored in the `data.json` file. A minimal sketch of the pattern is shown below.
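A minimal sketch of that inheritance pattern (the class name and the `data.json` keys below are illustrative assumptions, not the actual schema; the real classes live in the `projects` package):

```python
import requests


class Project:
    """Generic checker: compare the mirror's timestamp file with upstream."""

    @staticmethod
    def check(data, project):
        # "csc", "upstream" and "file" are hypothetical data.json keys
        mirror = data[project]["csc"] + data[project]["file"]
        upstream = data[project]["upstream"] + data[project]["file"]
        return requests.get(mirror).text == requests.get(upstream).text


class example_distro(Project):
    """Mirror-specific checker overriding the generic check."""

    @staticmethod
    def check(data, project):
        # e.g. fetch and compare the Release file of every version of the
        # distro instead of a single timestamp file
        raise NotImplementedError
```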
Future notes: because many of the mirror checkers are built very specifically for each mirror, a slight change in the way a project manages its mirror-related websites, public repos, etc. can drastically affect whether the mirror checker works correctly. These problems are unfortunately very hard to detect, so it is important that CSC actively maintain the mirror checker so that it keeps working as intended in the long term.
Extra notes: a test client for individual mirror checker classes is provided as test.py. To use it, simply change all occurrences of the imported project class (e.g. `mxlinux_iso`) to the class you want to test, as in the sketch after this paragraph.
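For instance, assuming a hypothetical `debian` checker class exists in `projects`, the client becomes:

```python
from projects import debian  # swap mxlinux_iso for the class under test
import json

if __name__ == "__main__":
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    print(debian.check(data, "debian"))
```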
## Resources
- [CSC Mirror](http://mirror.csclub.uwaterloo.ca/)
main.py (4 changed lines)
@@ -8,8 +8,8 @@ import time
 import sys
 import requests
 
-from projects import *  # noqa
-import json  # import json to read project info stored in json file
+from projects import *
+import json
 
 
 if __name__ == "__main__":
test.py (81 changed lines)
@@ -1,86 +1,13 @@
-from bs4 import BeautifulSoup
-import requests
-import datefinder  # another date-finding library
-import re
-from datetime import datetime
-from datetime import timedelta
-import time
-import pandas as pd
-import re  # for salt stack specifically
 """
 Test Client for individual classes in projects
 """
 
 from projects import mxlinux_iso
 import json  # import json to read project info stored in json file
 
-# this function brute-forces through the whole directory, checking dates;
-# it may sound horrible, but for certain distros, I believe it is indeed the best solution
-
-# lists
-urls = []
-
-home_site = "https://cdimage.ubuntu.com"
-
-# function created
-def scrape(site):
-    # getting the request from url
-    r = requests.get(site)
-
-    # converting the text
-    s = BeautifulSoup(r.text, "html.parser")
-
-    # salt stack specific code
-    # s = s.find("div", {"id": "listing"})
-    # print(s)
-
-    for i in s.find_all("a"):  # for a href directories
-        href = i.attrs['href']
-
-        if href.endswith("/") and href != "../" and href != "/":
-            if home_site + href in urls:  # avoids the link to the parent directory
-                continue
-            """if href == "//ftp.netbsd.org/":  # netbsd specific code
-                continue"""
-            site_next = site + href
-
-            if site_next not in urls:
-                urls.append(site_next)
-                print(site_next)
-                # calling itself recursively
-                scrape(site_next)
-
-def get_latest_date(web_dir):
-    page = requests.get(web_dir).text
-
-    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)
-    # print(str_dates[0])
-    # each findall match is a tuple of the two alternative groups;
-    # use whichever group is non-empty
-    dates = [list(datefinder.find_dates(date[0] or date[1]))[0] for date in str_dates]
-
-    # for date in dates:
-    #     print(date)
-
-    if len(dates) == 0:
-        return datetime(1000, 1, 1)  # ridiculously old date, to discard this entry since it has no dates
-    return max(dates)
 
 # main function
 if __name__ == "__main__":
     with open("data.json", "r", encoding="utf-8") as file:
         data = json.load(file)
     print(mxlinux_iso.check(data, "mxlinux_iso"))
-
-"""# website to be scraped
-site = "https://cdimage.ubuntu.com/releases/"
-# works on:
-# https://www.x.org/releases/
-
-# calling function
-scrape(site)
-
-latest_date = get_latest_date(urls[0])
-# get_latest_date(urls[0])
-for dir in urls:
-    latest_date2 = get_latest_date(dir)
-    if latest_date2 >= latest_date:
-        latest_date = latest_date2
-
-print(latest_date)"""