Compare commits

..

16 Commits

Author SHA1 Message Date
Raymond Li 90bde5a754 Merge pull request 'Fixed slackware checker' (#12) from jtoft-rt-4117-slackware into master
Reviewed-on: public/mirror-checker#12
Reviewed-by: Raymond Li <raymo@csclub.uwaterloo.ca>
2022-07-30 20:30:07 -04:00
Justin Toft 55f3e7bcbe
Fixed slackware checker 2022-07-30 19:29:53 -04:00
Justin Toft 860d5c8e44 Fixed 6 broken mirror checkers (#11)
Note that I updated the time tolerance for Fedora to 259200 seconds (3 days) and changed the ubuntu releases url

Co-authored-by: Justin Toft <justintoft10@gmail.com>
Reviewed-on: public/mirror-checker#11
Reviewed-by: Raymond Li <raymo@csclub.uwaterloo.ca>
Co-authored-by: Justin Toft <jtoft@uwaterloo.ca>
Co-committed-by: Justin Toft <jtoft@uwaterloo.ca>
2022-07-30 01:14:45 -04:00
Raymond Li 65b06ac0a8 Merge pull request 'Fixed broken ubuntu and ubuntu_ports mirror checks' (#10) from jtoft-rt-4117 into master
Reviewed-on: public/mirror-checker#10
Reviewed-by: Raymond Li <raymo@csclub.uwaterloo.ca>
2022-06-10 22:40:16 -04:00
Justin Toft 5afebc8030 Fixed broken ubuntu and ubuntu_ports mirror checks 2022-06-10 21:22:33 -04:00
Raymond Li 54d8a47944 Merge pull request 'Update time stamp parsing in cran, ctan, mxlinux' (#9) from time-parsing into master
Reviewed-on: public/mirror-checker#9
2022-04-15 20:37:17 -04:00
Rio Liu 789f0cd662 update time stamp parsing in cran, ctan, mxlinux 2022-04-14 13:56:59 -04:00
Raymond Li e1ef917af0 Merge pull request 'fix CTAN && trust CPAN's report' (#8) from y266shen/mirror-checker:feature-retry into master
Reviewed-on: public/mirror-checker#8
2022-03-30 18:59:21 -04:00
Yiao Shen 0571f7353a
projects/ctan: make Python happy 2022-03-30 00:15:08 -04:00
Yiao Shen 612ec9d04b
projects/cpan: trust CPAN's mirror status 2022-03-26 22:04:51 -04:00
Raymond Li 73bc2b5ade Merge pull request 'various mirror fixes' (#7) from y266shen/mirror-checker:feature-retry into master
Reviewed-on: public/mirror-checker#7
2022-03-26 21:24:22 -04:00
Yiao Shen 9f345c7b0c
main: display how long a mirror has been out-of-sync 2022-03-26 21:06:23 -04:00
Yiao Shen 660b566715
projects/freebsd: correctly parse FreeBSD's timestamp file 2022-03-26 21:04:17 -04:00
Yiao Shen f408922a96
projects/apache: correctly parse Apache's timestamp file 2022-03-26 21:03:51 -04:00
Raymond Li 4a3cedfb07 Merge pull request 'implement retry' (#6) from y266shen/mirror-checker:feature-retry into master
Reviewed-on: public/mirror-checker#6
2022-03-26 20:24:41 -04:00
Yiao Shen 76f7863a85
main: retry 3 times before reporting errors
- modify `check_project` function so that it returns error reason
  string or returns None
- add some type annotations
2022-03-26 20:13:57 -04:00
19 changed files with 697 additions and 468 deletions

View File

@ -48,7 +48,8 @@
"file": "timestamp"
},
"CPAN": {
"out_of_sync_interval": 172800
"out_of_sync_interval": 172800,
"out_of_sync_since": null
},
"cran": {
"out_of_sync_since": null,
@ -86,7 +87,7 @@
"file": "debian-cd/project/trace/cdimage.debian.org"
},
"DebianMultimedia": {
"out_of_sync_since": null,
"out_of_sync_since": 1659116719,
"out_of_sync_interval": 86400,
"csc": "debian-multimedia/",
"upstream": "http://debian-mirrors.sdinet.de/deb-multimedia/",
@ -116,7 +117,7 @@
},
"Fedora": {
"out_of_sync_since": null,
"out_of_sync_interval": 86400,
"out_of_sync_interval": 259200,
"csc": "fedora/",
"upstream": "http://fedora.mirror.iweb.com/",
"file": "linux/development/rawhide/COMPOSE_ID"
@ -173,7 +174,7 @@
"out_of_sync_interval": 172800
},
"KDE": {
"out_of_sync_since": null,
"out_of_sync_since": 1659116720,
"out_of_sync_interval": 86400,
"csc": "kde/",
"upstream": "https://kde.c3sl.ufpr.br/",
@ -208,7 +209,7 @@
"file": "dists/"
},
"macports": {
"out_of_sync_since": null,
"out_of_sync_since": 1642827723,
"out_of_sync_interval": 86400,
"csc": "MacPorts/mpdistfiles/",
"upstream": "https://distfiles.macports.org/",
@ -269,7 +270,7 @@
"exclude": true
},
"opensuse": {
"out_of_sync_since": null,
"out_of_sync_since": 1648699331,
"out_of_sync_interval": 86400,
"csc": "opensuse/update/",
"upstream": "http://opensuse-mirror-gce-us.opensu.se/update/",
@ -320,7 +321,7 @@
"file": "dists/"
},
"raspbian": {
"out_of_sync_since": null,
"out_of_sync_since": 1659116721,
"out_of_sync_interval": 86400,
"csc": "raspbian/",
"upstream": "http://archive.raspbian.org/",
@ -341,7 +342,7 @@
"file": ""
},
"slackware": {
"out_of_sync_since": null,
"out_of_sync_since": 1642827723,
"out_of_sync_interval": 86400,
"csc": "slackware/",
"upstream": "https://mirrors.slackware.com/slackware/",
@ -373,7 +374,7 @@
"file": ""
},
"ubuntu_ports": {
"out_of_sync_since": null,
"out_of_sync_since": 1651550528,
"out_of_sync_interval": 86400,
"csc": "ubuntu-ports/",
"upstream": "http://ports.ubuntu.com/ubuntu-ports/",
@ -390,7 +391,7 @@
"out_of_sync_since": null,
"out_of_sync_interval": 172800,
"csc": "",
"upstream": "https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-release",
"upstream": "https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-archive",
"file": ""
},
"vlc": {
@ -401,7 +402,7 @@
"file": "trace"
},
"x_org": {
"out_of_sync_since": null,
"out_of_sync_since": 1657512131,
"out_of_sync_interval": 86400,
"csc": "x.org/individual/",
"upstream": "https://www.x.org/releases/individual/",

44
main.py
View File

@ -8,46 +8,68 @@ import time
import sys
import requests
from multiprocessing import Pool, Manager
from typing import Optional
from time import sleep, localtime, strftime
from projects import *
import json
NUM_THREAD = 16
current_time = int(time.time())
MAX_RETRY = 3
RETRY_TIMEOUT = 30 # In seconds
def safe_print(*args, **kwargs):
    """Print like the built-in ``print`` but always flush the stream.

    When run with 'chronic' and 'timeout', stdout gets suppressed due to
    buffering, so every message is flushed as soon as it is written.

    Accepts the same positional and keyword arguments as ``print``. A
    ``flush`` value supplied by the caller is overridden rather than
    triggering ``TypeError`` for a duplicated keyword argument.
    """
    kwargs["flush"] = True
    print(*args, **kwargs)
def check_project(args) -> Optional[str]:
    """Run one project's checker and report whether it is out of sync.

    Args:
        args: A ``(project, data)`` pair. ``project`` is the name of a
            checker class available in this module; ``data`` maps project
            names to their settings (``exclude``, ``out_of_sync_since``,
            ``out_of_sync_interval``, ...). ``data[project]`` is updated
            in place.

    Returns:
        None if no error occurs, and a string with the error message
        otherwise (the project has been out of sync for longer than its
        ``out_of_sync_interval``, or an HTTP request failed).
    """
    current_time = int(time.time())
    project, data = args
    try:
        project_class = getattr(sys.modules[__name__], project)

        # Skip projects we no longer mirror
        if data[project].get('exclude', False):
            return None

        if project_class.check(data, project, current_time):
            data[project]["out_of_sync_since"] = None
            return None

        # Read after the check: some checkers record the timestamp themselves.
        out_of_sync_since = data[project]["out_of_sync_since"]
        if out_of_sync_since is None:
            # First failed check: start the grace period now.
            data[project]["out_of_sync_since"] = current_time
            return None

        duration = current_time - out_of_sync_since
        if duration > data[project]["out_of_sync_interval"]:
            now_str = strftime("%d %b %Y %H:%M:%S (local time)", localtime())
            return f"{project} out-of-sync at {now_str} for {duration}s"

        # Still inside the grace period. Keep the original timestamp:
        # overwriting it here would restart the window on every run and the
        # project could never be reported.
        return None
    except requests.exceptions.RequestException as err:
        return f"{project}\n{err}"
def check_project_with_retry(args) -> bool:
    """Check a project up to ``MAX_RETRY`` times before reporting errors.

    Args:
        args: The ``(project, data)`` pair forwarded to ``check_project``.

    Returns:
        True as soon as one attempt reports no error; False once every
        attempt has failed, after printing the reason of each attempt.
    """
    project, _ = args
    errs = []

    for attempt in range(MAX_RETRY):
        res = check_project(args)
        if res is None:
            safe_print(f"Success: {project} up-to-date")
            return True
        errs.append(res)
        # Wait before trying again; sleeping after the last attempt would
        # only delay the error report.
        if attempt < MAX_RETRY - 1:
            sleep(RETRY_TIMEOUT)

    # Max try reached, print errors
    safe_print(f"Error: {project}")
    for reason in errs:
        safe_print(f" {reason}")
    return False
@ -61,7 +83,7 @@ def main():
sync_data = manager.dict({k: manager.dict(v) for k, v in data.items()})
with Pool(NUM_THREAD) as pool:
all_pass = all(pool.imap(check_project, ((k, sync_data) for k in data.keys())))
all_pass = all(pool.imap(check_project_with_retry, ((k, sync_data) for k in data.keys())))
with open(data_file, "w", encoding="utf-8") as file:
json.dump({k: dict(v) for k, v in sync_data.items()}, file, indent=' ')

View File

@ -3,7 +3,35 @@ Contains Apache class
"""
from project import Project
from shared import CSC_MIRROR
import requests
class Apache(Project):
    """Apache class"""

    # Apache's time file has two segments, so we need a special function
    # Example: 1648323001 rsync-he-fi
    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Downloads the time file from the CSC mirror and from upstream.
        Returns True when both are identical, or when the upstream
        timestamp is less than ``out_of_sync_interval`` seconds ahead of
        ours; returns False otherwise or when a timestamp cannot be parsed.
        ``current_time`` is not used by this checker.
        """
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        req = requests.get(csc_url)
        req.raise_for_status()
        CSC = req.text

        req = requests.get(upstream_url)
        req.raise_for_status()
        upstream = req.text

        if upstream == CSC:
            return True

        try:
            return get_timestamp_from_apache(upstream) - get_timestamp_from_apache(CSC) < data[project]["out_of_sync_interval"]
        except ValueError:
            print("failed to parse apache")
            return False
def get_timestamp_from_apache(s: str) -> int:
    """Return the leading integer timestamp of an Apache time file.

    Example content: ``1648323001 rsync-he-fi``. The first
    whitespace-separated field is parsed, so a tab or newline after the
    timestamp is accepted as well as a space.

    Raises:
        ValueError: if the text is empty or its first field is not an
            integer.
    """
    fields = s.split(maxsplit=1)
    if not fields:
        raise ValueError("empty Apache time file")
    return int(fields[0])

View File

@ -15,7 +15,8 @@ class CPAN(Project):
def check(data, project, current_time):
res_json = requests.get("http://mirrors.cpan.org/cpan-json.txt").json()
for mirror in res_json:
if mirror["url"] == f"{CSC_MIRROR}CPAN/":
data[project]["out_of_sync_since"] = int(mirror["age"])
return current_time - data[project]["out_of_sync_since"] <= data[project]["out_of_sync_interval"]
if mirror["url"] == f"{CSC_MIRROR}CPAN/" and mirror["last_status"] == "ok":
# This is an improvised method: report we're good if CPAN think we are good
# Change this to a more precise method if you find a better way to do it
return True
return False

View File

@ -18,7 +18,7 @@ class cran(Project):
page = requests.get(data[project]["upstream"]).text
indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
m = re.search(r'(\d+ hour)|(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:]) # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
m = re.search(r'(\d+ minutes?)|(\d+ hours?)|(\d+(\.)?\d+ days?)', page[indexOfFile:])
duration = pd.to_timedelta(m.group(0))
data[project]["out_of_sync_since"] = current_time - duration.total_seconds()

View File

@ -7,7 +7,7 @@ from project import Project
from shared import CSC_MIRROR
import requests
import datefinder # another date finding library
from datetime import timedelta
from datetime import datetime, timedelta
import re
import pandas as pd
@ -18,9 +18,10 @@ class ctan(Project):
page = requests.get(data[project]["upstream"]).text
indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
m = re.search(r'(\d+ hour)|(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:]) # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
m = re.search(r'(\d+ minutes?)|(\d+ hours?)|(\d+(\.)?\d+ days?)', page[indexOfFile:])
duration = pd.to_timedelta(m.group(0))
data[project]["out_of_sync_since"] = datetime.now() - duration.total_seconds()
data[project]["out_of_sync_since"] = datetime.now() - duration
return duration <= pd.to_timedelta(data[project]["out_of_sync_interval"], unit='s')

View File

@ -3,7 +3,37 @@ Contains Debian class
"""
from project import Project
from shared import CSC_MIRROR
import requests
from datetime import datetime
import time
class Debian(Project):
    """Debian class"""

    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Downloads the trace file from the CSC mirror and from upstream and
        compares the date on the first line of each. Returns True when the
        files are identical or upstream is less than
        ``out_of_sync_interval`` seconds ahead; returns False otherwise or
        when a date cannot be parsed. ``current_time`` is not used.
        """
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        req = requests.get(csc_url)
        req.raise_for_status()
        CSC = req.text

        req = requests.get(upstream_url)
        req.raise_for_status()
        upstream = req.text

        if upstream == CSC:
            return True

        date_format = "%a %b %d %H:%M:%S UTC %Y"
        try:
            # Parsing is what can raise ValueError, so it belongs in the try.
            CSC_date = datetime.strptime(CSC.partition('\n')[0], date_format)
            upstream_date = datetime.strptime(upstream.partition('\n')[0], date_format)
        except ValueError:
            return False

        # Both stamps are UTC; subtracting the datetimes directly avoids
        # interpreting them as local time (and any DST shift between them).
        lag = int((upstream_date - CSC_date).total_seconds())
        return lag < data[project]["out_of_sync_interval"]

View File

@ -3,7 +3,40 @@ Contains DebianCD class
"""
from project import Project
from shared import CSC_MIRROR
import requests
from datetime import datetime
import time
class DebianCD(Project):
    """DebianCD class"""

    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Downloads the trace file from the CSC mirror and from upstream and
        compares the dates they contain. Returns True when the files are
        identical or upstream is less than ``out_of_sync_interval`` seconds
        ahead; returns False otherwise or when a date cannot be parsed.
        ``current_time`` is not used.
        """
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        req = requests.get(csc_url)
        req.raise_for_status()
        CSC = req.text

        req = requests.get(upstream_url)
        req.raise_for_status()
        upstream = req.text

        if upstream == CSC:
            return True

        # Date Format Example: Sun 27 Mar 00:20:12 UTC 2022
        # Surrounding whitespace is stripped first, so a file without a
        # trailing newline parses too.
        date_format = "%a %d %b %H:%M:%S UTC %Y"
        try:
            # Parsing is what can raise ValueError, so it belongs in the try.
            CSC_date = datetime.strptime(CSC.strip(), date_format)
            upstream_date = datetime.strptime(upstream.strip(), date_format)
        except ValueError:
            return False

        # Both stamps are UTC; subtracting the datetimes directly avoids
        # interpreting them as local time (and any DST shift between them).
        lag = int((upstream_date - CSC_date).total_seconds())
        return lag < data[project]["out_of_sync_interval"]

View File

@ -3,7 +3,40 @@ Contains Fedora class
"""
from project import Project
from shared import CSC_MIRROR
import requests
from datetime import datetime
import time
class Fedora(Project):
    """Fedora class"""

    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Downloads the compose ID file from the CSC mirror and from upstream
        and compares the dates embedded in them. Returns True when the
        files are identical or upstream is less than
        ``out_of_sync_interval`` seconds ahead; returns False otherwise or
        when a date cannot be parsed. ``current_time`` is not used.
        """
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        req = requests.get(csc_url)
        req.raise_for_status()
        CSC = req.text

        req = requests.get(upstream_url)
        req.raise_for_status()
        upstream = req.text

        if upstream == CSC:
            return True

        # Date example: Fedora-Rawhide-20220725.n.1
        # Characters 15..22 hold the YYYYMMDD part after "Fedora-Rawhide-".
        date_format = "%Y%m%d"
        try:
            # Parsing is what can raise ValueError, so it belongs in the try.
            CSC_date = datetime.strptime(CSC[15:23], date_format)
            upstream_date = datetime.strptime(upstream[15:23], date_format)
        except ValueError:
            return False

        lag = (upstream_date - CSC_date).total_seconds()
        return lag < data[project]["out_of_sync_interval"]

View File

@ -3,7 +3,35 @@ Contains FreeBSD class
"""
from project import Project
from shared import CSC_MIRROR
import requests
class FreeBSD(Project):
    """FreeBSD class"""

    # FreeBSD's time file has two segments, so we need a special function
    # Example TIMESTAMP file: 1648308600 2022-03-26T15:30:00 UTC
    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Downloads the TIMESTAMP file from the CSC mirror and from upstream.
        Returns True when both are identical, or when the upstream
        timestamp is less than ``out_of_sync_interval`` seconds ahead of
        ours; returns False otherwise or when a timestamp cannot be parsed.
        ``current_time`` is not used by this checker.
        """
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]

        req = requests.get(csc_url)
        req.raise_for_status()
        CSC = req.text

        req = requests.get(upstream_url)
        req.raise_for_status()
        upstream = req.text

        if upstream == CSC:
            return True

        try:
            return get_timestamp_for_freebsd(upstream) - get_timestamp_for_freebsd(CSC) < data[project]["out_of_sync_interval"]
        except ValueError:
            print("failed to parse freebsd")
            return False
def get_timestamp_for_freebsd(s: str) -> int:
    """Return the leading integer timestamp of a FreeBSD TIMESTAMP file.

    Example content: ``1648308600 2022-03-26T15:30:00 UTC``. The first
    whitespace-separated field is parsed, so a tab or newline after the
    timestamp is accepted as well as a space.

    Raises:
        ValueError: if the text is empty or its first field is not an
            integer.
    """
    fields = s.split(maxsplit=1)
    if not fields:
        raise ValueError("empty FreeBSD TIMESTAMP file")
    return int(fields[0])

View File

@ -3,7 +3,35 @@ Contains GentooDistfiles class
"""
from project import Project
from shared import CSC_MIRROR
import requests
from datetime import datetime
import time
class GentooDistfiles(Project):
    """GentooDistfiles class"""

    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Fetches the configured file from the CSC mirror and from upstream.
        Returns True when the two bodies match, or when the integer in the
        first 11 characters of the upstream body is less than
        ``out_of_sync_interval`` ahead of ours; returns False otherwise or
        when either prefix is not an integer. ``current_time`` is not used.
        """
        mirror_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        origin_url = data[project]["upstream"] + data[project]["file"]

        bodies = []
        for url in (mirror_url, origin_url):
            response = requests.get(url)
            response.raise_for_status()
            bodies.append(response.text)
        mirror_text, origin_text = bodies

        if origin_text == mirror_text:
            return True

        try:
            lag = int(origin_text[:11]) - int(mirror_text[:11])
        except ValueError:
            return False
        return lag < data[project]["out_of_sync_interval"]

View File

@ -18,7 +18,7 @@ class mxlinux(Project):
page = requests.get(data[project]["upstream"]).text
indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
m = re.search(r'(\d+ hour)|(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:]) # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
m = re.search(r'(\d+ minutes?)|(\d+ hours?)|(\d+(\.)?\d+ days?)', page[indexOfFile:])
duration = pd.to_timedelta(m.group(0))
data[project]["out_of_sync_since"] = current_time - duration.total_seconds()

View File

@ -19,7 +19,7 @@ class mxlinux_iso(Project):
page = requests.get(data[project]["upstream"]).text
indexOfFile = page.find("mirror.csclub.uwaterloo.ca")
m = re.search(r'(\d+ hour)|(\d+ hours)|(\d+(\.)?\d+ days)', page[indexOfFile:]) # solution from: https://stackoverflow.com/questions/21074100/how-to-convert-standard-timedelta-string-to-timedelta-object/21074460
m = re.search(r'(\d+ minutes?)|(\d+ hours?)|(\d+(\.)?\d+ days?)', page[indexOfFile:])
duration = pd.to_timedelta(m.group(0))

View File

@ -3,6 +3,31 @@ Contains nongnu class
"""
from project import Project
from shared import CSC_MIRROR
import requests
from datetime import datetime
import time
class nongnu(Project):
    """nongnu class"""

    @staticmethod
    def check(data, project, current_time):
        """Check if project packages are up-to-date

        Fetches the configured file from the CSC mirror and from upstream.
        Returns True when the two bodies match, or when the integer on the
        first line of the upstream body is less than
        ``out_of_sync_interval`` ahead of ours; returns False otherwise or
        when either first line is not an integer. ``current_time`` is not
        used.
        """
        mirror_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        origin_url = data[project]["upstream"] + data[project]["file"]

        bodies = []
        for url in (mirror_url, origin_url):
            response = requests.get(url)
            response.raise_for_status()
            bodies.append(response.text)
        mirror_text, origin_text = bodies

        if origin_text == mirror_text:
            return True

        origin_first_line, _, _ = origin_text.partition('\n')
        mirror_first_line, _, _ = mirror_text.partition('\n')
        try:
            lag = int(origin_first_line) - int(mirror_first_line)
        except ValueError:
            return False
        return lag < data[project]["out_of_sync_interval"]

View File

@ -23,7 +23,8 @@ class raspberrypi(Project):
for i in s.find_all("a"): # for a href directories
href = i.attrs['href']
if href.endswith("/") and href != "../" and href != "/":
# The raspberry pi server doesn't use a relative path to the parent directory
if href.endswith("/") and href != "../" and href != "/" and href != "/debian/":
site_next = site+href+"Release"
if site_next not in urls:
@ -44,6 +45,7 @@ class raspberrypi(Project):
cls.scrape(urls1, csc_url)
cls.scrape(urls2, upstream_url)
if (len(urls1) != len(urls2)):
return False
urls1.sort()

View File

@ -34,7 +34,7 @@ class slackware(Project):
hrefs2 = [i.attrs['href'] for i in s2.find_all("a")]
for href in hrefs1: # for a href directories
if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and not re.match(r'slackware-([1-7]|8\.0).*', href) and href != "slackware-iso/" and href != "slackware-current/" and href != "slackware-pre-1.0-beta/" and href != "unsupported/":
if href.endswith("/") and href != "../" and href != "/" and not href.startswith("/") and not re.match(r'slackware-([1-7]|8\.0).*', href) and href != "slackware-iso/" and href != "slackware-current/" and href != "slackware-pre-1.0-beta/" and href != "unsupported/" and not href.startswith("http"):
# print(href)
if href not in hrefs2:
return False

View File

@ -5,6 +5,7 @@ Contains ubuntu class
import os
from project import Project
from shared import CSC_MIRROR
from shared import NUM_UBUNTU_RELEASES
import requests
import datefinder # another date finding library
from datetime import timedelta
@ -17,4 +18,4 @@ class ubuntu(Project):
@staticmethod
def check(data, project, current_time):
page = requests.get(data[project]["upstream"]).text
return page.count("Up to date") == 21
return page.count("Up to date") == NUM_UBUNTU_RELEASES

View File

@ -5,6 +5,7 @@ Contains ubuntu_releases class
import os
from project import Project
from shared import CSC_MIRROR
from shared import NUM_UBUNTU_RELEASES
import requests
import datefinder # another date finding library
from datetime import timedelta
@ -17,10 +18,4 @@ class ubuntu_releases(Project):
@staticmethod
def check(data, project, current_time):
page = requests.get(data[project]["upstream"]).text
indexOfFile = page.find("last verified")
matches = list(datefinder.find_dates(page[indexOfFile:]))
date = matches[0].replace(tzinfo=None) # date is of type datetime.datetime
data[project]["out_of_sync_since"] = date.timestamp()
return(pd.to_datetime(current_time, unit='s') - date <= pd.to_timedelta(data[project]["out_of_sync_interval"], unit='s'))
# https://launchpad.net/ubuntu/+mirror/mirror.csclub.uwaterloo.ca-release
return page.count("Up to date") == NUM_UBUNTU_RELEASES

View File

@ -1,3 +1,4 @@
"""Contains shared constants"""
CSC_MIRROR = "http://mirror.csclub.uwaterloo.ca/"
NUM_UBUNTU_RELEASES = 18