mirror-checker/main.py

115 lines
4.2 KiB
Python
Raw Normal View History

2021-08-16 18:23:04 -04:00
#!/usr/bin/env python3
"""
This mirror status checker determines whether the CSC mirror is up to date with upstream.
"""
2021-09-01 23:25:06 -04:00
import time
import sys
2021-08-16 18:23:04 -04:00
import requests
2021-10-03 15:35:17 -04:00
from projects.almalinux import AlmaLinux
from projects.alpine import Alpine
from projects.apache import Apache
from projects.arch import Arch
from projects.centos import CentOS
from projects.ceph import Ceph
from projects.cpan import CPAN
from projects.cygwin import Cygwin
from projects.debian import Debian
from projects.debiancd import DebianCD
from projects.debianmultimedia import DebianMultimedia
from projects.debianports import DebianPorts
from projects.debiansecurity import DebianSecurity
from projects.eclipse import Eclipse
from projects.fedora import Fedora
from projects.freebsd import FreeBSD
from projects.gentoodistfiles import GentooDistfiles
from projects.gentooportage import GentooPortage
from projects.gnome import GNOME
from projects.gnu import GNU
from projects.gutenberg import Gutenberg
from projects.ipfire import IPFire
from projects.kde import KDE
from projects.kdeapplicationdata import KDEApplicationData
from projects.kernel import Kernel
from projects.openbsd import OpenBSD
from shared import CSC_MIRROR
2021-10-03 15:35:17 -04:00
from dateparser.search import search_dates # this library seems to be super slow but the other library: dateutil.parser gets some errors
# http://theautomatic.net/2018/12/18/2-packages-for-extracting-dates-from-a-string-of-text-in-python/
2021-10-03 15:35:17 -04:00
import re # import regular expressions to remove stray numbers in string that might interfere with date finding
2021-10-03 15:44:08 -04:00
import json # import json to read project info stored in json file
2021-10-03 15:35:17 -04:00
import datefinder # another date finding library
def checker(directory_URL, file_name, timeout=30):
    """Return the first date that follows ``file_name`` on a directory page.

    Downloads the page at ``directory_URL``, locates the first occurrence of
    ``file_name`` in its text, and parses the first date found after it.

    Args:
        directory_URL: URL of the directory listing page to download.
        file_name: Name of the file whose listing entry should be inspected.
        timeout: Seconds to wait on the server, passed straight to
            ``requests.get``. ``None`` waits indefinitely.

    Returns:
        The date formatted as ``"%m/%d/%Y, %H:%M:%S"``, or the string
        ``'No dates found'`` when ``file_name`` is not on the page or no
        date can be found after it.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched.
    """
    page = requests.get(directory_URL, timeout=timeout).text

    file_index = page.find(file_name)
    if file_index == -1:
        # str.find() reports "not found" as -1; slicing from -1 would
        # silently inspect the last character of the page instead.
        return 'No dates found'

    # Remove stray numbers, with or without a unit suffix (e.g. "4096" or
    # "50kb"), because file sizes might interfere with date finding.
    segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:])

    # NOTE: dateparser.search.search_dates was tried here as well, but some
    # dates do not parse correctly with it, so datefinder is used instead.
    # find_dates() yields matches lazily; only the first one after the file
    # name is needed, so the rest of the page is never parsed.
    first_match = next(iter(datefinder.find_dates(segment_clean)), None)
    if first_match is None:
        return 'No dates found'

    # first_match is a datetime.datetime
    return first_match.strftime("%m/%d/%Y, %H:%M:%S")
2021-08-19 17:00:55 -04:00
2021-08-16 18:23:04 -04:00
def main():
    """Check the requested projects and print one status line for each.

    Project state is loaded from ``data.json``. When stdin is a terminal,
    every project in that file is checked; otherwise project names are read
    from stdin, one per line. For each project the class of the same name in
    this module is looked up and its ``check`` callable is invoked.

    A project other than CPAN is only reported as out-of-sync once it has
    been failing for longer than its ``out_of_sync_interval`` (in seconds,
    compared against ``time.time()``); the time of the first failure is kept
    in ``out_of_sync_since``. The updated state is written back to
    ``data.json`` at the end of the run.
    """
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    if sys.stdin.isatty():
        projects = data
    else:
        # Ignore blank lines (e.g. a trailing newline in piped input) so they
        # are not reported as non-existent projects.
        stripped = (line.rstrip() for line in sys.stdin)
        projects = [name for name in stripped if name]

    current_time = int(time.time())
    this_module = sys.modules[__name__]

    for project in projects:
        if project not in data:
            print(f"Failure: {project} does not exist")
            continue

        # A project can be listed in data.json without a checker class being
        # imported here; report it instead of aborting the whole run with an
        # AttributeError before data.json is saved.
        project_class = getattr(this_module, project, None)
        check = getattr(project_class, "check", None)
        if not callable(check):
            print(f"Failure: {project} has no checker")
            continue

        try:
            if project == "CPAN":
                # CPAN's checker is handed the current time and its verdict
                # is reported directly, without the grace-period bookkeeping.
                up_to_date = check(data, project, current_time)
            else:
                in_sync = check(data, project)
                state = data[project]
                if in_sync:
                    state["out_of_sync_since"] = None
                    up_to_date = True
                elif state["out_of_sync_since"] is None:
                    # First failed check: start the grace period.
                    state["out_of_sync_since"] = current_time
                    up_to_date = True
                else:
                    # Still failing: only a failure once the grace period
                    # has been exceeded.
                    elapsed = current_time - state["out_of_sync_since"]
                    up_to_date = elapsed <= state["out_of_sync_interval"]
        except requests.exceptions.RequestException as err:
            print(f"Error: {project}\n{err}")
            continue

        if up_to_date:
            print(f"Success: {project} up-to-date")
        else:
            print(f"Failure: {project} out-of-sync")

    with open("data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)


if __name__ == "__main__":
    main()