#!/usr/bin/env python3

"""
This mirror status checker determines whether CSC mirror is up-to-date with upstream
"""

import time
import sys
import requests
# The star import exposes the per-project checker classes as attributes of
# this module; main() looks them up by name with getattr().
from projects import *  # noqa
# Date-finding library notes: dateparser.search.search_dates was tried but is
# very slow and does not parse some dates correctly, and dateutil.parser
# raised errors, so datefinder is used instead. See
# http://theautomatic.net/2018/12/18/2-packages-for-extracting-dates-from-a-string-of-text-in-python/
import re  # regular expressions, used to strip stray numbers that interfere with date finding
import json  # project info and sync state are stored in a JSON file
import datefinder  # date-finding library


def checker(directory_URL, file_name, timeout=30):
    """Return the timestamp shown for ``file_name`` in a directory listing.

    The page at ``directory_URL`` is downloaded, and the first date that
    appears in the text following the first occurrence of ``file_name`` is
    returned.

    Args:
        directory_URL: URL of the directory listing page to fetch.
        file_name: Name of the file whose timestamp is wanted.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Without a timeout a stalled server would block
            the checker forever.

    Returns:
        The date formatted as ``"%m/%d/%Y, %H:%M:%S"``, or the string
        ``'No dates found'`` when the file name is not on the page or no
        date follows it.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including on timeout).
    """
    page = requests.get(directory_URL, timeout=timeout).text

    file_index = page.find(file_name)
    if file_index == -1:
        # str.find() signals "not found" with -1; slicing with that would
        # silently scan only the last character of the page.
        return 'No dates found'

    # Remove stray numbers (file sizes in particular, with or without a unit
    # suffix such as "50kb") that might interfere with date finding. The
    # ``\w*`` also matches an empty suffix, so bare numbers are covered too.
    segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:])

    # datefinder returns a generator; only the first date after the file name
    # is needed, so take that one instead of parsing every date on the page.
    first_match = next(iter(datefinder.find_dates(segment_clean)), None)
    if first_match is None:
        return 'No dates found'

    # first_match is a datetime.datetime
    return first_match.strftime("%m/%d/%Y, %H:%M:%S")


def main():
    """Check the selected projects and record their sync state in data.json.

    When stdin is a terminal every project listed in ``data.json`` is
    checked; otherwise project names are read from stdin, one per line.
    One ``Success:``, ``Failure:`` or ``Error:`` line is printed per project,
    and the (possibly updated) state is written back to ``data.json``.
    """
    with open("data.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    if sys.stdin.isatty():
        projects = data
    else:
        projects = [project.rstrip() for project in sys.stdin.readlines()]

    current_time = int(time.time())
    this_module = sys.modules[__name__]

    for project in projects:
        try:
            if project not in data:
                print(f"Failure: {project} does not exist")
                continue

            project_class = getattr(this_module, project)

            # CPAN's check takes the current time and its result is reported
            # directly, without the out_of_sync_since bookkeeping below.
            if project == "CPAN":
                checker_result = project_class.check(data, project, current_time)
                if checker_result:
                    print(f"Success: {project} up-to-date")
                else:
                    print(f"Failure: {project} out-of-sync")
                continue

            checker_result = project_class.check(data, project)
            state = data[project]
            if checker_result:
                state["out_of_sync_since"] = None
            elif state["out_of_sync_since"] is None:
                # First failed check: start the clock, but still report
                # success until out_of_sync_interval has elapsed.
                state["out_of_sync_since"] = current_time
            elif (current_time - state["out_of_sync_since"]
                    > state["out_of_sync_interval"]):
                print(f"Failure: {project} out-of-sync")
                continue
            print(f"Success: {project} up-to-date")
        except requests.exceptions.RequestException as err:
            print(f"Error: {project}\n{err}")

    with open("data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent='\t')


if __name__ == "__main__":
    main()