mirror-checker/main.py

#!/usr/bin/env python3

"""
This mirror status checker determines whether the CSC mirror is up-to-date with upstream.
"""

import time
import sys
import requests

from projects import * # noqa
# from dateparser.search import search_dates  # this library seems to be super slow but the other library: dateutil.parser gets some errors
# http://theautomatic.net/2018/12/18/2-packages-for-extracting-dates-from-a-string-of-text-in-python/
import re  # import regular expressions to remove stray numbers in string that might interfere with date finding
import json  # import json to read project info stored in json file

import datefinder  # another date finding library


# checker: gets the timestamp of the file inside the directory at the specified URL and returns it as a string
def checker(directory_URL, file_name):
	"""Return the timestamp listed after ``file_name`` on a directory index page.

	Downloads the page at ``directory_URL``, takes the text from the first
	occurrence of ``file_name`` onwards, and returns the first date that
	datefinder recognises in it.

	Args:
		directory_URL: URL of the directory listing to download.
		file_name: Name of the file whose entry should be located on the page.

	Returns:
		The date formatted as ``"%m/%d/%Y, %H:%M:%S"``, or the string
		``'No dates found'`` when ``file_name`` is not on the page or no date
		is recognised after it.

	Raises:
		requests.exceptions.RequestException: Propagated from ``requests.get``
			when the page cannot be fetched.
	"""
	page = requests.get(directory_URL).text
	file_index = page.find(file_name)
	if file_index == -1:
		# str.find returns -1 on a miss; slicing with it would silently scan
		# only the last character of the page instead of a listing entry.
		return 'No dates found'

	# Drop whitespace-delimited numbers, optionally followed by a unit suffix
	# (e.g. a "50kb" size column), because they might interfere with date
	# finding. This single pattern also covers bare numbers, since the
	# suffix part may be empty.
	segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[file_index:])

	# Only the first date after the file name is needed, so stop at the first
	# match instead of materialising every date found on the rest of the page.
	first_match = next(iter(datefinder.find_dates(segment_clean)), None)
	if first_match is None:
		return 'No dates found'
	# first_match is a datetime.datetime
	return first_match.strftime("%m/%d/%Y, %H:%M:%S")


if __name__ == "__main__":
	# Entry point: check each requested project and record its sync state.
	#
	# Project names are read from stdin, one per line, when stdin is not a
	# terminal; otherwise every project listed in data.json is checked. One
	# "Success:", "Failure:" or "Error:" line is printed per project, and
	# data.json is rewritten afterwards with the updated state.

	# Projects whose check() also receives the current Unix time. Their result
	# is reported directly, without the out_of_sync_since bookkeeping below.
	time_aware_projects = {
		"CPAN",
		"ubuntu",
		"ubuntu_releases",
		"manjaro",
		"mxlinux",
		"mxlinux_iso",
		"slackware",
		"trisquel",
	}

	# Read the state up front and close the file straight away, so the handle
	# is not held open while the (potentially slow) network checks run.
	with open("data.json", "r", encoding="utf-8") as file:
		data = json.load(file)

	if sys.stdin.isatty():
		projects = data
	else:
		projects = [project.rstrip() for project in sys.stdin.readlines()]

	current_time = int(time.time())
	this_module = sys.modules[__name__]
	try:
		for project in projects:
			try:
				if project not in data:
					print(f"Failure: {project} does not exist")
					continue
				# Project checker classes are looked up by name in this
				# module's namespace (populated by the star import above).
				project_class = getattr(this_module, project)

				if project in time_aware_projects:
					if project_class.check(data, project, current_time):
						print(f"Success: {project} up-to-date")
					else:
						print(f"Failure: {project} out-of-sync")
					continue

				state = data[project]
				if project_class.check(data, project):
					state["out_of_sync_since"] = None
				elif state["out_of_sync_since"] is None:
					# First failed check: start the clock, but still report
					# success until the allowed interval has elapsed.
					state["out_of_sync_since"] = current_time
				elif current_time - state["out_of_sync_since"] \
						> state["out_of_sync_interval"]:
					print(f"Failure: {project} out-of-sync")
					continue
				print(f"Success: {project} up-to-date")
			except requests.exceptions.RequestException as err:
				print(f"Error: {project}\n{err}")
	finally:
		# Persist whatever state was updated, even if an unexpected exception
		# from one project's checker aborts the run part-way through.
		with open("data.json", "w", encoding="utf-8") as file:
			json.dump(data, file, indent='\t')