Added an implementation that checks timestamps in the distro mirror directories and stores distro info in JSON files

This commit is contained in:
Tom 2021-08-24 16:14:24 -07:00
parent 908d7f200e
commit a526e3ea26
2 changed files with 47 additions and 2 deletions

5
distros.json Normal file
View File

@ -0,0 +1,5 @@
[
["OpenBSD", "https://mirror.csclub.uwaterloo.ca/OpenBSD/", "https://ftp.openbsd.org/pub/OpenBSD/", "timestamp"],
["kernel", "http://mirror.csclub.uwaterloo.ca/kernel.org/linux/kernel/next/", "https://mirrors.edge.kernel.org/pub/linux/kernel/next/", "sha256sums.asc"],
["debian", "http://mirror.csclub.uwaterloo.ca/debian/project/trace/", "https://ftp-master.debian.org/debian/project/trace/", "master"]
]

44
main.py
View File

@ -9,10 +9,50 @@ from arch import Arch
from debian import Debian from debian import Debian
from kernel import Kernel from kernel import Kernel
from openbsd import OpenBSD from openbsd import OpenBSD
from dateparser.search import search_dates # this library seems to be super slow but the other library: dateutil.parser gets some errors
# http://theautomatic.net/2018/12/18/2-packages-for-extracting-dates-from-a-string-of-text-in-python/
import re # import regular expressions to remove stray numbers in string that might interfere with date finding
import json # import json to read distro info stored in json file
import datefinder # another date finding library
def checker(directory_URL, file_name):
    """Return the first date listed after ``file_name`` on a directory page.

    Downloads the page at ``directory_URL``, locates the first occurrence of
    ``file_name`` in its text, and scans the text from that point onward for
    a date.

    Args:
        directory_URL: URL of the directory listing to download.
        file_name: Name of the file whose listing entry should be inspected.

    Returns:
        The first date found, formatted as ``"%m/%d/%Y, %H:%M:%S"``, or the
        string ``'No dates found'`` when ``file_name`` does not appear on the
        page or no date is found after it.
    """
    page = requests.get(directory_URL).text
    index_of_file = page.find(file_name)
    if index_of_file == -1:
        # str.find() reports a miss as -1; slicing with -1 would silently
        # scan only the last character of the page instead of the listing.
        return 'No dates found'

    # Remove stray numbers (file sizes in particular) that might interfere
    # with date finding. Because ``\w*`` may match nothing, this one pattern
    # covers both bare sizes ("1234") and sizes with a unit ("50kb").
    segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[index_of_file:])

    # NOTE: dateparser.search.search_dates was tried here as well, but some
    # dates did not parse correctly with it, so datefinder is used instead.
    # find_dates() yields matches lazily; only the first one is needed, so
    # stop there rather than parsing every date on the rest of the page.
    date = next(iter(datefinder.find_dates(segment_clean)), None)
    if date is None:
        return 'No dates found'
    # date is a datetime.datetime
    return date.strftime("%m/%d/%Y, %H:%M:%S")
if __name__ == "__main__":
    # Each entry read from distros.json is indexed below as:
    #   [0] display name, [1] CSC mirror URL, [2] official URL,
    #   [3] file name to look up on both directory pages.
    # Use a context manager so the file handle is closed after parsing.
    with open('distros.json', encoding='utf-8') as distros_file:
        distros = json.load(distros_file)
    print(distros)
    for distro in distros:
        name = distro[0]
        print(name + ":")
        try:
            print("CSC mirror: " + checker(distro[1], distro[3]))
            print("Official distro: " + checker(distro[2], distro[3]))
        except requests.exceptions.RequestException as err:
            # Report the failure and keep going so one unreachable mirror
            # does not abort the checks for the remaining distros.
            print(f"Error: {name}\n{err}")