mirror-checker/main.py

#!/usr/bin/env python3

"""
This mirror status checker determines whether the CSC mirror is up to date with upstream.
"""

import requests
from arch import Arch
from ceph import Ceph
from debian import Debian
from eclipse import Eclipse
from gnu import GNU
from kernel import Kernel
from openbsd import OpenBSD
from dateparser.search import search_dates # this library seems to be super slow but the other library: dateutil.parser gets some errors
# http://theautomatic.net/2018/12/18/2-packages-for-extracting-dates-from-a-string-of-text-in-python/
import re # import regular expressions to remove stray numbers in string that might interfere with date finding
import json # import json to read distro info stored in json file

import datefinder # another date finding library

def checker(directory_URL, file_name, timeout=30):
    """Return the first timestamp that follows file_name on the page at directory_URL.

    The page is downloaded, everything before the first occurrence of
    file_name is discarded, standalone numeric tokens (file sizes such as
    "4096" or "50kb") are stripped from the remainder, and the first date
    that datefinder recognises in what is left is returned.

    Args:
        directory_URL: URL to download. This is usually a directory listing,
            but any response body containing file_name followed by a date works.
        file_name: Text to locate in the page; the date search starts there.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled server would block the run forever.

    Returns:
        The date formatted as "%m/%d/%Y, %H:%M:%S", or the string
        'No dates found' when file_name is not on the page or no date
        follows it.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    page = requests.get(directory_URL, timeout=timeout).text

    indexOfFile = page.find(file_name)
    if indexOfFile == -1:
        # str.find reports "absent" as -1; slicing with it would silently
        # search only the last character of the page.
        return 'No dates found'

    # Remove stray numbers (file sizes in particular) that might interfere
    # with date finding. A single pattern handles both bare sizes ("4096")
    # and sizes with a unit ("50kb") because \w* may match nothing.
    segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[indexOfFile:])

    # NOTE: dateparser.search.search_dates was tried here as well, but some
    # dates did not parse correctly with it, so datefinder is used instead.
    # find_dates yields lazily; only the first date after the file name is
    # needed, so stop there instead of parsing the rest of the page.
    first_date = next(datefinder.find_dates(segment_clean), None)
    if first_date is None:
        return 'No dates found'

    # first_date is a datetime.datetime
    return first_date.strftime("%m/%d/%Y, %H:%M:%S")

if __name__ == "__main__":
    # Earlier class-based implementation, currently disabled and kept for reference:
    # for distro in [Arch, Ceph, Debian, Eclipse, GNU, Kernel, OpenBSD]:
    #     try:
    #         distro.print_output(distro.check())
    #     except requests.exceptions.RequestException as err:
    #         print(f"Error: {distro.name()}\n{err}")

    # The implementation that checks timestamps in each distro's mirror directory.
    # Each entry of distros.json is read as
    # [name, CSC mirror URL, official URL, file name to look for].
    with open('distros.json', encoding='utf-8') as distros_file:
        distros = json.load(distros_file)

    for distro in distros:
        name, csc_url, upstream_url, file_name = distro[:4]
        print(name + ":")
        try:
            print("CSC mirror: " + checker(csc_url, file_name))
            print("Official distro: " + checker(upstream_url, file_name))
        except requests.exceptions.RequestException as err:
            # One unreachable server should not abort the checks for the
            # remaining distros; report it and move on.
            print(f"Error: {name}\n{err}")

    # Notes / TODO:
    # - Eclipse: I couldn't find a directory on the official website, so as a
    #   temporary workaround the check peeks at another person's Eclipse mirror.
    # - This method doesn't just work for scraping HTML directories; it works on
    #   everything returned by a URL (websites, files storing a time).
    # - Checking Arch, Ceph and Eclipse is on hold: it is more proper to check
    #   them with an epoch conversion such as
    #   as.Date(17383, origin="1970-01-01"), where the date is stored in a file
    #   as a number. That requires another function and looping through
    #   different functions, so it is left for later.
    #   Corresponding JSON entries:
    #     ["Eclipse", "https://mirror.csclub.uwaterloo.ca/eclipse/", "http://eclipse.mirror.rafal.ca/", "TIME"],
    #     ["Ceph", "https://mirror.csclub.uwaterloo.ca/ceph/", "https://download.ceph.com/", "timestamp"],
    # - damnsmalllinux: unclear what to do; it seems like an abandoned project.
    #   Useful link: http://distro.ibiblio.org/damnsmall/
    # - The other function would probably probe mirror status sites like this
    #   one: http://mirrors.cpan.org/
    # - Idea: add a column to the JSON that specifies the method for checking a
    #   particular mirror; that identifier would select the function used to
    #   check for updates.
Created Arch mirror status checker 2021-08-16 18:23:04 -04:00			`#!/usr/bin/env python3`

			`"""`
			`This mirror status checker determines whether CSC mirror is up-to-date with upstream`
			`"""`

			`import requests`
Refactored code into classes and added .gitignore 2021-08-23 18:43:02 -04:00			`from arch import Arch`
Created Ceph mirror status checker 2021-08-24 18:37:27 -04:00			`from ceph import Ceph`
Refactored code into classes and added .gitignore 2021-08-23 18:43:02 -04:00			`from debian import Debian`
Created Eclipse mirror status checker 2021-08-24 17:00:51 -04:00			`from eclipse import Eclipse`
Created GNU mirror status checker 2021-08-24 17:34:39 -04:00			`from gnu import GNU`
Refactored code into classes and added .gitignore 2021-08-23 18:43:02 -04:00			`from kernel import Kernel`
			`from openbsd import OpenBSD`
added the implementation to check timestamps in distro mirror directory and store distro info in JSON files 2021-08-24 19:14:24 -04:00			`from dateparser.search import search_dates # this library seems to be super slow but the other library: dateutil.parser gets some errors`
			`# http://theautomatic.net/2018/12/18/2-packages-for-extracting-dates-from-a-string-of-text-in-python/`
			`import re # import regular expressions to remove stray numbers in string that might interfere with date finding`
			`import json # import json to read distro info stored in json file`

			`import datefinder # another date finding library`

added some distros 2021-09-12 01:06:55 -04:00			`# checker: gets the timestamp of the file inside the directory at the specified URL and returns it as a string`
added the implementation to check timestamps in distro mirror directory and store distro info in JSON files 2021-08-24 19:14:24 -04:00			`def checker(directory_URL, file_name):`
			`page = requests.get(directory_URL).text`
			`indexOfFile = page.find(file_name)`
			`# print(page)`

			`# remove stray numbers (file size numbers in particular) that might interfere with date finding`
			`segment_clean = re.sub(r'\s\d+\s', ' ', page[indexOfFile:]) # removes numbers for size`
			`segment_clean = re.sub(r'\s\d+\w*\s', ' ', page[indexOfFile:]) # removes numbers + size unit. e.x. 50kb`
			`# print(segment_clean)`

			`# implementation using dateparser.search.search_dates`
			`# notes: some dates don't parse correctly with this tool`
			`# print(search_dates(page[indexOfFile:], languages=['en']))`
			`# print(search_dates(page[indexOfFile:])[0])`

			`# finds the dates in the segment after the file name`
			`# notes: a generator will be returned by the datefinder module. I'm typecasting it to a list. Please read the note of caution provided at the bottom.`
			`matches = list(datefinder.find_dates(segment_clean))`
			`# print(matches)`

			`if len(matches) > 0:`
			`date = matches[0] # date is of type datetime.datetime`
			`return(date.strftime("%m/%d/%Y, %H:%M:%S"))`
			`else:`
			`return('No dates found')`
Created OpenBSD mirror status checker 2021-08-19 17:00:55 -04:00
Created Arch mirror status checker 2021-08-16 18:23:04 -04:00			`if __name__ == "__main__":`
fixed merge conflicts 2021-08-24 19:20:12 -04:00			`"""for distro in [Arch, Ceph, Debian, Eclipse, GNU, Kernel, OpenBSD]:`
Refactored code into classes and added .gitignore 2021-08-23 18:43:02 -04:00			`try:`
			`distro.print_output(distro.check())`
			`except requests.exceptions.RequestException as err:`
added the implementation to check timestamps in distro mirror directory and store distro info in JSON files 2021-08-24 19:14:24 -04:00			`print(f"Error: {distro.name()}\n{err}")"""`

added some distros 2021-09-12 01:06:55 -04:00			`# the implementation to check timestamps in distro mirror directory:`
added the implementation to check timestamps in distro mirror directory and store distro info in JSON files 2021-08-24 19:14:24 -04:00			`distros = json.load(open('distros.json',))`
added some distros 2021-09-12 01:06:55 -04:00			`# print(distros)`

added the implementation to check timestamps in distro mirror directory and store distro info in JSON files 2021-08-24 19:14:24 -04:00			`for distro in distros:`
			`print(distro[0] + ":")`
			`print("CSC mirror: " + checker(distro[1], distro[3]))`
added some distros 2021-09-12 01:06:55 -04:00			`print("Official distro: " + checker(distro[2], distro[3]))`
			`# for eclipse, i couldn't find a directory from the official website, so i decided to temporarily cheat a little bit and peek at another person's mirror for explipse`
			`# i just realized that my method doesn't just work for scraping html directories but everything returned by a URL (websites, files storing a time)`
			`# i think i'll put checking arch, ceph, ecplipse on hold because it's more proper to check them using as.Date(17383, origin="1970-01-01") where the date is stored in`
			`# a file as a number, but this would require another function and me looping through different functions. i think i'll do this later`
			`# corresponding jsons: ["Eclipse", "https://mirror.csclub.uwaterloo.ca/eclipse/", "http://eclipse.mirror.rafal.ca/", "TIME"],`
			`# ["Ceph", "https://mirror.csclub.uwaterloo.ca/ceph/", "https://download.ceph.com/", "timestamp"],`
			`# i don't know what to do with damnsmalllinux, it seems like an abandoned project. useful link: http://distro.ibiblio.org/damnsmall/`
			`# the other function is probably probing mirror status sites like this: http://mirrors.cpan.org/`
			`# maybe i can make a column in json that specifies the method for checking the particular mirror and that identifier will find a specific function to check the update`