from bs4 import BeautifulSoup
import requests
import datefinder  # third-party library for extracting dates from free-form text
import re
from datetime import datetime

from project import Project
from shared import CSC_MIRROR

# This checker brute-force loops through the whole directory tree and compares
# dates. That may sound horrible, but for certain distros I believe it's
# indeed the best solution.
class xiph(Project):
    """xiph class"""

    @staticmethod
    def scrape(releases, site):
        # fetch the directory listing from the url
        r = requests.get(site)
        # parse the returned HTML
        s = BeautifulSoup(r.text, "html.parser")
        # collect every <a href> that points at a subdirectory
        for i in s.find_all("a"):
            href = i.attrs['href']
            if (href.endswith("/") and href != "../" and href != "/"
                    and href != "/pub/xiph/" and not href.startswith("http://")):
                if href not in releases:
                    releases.append(href)
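
    # A minimal usage sketch (hedged: the URL and the resulting names are
    # illustrative and depend on the mirror's autoindex markup):
    #
    #   releases = []
    #   xiph.scrape(releases, "https://downloads.xiph.org/releases/")
    #   # releases might now look like ["ao/", "flac/", "ogg/", "vorbis/"]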

    @staticmethod
    def get_latest_date(web_dir):
        page = requests.get(web_dir).text
        str_dates = re.findall(
            r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})',
            page)
        # note: with two alternated groups, findall returns a tuple per match;
        # one group holds the date string and the other is empty
        if len(str_dates) == 0:
            # return a ridiculously old date to discard this entry,
            # since it has no dates
            return datetime(1000, 1, 1)
        dates = [list(datefinder.find_dates("".join(date)))[0]
                 for date in str_dates]
        return max(dates)
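
    # For reference, the two regex alternates above target listing rows of the
    # two common autoindex date styles (sample rows are illustrative only):
    #
    #   vorbis/    03-Nov-2020 17:44    -    matches \d{2}-\w{3}-\d{4} \d{2}:\d{2}
    #   vorbis/    2020-11-03 17:44     -    matches \d{4}-\d{2}-\d{2} \d{2}:\d{2}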

    @staticmethod
    def get_checksum_date(directory_URL):
        page = requests.get(directory_URL).text
        file_index = page.find("SUMS.txt")
        # remove stray numbers (file size fields in particular) that might
        # interfere with date finding
        segment_clean = re.sub(r'\s\d+\s', ' ', page[file_index:])  # bare sizes
        segment_clean = re.sub(r'\s\d+\w*\s', ' ', segment_clean)   # sizes with a unit, e.g. 50kb
        # datefinder returns a generator; typecast it to a list so the first
        # date found after the file name can be indexed
        matches = list(datefinder.find_dates(segment_clean))
        return matches[0]
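
    # Worked example of the cleanup above (the listing row is made up):
    #
    #   "SUMS.txt  1024  03-Nov-2020 17:44"
    #   -> r'\s\d+\s' strips the bare size field: "SUMS.txt 03-Nov-2020 17:44"
    #   -> datefinder then yields datetime(2020, 11, 3, 17, 44) as matches[0]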

    @classmethod
    def compare_release(cls, csc_dir, upstream_dir):
        page = requests.get(upstream_dir).text
        file_index = page.find("SUMS.txt")
        if file_index == -1:
            # no checksum file upstream: fall back to the newest listing date
            return cls.get_latest_date(csc_dir) == cls.get_latest_date(upstream_dir)
        else:
            # compare the timestamps attached to the SUMS.txt files
            return cls.get_checksum_date(csc_dir) == cls.get_checksum_date(upstream_dir)

    @classmethod
    def check_mirror(cls, csc_url, upstream_url, releases):
        compare = []
        for release in releases:
            compare.append(cls.compare_release(csc_url + release, upstream_url + release))
        return all(compare)

    @classmethod
    def check(cls, data, project, current_time):
        """Check if project packages are up-to-date"""
        releases1 = []
        releases2 = []
        csc_url = CSC_MIRROR + data[project]["csc"] + data[project]["file"]
        upstream_url = data[project]["upstream"] + data[project]["file"]
        # collect the release subdirectories on each side
        cls.scrape(releases1, csc_url)
        cls.scrape(releases2, upstream_url)
        if set(releases1) != set(releases2):
            return False
        return cls.check_mirror(csc_url, upstream_url, releases2)
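

# A minimal driver sketch, assuming the data layout that check() reads
# ("csc", "upstream", "file" keys); the values below are illustrative guesses,
# not the project's real config.
if __name__ == "__main__":
    data = {
        "xiph": {
            "csc": "xiph/",                             # path under CSC_MIRROR (assumed)
            "upstream": "https://downloads.xiph.org/",  # upstream base URL (assumed)
            "file": "releases/",                        # subtree to compare (assumed)
        }
    }
    print(xiph.check(data, "xiph", datetime.now()))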