# mirror-checker/test.py

from bs4 import BeautifulSoup
import requests
import datefinder  # another date finding library
import re
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
import re # for salt stack specifically
from projects import trisquel
import json  # import json to read project info stored in json file

# The crawler below brute-forces its way through the whole directory tree, checking dates.
# It may sound horrible, but for certain distros I believe it is indeed the best solution.

# Accumulator for every directory URL discovered while crawling.
urls = list()

# Prefix combined with each href to recognise links to directories already recorded in `urls`.
home_site = "https://cdimage.ubuntu.com"

# function created
def scrape(site):
    """Recursively record every sub-directory URL reachable from *site*.

    Downloads the page at ``site`` and inspects each ``<a>`` element. Every
    link that ends with ``/`` (other than ``../`` and ``/``) is treated as a
    sub-directory: ``site + href`` is appended to the module-level ``urls``
    list, printed, and then crawled in turn.

    Args:
        site: URL of the directory listing to crawl. Child URLs are built by
            plain concatenation (``site + href``), so it presumably needs to
            end with ``/``.

    Returns:
        None. Results accumulate in the module-level ``urls`` list.
    """
    # getting the request from url
    r = requests.get(site)

    # converting the text
    s = BeautifulSoup(r.text, "html.parser")

    # salt stack specific code
    # s = s.find("div", {"id": "listing"})
    # print(s)

    for i in s.find_all("a"):  # for a href directories
        # Anchors without an href attribute (e.g. <a name="top">) carry no
        # link to follow; skip them instead of failing with KeyError.
        href = i.get("href")
        if href is None:
            continue

        # Only directory-looking links are of interest; "../" and "/" lead
        # back up the tree.
        if not href.endswith("/") or href in ("../", "/"):
            continue

        if home_site + href in urls:  # avoids the link to parent directory
            continue

        # netbsd specific code
        # if href == "//ftp.netbsd.org/":
        #     continue
        site_next = site + href

        if site_next not in urls:
            urls.append(site_next)
            print(site_next)
            # calling itself
            scrape(site_next)

def get_latest_date(web_dir):
    """Return the most recent timestamp found in the page at *web_dir*.

    The page text is searched for timestamps shaped like
    ``04-Oct-2021 01:18`` or ``2021-10-04 01:18``. Each hit is parsed with
    ``datefinder`` and the maximum of the parsed values is returned.

    Args:
        web_dir: URL of the page to download and scan.

    Returns:
        The latest ``datetime`` found, or ``datetime(1000, 1, 1)`` when the
        page contains no parseable timestamp.
    """
    page = requests.get(web_dir).text

    # The pattern has two capture groups, so findall yields (dmy, ymd) tuples
    # in which exactly one member is non-empty for each match.
    str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)

    dates = []
    for dmy, ymd in str_dates:
        # Use whichever alternative matched; reading only the second group
        # would hand datefinder an empty string for "dd-Mon-yyyy" hits.
        parsed = list(datefinder.find_dates(dmy or ymd))
        # A regex hit that datefinder cannot interpret is skipped.
        if parsed:
            dates.append(parsed[0])

    if not dates:
        # return ridiculously old date to discard this entry, since it has no dates
        return datetime(1000, 1, 1)
    return max(dates)
   
# main function
if __name__ == "__main__":
    # Read the project info from data.json and print the trisquel check result.
    with open("data.json", mode="r", encoding="utf-8") as handle:
        project_info = json.load(handle)
        outcome = trisquel.check(project_info, "trisquel")
        print(outcome)

    # The brute-force crawl driver below is disabled: it is an inert string literal.
    """# website to be scrape
    site="https://cdimage.ubuntu.com/releases/"
    # works on: 
    #           https://www.x.org/releases/

    # calling function
    scrape(site)

    latest_date = get_latest_date(urls[0])
    # get_latest_date(urls[0])
    for dir in urls:
        latest_date2 = get_latest_date(dir)
        if (latest_date2 >= latest_date):
            latest_date = latest_date2

    print(latest_date)"""
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00			`from bs4 import BeautifulSoup`
			`import requests`
			`import datefinder # another date finding library`
			`import re`
			`from datetime import datetime`
			`from datetime import timedelta`
			`import time`
			`import pandas as pd`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`import re # for salt stack specifically`
updated trisquel 2021-10-17 20:59:35 -04:00			`from projects import trisquel`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`import json # import json to read project info stored in json file`
added mxlinux, mxlinux-iso 2021-10-04 02:47:55 -04:00
			`# this function is brute force looping through the whole directory and checking dates`
			`# it may sound horrible, but for certain distros, i believe it's indeed the best solution`

added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00			`# lists`
			`urls=[]`

added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`home_site = "https://cdimage.ubuntu.com"`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00
			`# function created`
			`def scrape(site):`

			`# getting the request from url`
			`r = requests.get(site)`

			`# converting the text`
			`s = BeautifulSoup(r.text,"html.parser")`

added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`# salt stack specific code`
			`# s = s.find("div", {"id": "listing"})`
			`# print(s)`

added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00			`for i in s.find_all("a"): # for a href directories`
			`href = i.attrs['href']`

			`if href.endswith("/") and href != "../" and href != "/":`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`if home_site+href in urls: # avoids the link to parent directory`
nongnu added 2021-10-04 03:49:24 -04:00			`continue`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`"""if href == "//ftp.netbsd.org/": # netbsd specific code`
			`continue"""`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00			`site_next = site+href`

			`if site_next not in urls:`
			`urls.append(site_next)`
			`print(site_next)`
			`# calling it self`
			`scrape(site_next)`

			`def get_latest_date(web_dir):`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`page = requests.get(web_dir).text`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00
added mxlinux, mxlinux-iso 2021-10-04 02:47:55 -04:00			`str_dates = re.findall(r'(\d{2}-\w{3}-\d{4} \d{2}:\d{2})\|(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', page)`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`# print(str_dates[0])`
			`dates = [list(datefinder.find_dates(date[1]))[0] for date in str_dates]`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00
			`# for date in dates:`
			`# print(date)`

added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`if len(dates) == 0:`
			`return datetime(1000, 1, 1) # return ridiculously old date to discard this entry, since it has no dates`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00			`return(max(dates))`

			`# main function`
			`if __name__ =="__main__":`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`with open("data.json", "r", encoding="utf-8") as file:`
			`data = json.load(file)`
updated trisquel 2021-10-17 20:59:35 -04:00			`print(trisquel.check(data, "trisquel"))`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00
added pkgsrc qtproject raspbian slackware trisquel ubuntu-ports ubuntu-releases 2021-10-04 06:36:22 -04:00			`"""# website to be scrape`
added linuxmint, linuxmint-packages, raspberry pi, ubuntu-ports-releases, and xubuntu-releases 2021-10-14 21:22:39 -04:00			`site="https://cdimage.ubuntu.com/releases/"`
			`# works on:`
			`# https://www.x.org/releases/`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00
			`# calling function`
nongnu added 2021-10-04 03:49:24 -04:00			`scrape(site)`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00
nongnu added 2021-10-04 03:49:24 -04:00			`latest_date = get_latest_date(urls[0])`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00			`# get_latest_date(urls[0])`
nongnu added 2021-10-04 03:49:24 -04:00			`for dir in urls:`
			`latest_date2 = get_latest_date(dir)`
			`if (latest_date2 >= latest_date):`
			`latest_date = latest_date2`
added mxlinux, mxlinux-iso 2021-10-04 02:47:55 -04:00
added pkgsrc qtproject raspbian slackware trisquel ubuntu-ports ubuntu-releases 2021-10-04 06:36:22 -04:00			`print(latest_date)"""`
added manjaro and fixed ubuntu 2021-10-04 01:18:07 -04:00