diff --git a/merlin/config/config.go b/merlin/config/config.go
index 0066b37..abc5c26 100644
--- a/merlin/config/config.go
+++ b/merlin/config/config.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"gopkg.in/ini.v1"
 
@@ -139,6 +140,8 @@ type Repo struct {
 	PasswordFile string `ini:"password_file"`
 	// the archive name for the ftpsync command (required for csc-sync-ftpsync)
 	FtpsyncArchive string `ini:"ftpsync_archive"`
+	// the sync command to execute with `sh -c` (required for csc-sync-exec)
+	ExecStr string `ini:"exec"`
 	// full path to file storing the repo sync state
 	StateFile string `ini:"-"`
 	// full path for file storing general logging of this repo
@@ -258,11 +261,23 @@ func LoadConfig(configPath string, doneChan chan SyncResult, stopChan chan struc
 		panic("Missing or invalid frequency for " + repo.Name)
 	} else if repo.SyncType == "" {
 		panic("Missing sync type from " + repo.Name)
+	} else if repo.SyncType == "csc-sync-ftpsync" {
+		if repo.FtpsyncArchive == "" {
+			panic("Missing ftpsync archive for " + repo.Name)
+		}
+	} else if repo.SyncType == "csc-sync-exec" {
+		if repo.ExecStr == "" {
+			panic("Missing 'exec' for " + repo.Name)
+		}
+		// Allow some limited placeholders
+		repo.ExecStr = strings.ReplaceAll(repo.ExecStr, "{{repoLogFile}}", repo.RepoLogFile)
+		repo.ExecStr = strings.ReplaceAll(repo.ExecStr, "{{rsyncLogFile}}", repo.RsyncLogFile)
+		if strings.Contains(repo.ExecStr, "{{") {
+			panic(repo.Name + " has unsatisfied placeholders in 'exec': " + repo.ExecStr)
+		}
 	} else if repo.LocalDir == "" {
 		panic("Missing local download location for " + repo.Name)
-	} else if repo.SyncType == "csc-sync-ftpsync" && repo.FtpsyncArchive == "" {
-		panic("Missing ftpsync archive for " + repo.Name)
-	} else if repo.SyncType != "csc-sync-ftpsync" && repo.SyncType != "csc-sync-fedora" && repo.RsyncHost == "" {
+	} else if repo.RsyncHost == "" {
 		panic("Missing rsync host for " + repo.Name)
 	}
diff --git a/merlin/merlin-config.ini b/merlin/merlin-config.ini
index d10edec..bcb3e77 100644
--- a/merlin/merlin-config.ini
+++ b/merlin/merlin-config.ini
@@ -279,8 +279,11 @@ rsync_dir = ftp.freebsd.org/pub/FreeBSD/
 ; This handles both fedora/linux and fedora/epel
 ; See ~/quick-fedora-mirror/quick-fedora-mirror.conf
 [fedora]
-sync_type = csc-sync-fedora
+sync_type = csc-sync-exec
 frequency = bi-hourly
+; See ~/quick-fedora-mirror/quick-fedora-mirror.conf
+; LOGFILE is set to {{repoLogFile}} (~/merlin/log/fedora.log)
+exec = truncate --size=0 {{repoLogFile}}; cd ~/quick-fedora-mirror && ./quick-fedora-mirror > {{rsyncLogFile}}
 
 [ubuntu-ports-releases]
 sync_type = csc-sync-standard
@@ -396,12 +399,9 @@ rsync_host = rsync.alpinelinux.org
 rsync_dir = alpine
 
 [raspbian]
-;verbose = true
-sync_type = csc-sync-standard
+sync_type = csc-sync-exec
 frequency = bi-hourly
-local_dir = raspbian
-rsync_host = raspbian.freemirror.org
-rsync_dir = raspbian
+exec = cd /mirror/root/raspbian && mkdir -p .~tmp~ && ~/raspbmirror/raspbmirror.py --tmpdir ./.~tmp~ --sourcepool /mirror/root/debian/pool > {{rsyncLogFile}}
 
 [raspberrypi]
 sync_type = csc-sync-standard-ipv6
diff --git a/merlin/sync/sync.go b/merlin/sync/sync.go
index 0959b83..b99b0fb 100644
--- a/merlin/sync/sync.go
+++ b/merlin/sync/sync.go
@@ -55,8 +55,8 @@ func getSyncCommand(repo *config.Repo) (cmds [][]string) {
 		return append(cmds, cscSyncDebianStep1(repo), cscSyncDebianStep2(repo))
 	case "csc-sync-debian-cd":
 		return append(cmds, cscSyncDebianCD(repo))
-	case "csc-sync-fedora":
-		return append(cmds, cscSyncFedora(repo))
+	case "csc-sync-exec":
+		return append(cmds, cscSyncExec(repo))
 	case "csc-sync-ftpsync":
 		return append(cmds, cscSyncFtpsync(repo))
 	case "csc-sync-s3":
@@ -293,14 +293,8 @@ func cscSyncDebianCD(repo *config.Repo) []string {
 	return args
 }
 
-func cscSyncFedora(repo *config.Repo) []string {
-	// Make sure that repo.RepoLogFile (default: ~/merlin/log/fedora.log)
-	// is the same as the LOGFILE setting in ~/quick-fedora-mirror/quick-fedora-mirror.conf
-	return []string{
-		"sh", "-c",
-		"truncate --size=0 " + repo.RepoLogFile + "; " +
-			"cd ~/quick-fedora-mirror && ./quick-fedora-mirror > " + repo.RsyncLogFile,
-	}
+func cscSyncExec(repo *config.Repo) []string {
+	return []string{"sh", "-c", repo.ExecStr}
 }
 
 func cscSyncFtpsync(repo *config.Repo) []string {
diff --git a/raspbmirror/README.md b/raspbmirror/README.md
new file mode 100644
index 0000000..b334a07
--- /dev/null
+++ b/raspbmirror/README.md
@@ -0,0 +1,5 @@
+This script is to be used for Raspbian, which is NOT the same as Raspberry Pi OS.
+
+It was downloaded from [here](https://raw.githubusercontent.com/plugwash/raspbian-tools/master/raspbmirror.py).
+
+See [here](https://www.raspbian.org/RaspbianMirrors) for details about Raspbian mirroring.
diff --git a/raspbmirror/raspbmirror.py b/raspbmirror/raspbmirror.py
new file mode 100755
index 0000000..cebc187
--- /dev/null
+++ b/raspbmirror/raspbmirror.py
@@ -0,0 +1,835 @@
+#!/usr/bin/env python3
+
+# Copyright 2018 Peter Green
+# Released under the MIT/Expat license, see doc/COPYING
+
+import os
+import sys
+import hashlib
+import gzip
+import stat
+#from sortedcontainers import SortedDict
+#from sortedcontainers import SortedList
+from collections import deque
+from collections import OrderedDict
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+import argparse
+import re
+from heapq import heappush, heappop
+import fcntl
+
+parser = argparse.ArgumentParser(description="mirror raspbian repo.")
+parser.add_argument("baseurl", help="base url for source repo (e.g. https://archive.raspbian.org/ )",nargs='?')
+parser.add_argument("mdurl", help="base url for mirrordirector or local source mirror (e.g. https://mirrordirector.raspbian.org/ )",nargs='?')
+parser.add_argument("hpurl", help="base url for last result hash pool (e.g. http://snapshot.raspbian.org/hashpool )",nargs='?')
+
+parser.add_argument("--internal", help=argparse.SUPPRESS) #base URL for private repo (internal use only)
+parser.add_argument("--sourcepool", help="specify a source pool to look for packages in before downloading them (useful if maintaining multiple mirrors)",action='append')
+parser.add_argument("--tmpdir", help="specify a temporary directory to avoid storing temporary files in the output tree, must be on the same filesystem as the output tree")
+
+#debug option to set the index file used for the "downloadnew" phase but not the "finalize" phase, used to test error recovery.
+parser.add_argument("--debugfif", help=argparse.SUPPRESS)
+#debug option to set the source url used to download "dists" files during the "downloadnew" phase, used to test error recovery.
+parser.add_argument("--debugfdistsurl", help=argparse.SUPPRESS) + +parser.add_argument("--tlwhitelist", help="specify comma-seperated whitelist of top-level directories") + +parser.add_argument("--cleanup",help="scan for and remove files not managed by raspbmirror from mirror tree", action="store_true") + +parser.add_argument("--debugskippool",help="skip downloading pool data, only download metadata (for debugging)",action="store_true") + +parser.add_argument("--distswhitelist", help="specify comman seperated list of distributions") + +parser.add_argument("--nolock", help="don't try to lock the target directory", action="store_true") + +parser.add_argument("--repair", help="during mirroring, verify that all on-disk files match the expected sha256", action="store_true") + +parser.add_argument("--urllib", help="force usage of the builtin urllib module, even if urllib3 is present", action="store_true") + +parser.add_argument("--urllib3", help="force usage of the urllib3 module, panics if the dependency is missing", action="store_true") + +parser.add_argument("--ipv4", help="force usage of IPv4 addresses. Requires urllib3", action="store_true") + +parser.add_argument("--ipv6", help="force usage of IPv6 addresses. Requires urllib3", action="store_true") + +args = parser.parse_args() + +if not args.nolock: + lockfd = os.open('.',os.O_RDONLY) + fcntl.flock(lockfd,fcntl.LOCK_EX | fcntl.LOCK_NB) + +if args.urllib and args.urllib3: + print("error: flags --urllib and --urllib3 are in conflict") + exit(1) + +if args.urllib: + import urllib.request + use_urllib3 = False +elif args.urllib3: + import urllib3 + use_urllib3 = True +else: + # auto detect urllib3 + try: + import urllib3 + use_urllib3 = True + except: + import urllib.request + use_urllib3 = False + +if args.ipv4 and args.ipv6: + print("error: flags --ipv4 and --ipv6 are in conflict") + exit(1) + +if use_urllib3: + # the number of pools should be greater than the number of concurrently used sites. + # 10 should be safe. 
+    dlmanager = urllib3.PoolManager(num_pools=10)
+    print("info: using urllib3")
+
+    # a fairly hacky way to force the usage of ipv4 or ipv6 addresses
+    # https://stackoverflow.com/questions/33046733/force-requests-to-use-ipv4-ipv6
+    if args.ipv4:
+        import socket
+        import urllib3.util.connection as urllib3_cn
+        def allowed_gai_family():
+            return socket.AF_INET
+        urllib3_cn.allowed_gai_family = allowed_gai_family
+    elif args.ipv6:
+        import socket
+        import urllib3.util.connection as urllib3_cn
+        def allowed_gai_family():
+            return socket.AF_INET6
+        urllib3_cn.allowed_gai_family = allowed_gai_family
+else:
+    print("info: using urllib")
+    if args.ipv4:
+        print("error: flag --ipv4 requires the urllib3 package")
+        exit(1)
+    elif args.ipv6:
+        print("error: flag --ipv6 requires the urllib3 package")
+        exit(1)
+
+def addfilefromdebarchive(filestoverify,filequeue,filename,sha256,size):
+    size = int(size)
+    sha256andsize = [sha256,size,'M']
+    if filename in filestoverify:
+        if (sha256andsize[0:2] != filestoverify[filename][0:2]):
+            if stage == 'scanexisting':
+                print('warning: same file with different hash/size during scanexisting phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
+                #find existing sha256/size of file on disk if it exists
+                if os.path.isfile(filename):
+                    f = open(filename,'rb')
+                    data = f.read()
+                    f.close()
+                    sha256 = hashlib.sha256(data).hexdigest().encode('ascii')
+                    size = len(data)
+                else:
+                    #otherwise we have no idea
+                    sha256 = None
+                    size = None
+                filestoverify[filename] = [sha256,size,'M']
+            else:
+                print('error: same file with different hash/size during downloadnew phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
+                sys.exit(1)
+    else:
+        filestoverify[filename] = sha256andsize
+        addtofilequeue(filequeue,filename)
+
+def addtofilequeue(filequeue,filename):
+    filenamesplit = filename.split(b'/')
+    if b'dists' in filenamesplit:
+        if filename.endswith(b'.gz'):
+            # process gz files with high priority so they can be used as substitutes for their uncompressed counterparts
+            heappush(filequeue,(10,filename))
+        else:
+            heappush(filequeue,(20,filename))
+    else:
+        heappush(filequeue,(30,filename))
+
+
+#regex used for filename sanity checks
+pfnallowed = re.compile(rb'[a-z0-9A-Z\-_:+~.]+',re.ASCII)
+shaallowed = re.compile(b'[a-z0-9]+',re.ASCII)
+
+def ensuresafepath(path):
+    pathsplit = path.split(b'/')
+    if path.startswith(b'/'):
+        print("path must be relative")
+        sys.exit(1)
+    for component in pathsplit:
+        if not pfnallowed.fullmatch(component):
+            print("component "+ascii(component)+" in path "+ascii(path)+" contains unexpected characters")
+            sys.exit(1)
+        elif component.startswith(b'.'):
+            print("filenames starting with a dot are not allowed")
+            sys.exit(1)
+
+def geturl(fileurl):
+    if use_urllib3:
+        response = dlmanager.request("GET", fileurl.decode('ascii'))
+        ts = getts(fileurl, response)
+        return (response.data, ts)
+    else:
+        with urllib.request.urlopen(fileurl.decode('ascii')) as response:
+            data = response.read()
+            ts = getts(fileurl, response)
+            return (data, ts)
+
+def getts(fileurl, response):
+    if fileurl[:7] == b'file://':
+        ts = os.path.getmtime(fileurl[7:])
+    else:
+        dt = parsedate_to_datetime(response.getheader('Last-Modified'))
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        ts = dt.timestamp()
+    return ts
+
+
+def makenewpath(path):
+    if args.tmpdir is None:
+        return path+b'.new'
+    else:
+        return os.path.join(args.tmpdir.encode('ascii'),(path+b'.new').replace(b'/',b'~'))
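+
+
+# getfile: ensure <path> exists on disk with the expected sha256 and size.
+# It first checks any leftover "<path>.new" temp file, then the existing
+# on-disk copy; failing that it tries, in order: hardlinking/copying from a
+# --sourcepool, decompressing an already-known .gz sibling, the
+# mirrordirector URL (pool files only), the base/internal URL, and finally
+# the hash pool. Metadata (dists) failures abort; pool-file failures during
+# the main download are recorded and retried in the final stage.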
+def getfile(path,sha256,size):
+    ensuresafepath(path)
+    if not shaallowed.fullmatch(sha256):
+        print('invalid character in sha256 hash')
+        sys.exit(1)
+    #hashfn = b'../hashpool/' + sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256
+    #if os.path.isfile(hashfn):
+    #    if os.path.getsize(hashfn) != size:
+    #        print('size mismatch on existing file in hash pool')
+    #        sys.exit(1)
+    #else:
+    #    secondhashfn = None
+    #    if args.secondpool is not None:
+    #        secondhashfn = os.path.join(args.secondpool.encode('ascii'),sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256)
+    #        #print(secondhashfn)
+    #        if not os.path.isfile(secondhashfn):
+    #            secondhashfn = None
+    #    if secondhashfn is None:
+    #    else:
+    #        print('copying '+path.decode('ascii')+' with hash '+sha256.decode('ascii')+' from secondary pool')
+    #        f = open(secondhashfn,'rb')
+    #        data = f.read()
+    #        f.close()
+    #        ts = os.path.getmtime(secondhashfn)
+    #        sha256hash = hashlib.sha256(data)
+    #        sha256hashed = sha256hash.hexdigest().encode('ascii')
+    #        if (sha256 != sha256hashed):
+    #            #print(repr(filesize))
+    #            #print(repr(sha256))
+    #            #print(repr(sha256hashed))
+    #            print('hash mismatch while downloading file '+path.decode('ascii')+' '+sha256.decode('ascii')+' '+sha256hashed.decode('ascii'))
+    #            sys.exit(1)
+    #        if len(data) != size:
+    #            print('size mismatch while downloading file')
+    #            sys.exit(1)
+    #    hashdir = os.path.dirname(hashfn)
+    #    os.makedirs(hashdir,exist_ok=True)
+    #    f = open(hashfn,'wb')
+    #    f.write(data)
+    #    f.close()
+    #
+    #    os.utime(hashfn,(ts,ts))
+    if len(os.path.dirname(path)) > 0:
+        os.makedirs(os.path.dirname(path),exist_ok=True)
+    havenewfile = os.path.isfile(makenewpath(path))
+    if havenewfile: # "new" file already exists, let's check the hash
+        fn = makenewpath(path)
+        sha256hashed, tl = getfilesha256andsize(fn)
+        if (sha256 == sha256hashed) and (size == tl):
+            print('existing file '+path.decode('ascii')+' matched by hash and size')
+            fileupdates.add(path)
+            return # no download needed but rename is
+    if os.path.isfile(path): # file already exists
+        if (size == os.path.getsize(path)): #no point reading the data and calculating a hash if the size does not match
+            if (not args.repair) and (path in oldknownfiles) and (not havenewfile):
+                #shortcut exit if file is unchanged, we skip this if a "new" file was detected because
+                #that means some sort of update was going on to the file and may need to be finished/cleaned up.
+                oldsha256,oldsize,oldstatus = oldknownfiles[path]
+                if (oldsha256 == sha256) and (oldsize == size) and (oldstatus != 'F'):
+                    return # no update needed
+
+            sha256hashed, tl = getfilesha256andsize(path)
+            if (sha256 == sha256hashed) and (size == tl):
+                print('existing file '+path.decode('ascii')+' matched by hash and size')
+                if havenewfile:
+                    #if file is up to date but a "new" file exists and is bad
+                    #(we wouldn't have got this far if it was good)
+                    #schedule the "new" file for removal by adding it to "basefiles"
+                    basefiles.add(makenewpath(path))
+                return # no update needed
+    if os.path.isfile(path): # file already exists
+        fileupdates.add(path)
+        if os.path.isfile(makenewpath(path)):
+            os.remove(makenewpath(path))
+        outputpath = makenewpath(path)
+    else:
+        outputpath = path
+    pathsplit = path.split(b'/')
+    if (pathsplit[1:2] == [b'pool']) and (args.debugskippool):
+        print('skipping download of '+path.decode('ascii')+' because --debugskippool was specified')
+        return
+    if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
+        fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
+    else:
+        fileurl = baseurl + b'/' + path
+    data = None
+    if args.sourcepool is not None:
+        for sourcepool in args.sourcepool:
+            #print(repr(args.sourcepool))
+            #print(repr(sourcepool))
+            sourcepool = sourcepool.encode('ascii')
+            if (len(pathsplit) > 1) and (pathsplit[1] == b'pool'):
+                spp = os.path.join(sourcepool,b'/'.join(pathsplit[2:]))
+                if os.path.isfile(spp) and (size == os.path.getsize(spp)):
+                    print('trying file from sourcepool '+spp.decode('ascii'))
+                    ts = os.path.getmtime(spp)
+                    f = open(spp,'rb')
+                    data = f.read()
+                    f.close()
+                    sha256hash = hashlib.sha256(data)
+                    sha256hashed = sha256hash.hexdigest().encode('ascii')
+                    if (sha256 != sha256hashed):
+                        #print(repr(filesize))
+                        #print(repr(sha256))
+                        #print(repr(sha256hashed))
+                        print('hash mismatch while trying file from sourcepool, ignoring file')
+                        data = None
+                        continue
+                    try:
+                        os.link(spp,outputpath)
+                        print('successfully hardlinked file to source pool')
+                    except:
+                        print('file in source pool was good but hard linking failed, copying file instead')
+                        break
+                    fdownloads.write(outputpath+b'\n')
+                    fdownloads.flush()
+                    return
+    if data is None:
+        if path+b'.gz' in knownfiles:
+            if path+b'.gz' in fileupdates:
+                gzfile = makenewpath(path+b'.gz')
+            else:
+                gzfile = path+b'.gz'
+            print('uncompressing '+gzfile.decode('ascii')+' with hash '+sha256.decode('ascii')+' to '+outputpath.decode('ascii'))
+            f = gzip.open(gzfile)
+            data = f.read()
+            f.close()
+            ts = os.path.getmtime(gzfile)
+            if not checkdatahash(data, sha256, 'hash mismatch while uncompressing file ', path, ''):
+                sys.exit(1)
+            if len(data) != size:
+                print('size mismatch while uncompressing file')
+                sys.exit(1)
+
+    #use slicing so we don't error if pathsplit only has one item
+    if (data is None) and (mdurl is not None) and (pathsplit[1:2] == [b'pool']):
+        fileurl = mdurl + b'/' + path
+        #fileurl = mdurl + b'/' + b'/'.join(pathsplit[1:])
+        data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, ' from mirrordirector',' trying main server instead')
+    if data is None:
+        if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
+            fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
+        elif (args.debugfdistsurl is not None) and (stage == 'downloadnew') and (b'dists' in pathsplit):
+            fileurl = args.debugfdistsurl.encode('ascii') + b'/' + path
+        else:
+            fileurl = baseurl + b'/' + path
+        data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '','')
+        if data is None:
+            if (stage == 'downloadnew') and (b'dists' not in pathsplit):
+                print('continuing despite download failure of '+path.decode('ascii')+', may revisit later')
+                global dlerrorcount
+                dlerrorcount += 1
+                knownfiles[path][2] = 'F'
+                return
+    if (data is None) and (hpurl is not None):
+        print('failed to get '+path.decode('ascii')+' from normal sources, trying hash pool')
+        ensuresafepath(sha256)
+        fileurl = hpurl + b'/' + sha256[0:2] + b'/' + sha256[0:4] + b'/' + sha256
+        data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '', '')
+    if data is None:
+        print('failed to get '+path.decode('ascii')+' aborting')
+        sys.exit(1)
+    if data is not ...: #... is used to indicate that the file has been downloaded directly to disk and we don't
+                        #need to write it out here.
+        f = open(outputpath,'wb')
+        f.write(data)
+        f.close()
+    os.utime(outputpath,(ts,ts))
+    fdownloads.write(outputpath+b'\n')
+    fdownloads.flush()
+
+
+def getfilesha256andsize(fn):
+    sha256hash = hashlib.sha256()
+    f = open(fn, 'rb')
+    l = bs
+    tl = 0
+    while l == bs:
+        data = f.read(bs)
+        l = len(data)
+        tl += l
+        sha256hash.update(data)
+    f.close()
+    sha256hashed = sha256hash.hexdigest().encode('ascii')
+    return sha256hashed, tl
+
+
+bs = 16 * 1024 * 1024
+
+# Download fileurl to outputpath, streaming in bs-sized chunks while hashing.
+# Writes via a ".new" temp file when downloading over the final path. Returns
+# (data, ts); data is ... (Ellipsis) on success, meaning the payload went
+# straight to disk, and None on hash/size mismatch or error.
+def getandcheckfile(fileurl, sha256, size, path, outputpath, errorfromstr, errorsuffix):
+    f = None
+    try:
+        sha256hash = hashlib.sha256()
+        if path == outputpath:
+            writepath = makenewpath(path)
+            viamsg = ' via '+writepath.decode('ascii')
+        else:
+            writepath = outputpath
+            viamsg = ''
+        print('downloading ' + fileurl.decode('ascii') + ' with hash ' + sha256.decode('ascii') + ' to ' + outputpath.decode('ascii') + viamsg)
+        f = open(writepath, 'wb')
+        if use_urllib3:
+            response = dlmanager.request("GET", fileurl.decode('ascii'), preload_content=False)
+            ts = getts(fileurl, response)
+            tl = 0
+            for data in response.stream(bs):
+                tl += len(data)
+                f.write(data)
+                sha256hash.update(data)
+            response.release_conn()
+        else:
+            with urllib.request.urlopen(fileurl.decode('ascii')) as response:
+                l = bs
+                tl = 0
+                while l == bs:
+                    data = response.read(bs)
+                    f.write(data)
+                    l = len(data)
+                    tl += l
+                    sha256hash.update(data)
+                ts = getts(fileurl, response)
+
+        data = ... #used as a flag to indicate that the data is written to disk rather than stored in memory
+        f.close()
+        if not testandreporthash(sha256hash, sha256, 'hash mismatch while downloading file' + errorfromstr + ' ', path, errorsuffix):
+            data = None
+        elif tl != size:
+            print('size mismatch while downloading file' + errorfromstr + '.' + errorsuffix)
+            data = None
+    except Exception as e:
+        print('exception ' + str(e) + ' while downloading file' + errorfromstr + '.' + errorsuffix)
+        if f is not None:
+            f.close()
+        data = None
+        ts = None
+    if data is not None:
+        #success
+        if writepath != outputpath:
+            os.rename(writepath, outputpath)
+    else:
+        #failure, clean up writepath if necessary
+        if os.path.exists(writepath):
+            os.remove(writepath)
+
+    return data, ts
+
+
+def checkdatahash(data, sha256, errorprefix, path, errorsuffix):
+    sha256hash = hashlib.sha256(data)
+    return testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix)
+
+
+def testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix):
+    sha256hashed = sha256hash.hexdigest().encode('ascii')
+    if (sha256 != sha256hashed):
+        # print(repr(filesize))
+        # print(repr(sha256))
+        # print(repr(sha256hashed))
+        print(errorprefix + path.decode('ascii') + ' ' + sha256.decode('ascii') + ' ' + sha256hashed.decode('ascii') + errorsuffix)
+        return False
+    return True
+
+
+if (args.mdurl is None) or (args.mdurl.upper() == 'NONE'):
+    mdurl = None
+else:
+    mdurl = args.mdurl.encode('ascii')
+
+if (args.hpurl is None) or (args.hpurl.upper() == 'NONE'):
+    hpurl = None
+else:
+    hpurl = args.hpurl.encode('ascii')
+
+if args.baseurl is None:
+    baseurl = b'https://archive.raspbian.org'
+    mdurl = b'http://mirrordirector.raspbian.org'
+    hpurl = b'http://snapshot.raspbian.org/hashpool'
+else:
+    baseurl = args.baseurl.encode('ascii')
+
+
+
+
+symlinkupdates = list()
+fileupdates = set()
+
+def opengu(filepath):
+    #print('in opengu')
+    #print('filepath = '+repr(filepath))
+    #print('fileupdates = '+repr(fileupdates))
+    f = None
+    if (filepath in fileupdates):
+        print((b'opening '+makenewpath(filepath)+b' for '+filepath).decode('ascii'))
+        f = open(makenewpath(filepath),'rb')
+    elif (filepath+b'.gz' in fileupdates):
+        print((b'opening '+makenewpath(filepath+b'.gz')+b' for '+filepath).decode('ascii'))
+        f = gzip.open(makenewpath(filepath+b'.gz'),'rb')
+    elif os.path.exists(filepath):
+        print((b'opening '+filepath+b' for '+filepath).decode('ascii'))
+        f = open(filepath,'rb')
+    elif os.path.exists(filepath+b'.gz'):
+        print((b'opening '+filepath+b'.gz for '+filepath).decode('ascii'))
+        f = gzip.open(filepath+b'.gz','rb')
+    return f
+
+oldsymlinks = set()
+newsymlinks = set()
+
+fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"ab")
+
+dlerrorcount = 0
+
+# Three passes: scan the existing on-disk state, download the new snapshot
+# index and changed files, then a final catch-up pass (only run if some pool
+# downloads failed).
+for stage in ("scanexisting","downloadnew","finalize"):
+    if stage == "finalize":
+        if dlerrorcount == 0:
+            print('skipping stage 3 as there were no download failures in stage 2')
+            #we can finish now.
+            break
+        print('stage 3, download final updates')
+
+        oldknownfiles = knownfiles
+        oldsymlinks |= newsymlinks
+        newsymlinks = set()
+
+    if stage == "downloadnew":
+        print('stage 2, main download')
+        oldknownfiles = knownfiles
+        basefiles = set(oldknownfiles.keys())
+
+    if stage == "scanexisting":
+        print('stage 1, scan existing')
+    else:
+        if args.internal is not None:
+            fileurl = args.internal.encode('ascii') + b'/snapshotindex.txt'
+        else:
+            fileurl = baseurl +b'/snapshotindex.txt'
+
+        if (stage == "downloadnew") and (args.debugfif is not None):
+            fileurl = args.debugfif.encode('ascii')
+        (filedata,ts) = geturl(fileurl)
+
+        f = open(makenewpath(b'snapshotindex.txt'),'wb')
+        if (args.tlwhitelist is None) and (args.distswhitelist is None):
+            f.write(filedata)
+        else:
+            lines = filedata.split(b'\n')
+            if lines[-1] == b'':
+                del(lines[-1])
+            if args.tlwhitelist is not None:
+                tlwhitelist = set(args.tlwhitelist.encode('ascii').split(b','))
+                linesnew = []
+                for line in lines:
+                    linesplit = line.split(b'/')
+                    if linesplit[0] in tlwhitelist:
+                        linesnew.append(line)
+                lines = linesnew
+            if args.distswhitelist is not None:
+                distswhitelist = set(args.distswhitelist.encode('ascii').split(b','))
+                founddists = set()
+                foundesdists = set()
+                linesnew = []
+                for line in lines:
+                    path, sizeandsha = line.split(b' ')
+                    pathsplit = path.split(b'/')
+                    #print(pathsplit)
+                    #print(len(pathsplit))
+                    if (len(pathsplit) > 2) and (pathsplit[1] == b'dists'):
+                        if sizeandsha[0:2] == b'->': #symlink
+                            target = sizeandsha[2:]
+                            if target in distswhitelist:
+                                linesnew.append(line)
+                        elif pathsplit[2] in distswhitelist:
+                            linesnew.append(line)
+                            founddists.add((pathsplit[0],pathsplit[2]))
+                            if (len(pathsplit) > 3) and (pathsplit[3] == b'extrasources'):
+                                foundesdists.add((pathsplit[0],pathsplit[2]))
+                    elif (len(pathsplit) > 1) and pathsplit[1] == b'pool':
+                        pass
+                    else:
+                        linesnew.append(line)
+
+                lines = linesnew
+                if founddists == set():
+                    print('none of the whitelisted distributions were found in the index file')
+                    sys.exit(1)
+                missingesdists = founddists - foundesdists
+                if missingesdists != set():
+                    for toplevel,distribution in missingesdists:
+                        print((b'missing extra sources file for '+toplevel+b'/dists/'+distribution).decode('ascii'))
+                    sys.exit(1)
+            for line in lines:
+                f.write(line+b'\n')
+        f.close()
+        os.utime(makenewpath(b'snapshotindex.txt'),(ts,ts))
+
+    knownfiles = OrderedDict()
+    filequeue = []
+
+    if stage == "scanexisting":
+        if os.path.isfile(b'snapshotindex.txt'):
+            f = open(b'snapshotindex.txt','rb')
+        else:
+            continue
+    else:
+        f = open(makenewpath(b'snapshotindex.txt'),'rb')
+    for line in f:
+        line = line.strip()
+        filepath, sizeandsha = line.split(b' ')
+        if sizeandsha[:2] == b'->':
+            symlinktarget = sizeandsha[2:]
+            ensuresafepath(filepath)
+            ensuresafepath(symlinktarget)
+            if len(os.path.dirname(filepath)) > 0:
+                os.makedirs(os.path.dirname(filepath),exist_ok=True)
+            if stage == "scanexisting":
+                oldsymlinks.add(filepath)
+            else:
+                if os.path.islink(filepath):
+                    if os.readlink(filepath) != symlinktarget:
+                        symlinkupdates.append((filepath,symlinktarget))
+                else:
+                    print('creating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
+                    os.symlink(symlinktarget,filepath)
+                newsymlinks.add(filepath)
+        else:
+            size,sha256 = sizeandsha.split(b':')
+            size = int(size)
+            knownfiles[filepath] = [sha256,size,'R']
+            addtofilequeue(filequeue,filepath)
+
+    f.close()
+
+    extrasources = {}
+    while filequeue:
+        (priority, filepath) = heappop(filequeue)
+        #print('processing '+filepath.decode('ascii'))
+        sha256,size,status = knownfiles[filepath]
+        if (stage != "scanexisting") and ((filepath+b'.gz' not in knownfiles) or (status == 'R') or os.path.exists(filepath)):
+            getfile(filepath,sha256,size)
+        pathsplit = filepath.split(b'/')
+        #print(pathsplit[-1])
+        #if (pathsplit[-1] == b'Packages'):
+        #    print(repr(pathsplit))
+        if (pathsplit[-1] == b'Release') and (pathsplit[-3] == b'dists'):
+            distdir = b'/'.join(pathsplit[:-1])
+            f = opengu(filepath)
+            if f is None:
+                if stage == 'scanexisting':
+                    print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+                    continue
+                else:
+                    print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+                    sys.exit(1)
+            insha256 = False
+            for line in f:
+                #print(repr(line[0]))
+                if (line == b'SHA256:\n'):
+                    insha256 = True
+                elif ((line[0] == 32) and insha256):
+                    linesplit = line.split()
+                    filename = distdir+b'/'+linesplit[2]
+                    #if filename in knownfiles:
+                    #    if files
+                    #print(filename)
+                    addfilefromdebarchive(knownfiles,filequeue,filename,linesplit[0],linesplit[1])
+                else:
+                    insha256 = False
+            f.close()
+        elif (pathsplit[-1] == b'Packages') and ((pathsplit[-5] == b'dists') or ((pathsplit[-3] == b'debian-installer') and (pathsplit[-6] == b'dists'))):
+            if pathsplit[-5] == b'dists':
+                toplevel = b'/'.join(pathsplit[:-5])
+            else:
+                toplevel = b'/'.join(pathsplit[:-6])
+            print('found packages file: '+filepath.decode('ascii'))
+            pf = opengu(filepath)
+            if pf is None:
+                if stage == 'scanexisting':
+                    print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+                    continue
+                else:
+                    print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+                    sys.exit(1)
+
+            filename = None
+            size = None
+            sha256 = None
+
+            for line in pf:
+                linesplit = line.split()
+                if (len(linesplit) == 0):
+                    if (filename != None):
+                        addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size)
+                        filename = None
+                        size = None
+                        sha256 = None
+                elif (linesplit[0] == b'Filename:'):
+                    filename = toplevel+b'/'+linesplit[1]
+                elif (linesplit[0] == b'Size:'):
+                    size = linesplit[1]
+                elif (linesplit[0] == b'SHA256:'):
+                    sha256 = linesplit[1]
+            pf.close()
+        elif (pathsplit[-1] == b'Sources') and (pathsplit[-5] == b'dists'):
+            print('found sources file: '+filepath.decode('ascii'))
+            toplevel = b'/'.join(pathsplit[:-5])
+            pf = opengu(filepath)
+            if pf is None:
+                if stage == 'scanexisting':
+                    print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+                    continue
+                else:
+                    print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+                    sys.exit(1)
+            filesfound = []
+            directory = None
+            insha256p = False
+            for line in pf:
+                linesplit = line.split()
+                if (len(linesplit) == 0):
+                    for ls in filesfound:
+                        #print(repr(ls))
+                        addfilefromdebarchive(knownfiles,filequeue,toplevel+b'/'+directory+b'/'+ls[2],ls[0],ls[1])
+                    filesfound = []
+                    directory = None
+                    insha256p = False
+                elif ((line[0] == 32) and insha256p):
+                    filesfound.append(linesplit)
+                elif (linesplit[0] == b'Directory:'):
+                    insha256p = False
+                    directory = linesplit[1]
+                elif (linesplit[0] == b'Checksums-Sha256:'):
+                    insha256p = True
+                else:
+                    insha256p = False
+            pf.close()
+        elif (args.distswhitelist is not None) and (pathsplit[-1] == b'extrasources') and (pathsplit[-3] == b'dists'):
+            print('found extrasources file: '+filepath.decode('ascii'))
+            esf = opengu(filepath)
+            if esf is None:
+                if stage == 'scanexisting':
+                    print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+                    continue
+                else:
+                    print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+                    sys.exit(1)
+            for line in esf:
+                line = line.strip()
+                filename , shaandsize = line.split(b' ')
+                size , sha256 = shaandsize.split(b':')
+                addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size)
+                extrasources[filename] = shaandsize
+                #print(line)
+
+fdownloads.close()
+fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"rb")
+for line in fdownloads:
+    basefiles.add(line.strip())
+fdownloads.close()
+
+def throwerror(error):
+    raise error
+
+if args.cleanup:
+    towalk = os.walk('.', True, throwerror, False)
+    for (dirpath, dirnames, filenames) in towalk:
+        for filename in (filenames + dirnames): # os.walk seems to regard symlinks to directories as directories.
+            filepath = os.path.join(dirpath, filename)[2:].encode('ascii') # [2:] is to strip the ./ prefix
+            # print(filepath)
+            if os.path.islink(filepath):
+                oldsymlinks.add(filepath)
+        for filename in filenames:
+            filepath = os.path.join(dirpath, filename)[2:].encode('ascii') # [2:] is to strip the ./ prefix
+            if not os.path.islink(filepath) and not filepath.startswith(b'snapshotindex.txt') and not filepath.startswith(b'raspbmirrordownloads.txt'):
+                basefiles.add(filepath)
+
+print('stage 4, moves and deletions')
+
+for filepath in fileupdates:
+    print((b'renaming '+makenewpath(filepath)+b' to '+filepath).decode('ascii'))
+    os.replace(makenewpath(filepath),filepath)
+
+for (filepath,symlinktarget) in symlinkupdates:
+    print('updating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
+    os.remove(filepath)
+    os.symlink(symlinktarget,filepath)
+
+
+removedfiles = (basefiles | oldsymlinks) - (set(knownfiles.keys()) | newsymlinks)
+
+def isemptydir(dirpath):
+    #scandir would be significantly more efficient, but needs python 3.6 or above
+    #which is not reasonable to expect at this time.
+    #return os.path.isdir(dirpath) and ((next(os.scandir(dirpath), None)) is None)
+    return os.path.isdir(dirpath) and (len(os.listdir(dirpath)) == 0)
+
+if args.tmpdir is None:
+    tmpdir = None
+else:
+    tmpdir = args.tmpdir.encode('ascii')
+    if not tmpdir.endswith(b'/'):
+        tmpdir += b'/'
+
+
+for filepath in removedfiles:
+    #file may not actually exist, either due to earlier updates gone-wrong
+    #or due to the file being a non-realised uncompressed version of
+    #a gzipped file.
+    if os.path.exists(filepath):
+        checkpath = filepath
+        #if the path points into the temporary directory we only check the part of it
+        #that is relative to the temporary directory.
+        if tmpdir is not None and filepath.startswith(tmpdir):
+            checkpath = filepath[len(tmpdir):]
+        ensuresafepath(checkpath)
+        print('removing '+filepath.decode('ascii'))
+        os.remove(filepath)
+        #clean up empty directories.
+        dirpath = os.path.dirname(filepath)
+        while (len(dirpath) != 0) and isemptydir(dirpath):
+            print('removing empty dir '+dirpath.decode('ascii'))
+            os.rmdir(dirpath)
+            dirpath = os.path.dirname(dirpath)
+
+f = open(makenewpath(b'snapshotindex.txt'),'ab')
+for filename, shaandsize in extrasources.items():
+    f.write(filename+b' '+shaandsize+b'\n')
+f.close()
+
+os.rename(makenewpath(b'snapshotindex.txt'),b'snapshotindex.txt')
+os.remove(makenewpath(b'raspbmirrordownloads.txt'))