add csc-sync-exec

Max Erenberg 2023-01-28 17:06:16 -05:00 committed by Mirror
parent 52b5945857
commit fd75bedb61
5 changed files with 868 additions and 19 deletions


@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"gopkg.in/ini.v1"
@@ -139,6 +140,8 @@ type Repo struct {
 	PasswordFile string `ini:"password_file"`
 	// the archive name for the ftpsync command (required for csc-sync-ftpsync)
 	FtpsyncArchive string `ini:"ftpsync_archive"`
+	// the sync command to execute with `sh -c` (required for csc-sync-exec)
+	ExecStr string `ini:"exec"`
 	// full path to file storing the repo sync state
 	StateFile string `ini:"-"`
 	// full path for file storing general logging of this repo
@@ -258,11 +261,23 @@ func LoadConfig(configPath string, doneChan chan SyncResult, stopChan chan struc
 		panic("Missing or invalid frequency for " + repo.Name)
 	} else if repo.SyncType == "" {
 		panic("Missing sync type from " + repo.Name)
+	} else if repo.SyncType == "csc-sync-ftpsync" {
+		if repo.FtpsyncArchive == "" {
+			panic("Missing ftpsync archive for " + repo.Name)
+		}
+	} else if repo.SyncType == "csc-sync-exec" {
+		if repo.ExecStr == "" {
+			panic("Missing 'exec' for " + repo.Name)
+		}
+		// Allow some limited placeholders
+		repo.ExecStr = strings.ReplaceAll(repo.ExecStr, "{{repoLogFile}}", repo.RepoLogFile)
+		repo.ExecStr = strings.ReplaceAll(repo.ExecStr, "{{rsyncLogFile}}", repo.RsyncLogFile)
+		if strings.Contains(repo.ExecStr, "{{") {
+			panic(repo.Name + " has unsatisfied placeholders in 'exec': " + repo.ExecStr)
+		}
 	} else if repo.LocalDir == "" {
 		panic("Missing local download location for " + repo.Name)
-	} else if repo.SyncType == "csc-sync-ftpsync" && repo.FtpsyncArchive == "" {
-		panic("Missing ftpsync archive for " + repo.Name)
-	} else if repo.SyncType != "csc-sync-ftpsync" && repo.SyncType != "csc-sync-fedora" && repo.RsyncHost == "" {
+	} else if repo.RsyncHost == "" {
 		panic("Missing rsync host for " + repo.Name)
 	}
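Taken together, the two hunks above mean the {{repoLogFile}} and {{rsyncLogFile}} placeholders are expanded exactly once, at config-load time, and any placeholder still present after expansion is a fatal configuration error. The expanded string is later handed verbatim to `sh -c` (see the cscSyncExec change below). As a sketch, the [fedora] entry further down ends up spawning roughly the following, assuming both log files resolve under ~/merlin/log (the rsync log name here is illustrative):

    sh -c 'truncate --size=0 ~/merlin/log/fedora.log; cd ~/quick-fedora-mirror && ./quick-fedora-mirror > ~/merlin/log/fedora-rsync.log'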


@@ -279,8 +279,11 @@ rsync_dir = ftp.freebsd.org/pub/FreeBSD/
 
 ; This handles both fedora/linux and fedora/epel
-; See ~/quick-fedora-mirror/quick-fedora-mirror.conf
 [fedora]
-sync_type = csc-sync-fedora
+sync_type = csc-sync-exec
 frequency = bi-hourly
+; See ~/quick-fedora-mirror/quick-fedora-mirror.conf
+; LOGFILE is set to {{repoLogFile}} (~/merlin/log/fedora.log)
+exec = truncate --size=0 {{repoLogFile}}; cd ~/quick-fedora-mirror && ./quick-fedora-mirror > {{rsyncLogFile}}
 
 [ubuntu-ports-releases]
 sync_type = csc-sync-standard
@@ -396,12 +399,9 @@ rsync_host = rsync.alpinelinux.org
 rsync_dir = alpine
 
 [raspbian]
 ;verbose = true
-sync_type = csc-sync-standard
+sync_type = csc-sync-exec
 frequency = bi-hourly
-local_dir = raspbian
-rsync_host = raspbian.freemirror.org
-rsync_dir = raspbian
+exec = cd /mirror/root/raspbian && mkdir -p .~tmp~ && ~/raspbmirror/raspbmirror.py --tmpdir ./.~tmp~ --sourcepool /mirror/root/debian/pool > {{rsyncLogFile}}
 
 [raspberrypi]
 sync_type = csc-sync-standard-ipv6


@@ -55,8 +55,8 @@ func getSyncCommand(repo *config.Repo) (cmds [][]string) {
 		return append(cmds, cscSyncDebianStep1(repo), cscSyncDebianStep2(repo))
 	case "csc-sync-debian-cd":
 		return append(cmds, cscSyncDebianCD(repo))
-	case "csc-sync-fedora":
-		return append(cmds, cscSyncFedora(repo))
+	case "csc-sync-exec":
+		return append(cmds, cscSyncExec(repo))
 	case "csc-sync-ftpsync":
 		return append(cmds, cscSyncFtpsync(repo))
 	case "csc-sync-s3":
@@ -293,14 +293,8 @@ func cscSyncDebianCD(repo *config.Repo) []string {
 	return args
 }
 
-func cscSyncFedora(repo *config.Repo) []string {
-	// Make sure that repo.RepoLogFile (default: ~/merlin/log/fedora.log)
-	// is the same as the LOGFILE setting in ~/quick-fedora-mirror/quick-fedora-mirror.conf
-	return []string{
-		"sh", "-c",
-		"truncate --size=0 " + repo.RepoLogFile + "; " +
-			"cd ~/quick-fedora-mirror && ./quick-fedora-mirror > " + repo.RsyncLogFile,
-	}
-}
+func cscSyncExec(repo *config.Repo) []string {
+	return []string{"sh", "-c", repo.ExecStr}
+}
 
 func cscSyncFtpsync(repo *config.Repo) []string {
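With this dispatch in place, a csc-sync-exec repo runs exactly one child process: `sh -c` followed by the already-expanded exec string. For the [raspbian] entry above, that comes out to roughly the following (a sketch; the expanded {{rsyncLogFile}} path is illustrative):

    sh -c 'cd /mirror/root/raspbian && mkdir -p .~tmp~ && ~/raspbmirror/raspbmirror.py --tmpdir ./.~tmp~ --sourcepool /mirror/root/debian/pool > ~/merlin/log/raspbian-rsync.log'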

raspbmirror/README.md (new file, 5 lines added)

@@ -0,0 +1,5 @@
This script is to be used for Raspbian, which is NOT the same as Raspberry Pi OS.
It was downloaded from [here](https://raw.githubusercontent.com/plugwash/raspbian-tools/master/raspbmirror.py).
See [here](https://www.raspbian.org/RaspbianMirrors) for details about Raspbian mirroring.
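The merlin config in this commit invokes it along these lines (arguments taken from the `[raspbian]` entry in mirrors.ini; with no positional arguments the script falls back to the official archive, mirrordirector, and hash pool URLs):

    cd /mirror/root/raspbian && mkdir -p .~tmp~ && ~/raspbmirror/raspbmirror.py --tmpdir ./.~tmp~ --sourcepool /mirror/root/debian/pool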

raspbmirror/raspbmirror.py (new executable file, 835 lines added)

@@ -0,0 +1,835 @@
#!/usr/bin/env python3
# Copyright 2018 Peter Green
# Released under the MIT/Expat license, see doc/COPYING
import os
import sys
import hashlib
import gzip
import stat
#from sortedcontainers import SortedDict
#from sortedcontainers import SortedList
from collections import deque
from collections import OrderedDict
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
import argparse
import re
from heapq import heappush, heappop
import fcntl
parser = argparse.ArgumentParser(description="mirror raspbian repo.")
parser.add_argument("baseurl", help="base url for source repo (e.g. https://archive.raspbian.org/ )",nargs='?')
parser.add_argument("mdurl", help="base url for mirrordirector or local source mirror (e.g. https://mirrordirector.raspbian.org/ )",nargs='?')
parser.add_argument("hpurl", help="base url for last result hash pool (e.g. http://snapshot.raspbian.org/hashpool )",nargs='?')
parser.add_argument("--internal", help=argparse.SUPPRESS) #base URL for private repo (internal use only)
parser.add_argument("--sourcepool", help="specify a source pool to look for packages in before downloading them (useful if maintaining multiple mirrors)",action='append')
parser.add_argument("--tmpdir", help="specify a temporary directory to avoid storing temporary files in the output tree, must be on the same filesystem as the output tree")
#debug option to set the index file used for the "downloadnew" phase but not the "finalize" phase, used to test error recovery.
parser.add_argument("--debugfif", help=argparse.SUPPRESS)
#debug option to set the source url used to download "dists" files during the "downloadnew" phase, used to test error recovery.
parser.add_argument("--debugfdistsurl", help=argparse.SUPPRESS)
parser.add_argument("--tlwhitelist", help="specify comma-seperated whitelist of top-level directories")
parser.add_argument("--cleanup",help="scan for and remove files not managed by raspbmirror from mirror tree", action="store_true")
parser.add_argument("--debugskippool",help="skip downloading pool data, only download metadata (for debugging)",action="store_true")
parser.add_argument("--distswhitelist", help="specify comman seperated list of distributions")
parser.add_argument("--nolock", help="don't try to lock the target directory", action="store_true")
parser.add_argument("--repair", help="during mirroring, verify that all on-disk files match the expected sha256", action="store_true")
parser.add_argument("--urllib", help="force usage of the builtin urllib module, even if urllib3 is present", action="store_true")
parser.add_argument("--urllib3", help="force usage of the urllib3 module, panics if the dependency is missing", action="store_true")
parser.add_argument("--ipv4", help="force usage of IPv4 addresses. Requires urllib3", action="store_true")
parser.add_argument("--ipv6", help="force usage of IPv6 addresses. Requires urllib3", action="store_true")
args = parser.parse_args()
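# take an exclusive, non-blocking lock on the target directory (the cwd) so
# that concurrent mirror runs fail fast instead of interleaving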
if not args.nolock:
lockfd = os.open('.',os.O_RDONLY)
fcntl.flock(lockfd,fcntl.LOCK_EX | fcntl.LOCK_NB)
if args.urllib and args.urllib3:
print("error: flags --urllib and --urllib3 are in conflict")
exit(1)
if args.urllib:
import urllib.request
use_urllib3 = False
elif args.urllib3:
import urllib3
use_urllib3 = True
else:
# auto detect urllib3
try:
import urllib3
use_urllib3 = True
except ImportError:
import urllib.request
use_urllib3 = False
if args.ipv4 and args.ipv6:
print("error: flags --ipv4 and --ipv6 are in conflict")
exit(1)
if use_urllib3:
# the number of pools should be greater than the number of concurrently used sites.
# 10 should be safe.
dlmanager = urllib3.PoolManager(num_pools=10)
print("info: using urllib3")
# a fairly hacky way to force the usage of ipv4 or ipv6 addresses
# https://stackoverflow.com/questions/33046733/force-requests-to-use-ipv4-ipv6
if args.ipv4:
import socket
import requests.packages.urllib3.util.connection as urllib3_cn
def allowed_gai_family():
return socket.AF_INET
urllib3_cn.allowed_gai_family = allowed_gai_family
elif args.ipv6:
import socket
import requests.packages.urllib3.util.connection as urllib3_cn
def allowed_gai_family():
return socket.AF_INET6
urllib3_cn.allowed_gai_family = allowed_gai_family
else:
print("info: using urllib")
if args.ipv4:
print("error: flag --ipv4 requires the urllib3 package")
exit(1)
elif args.ipv6:
print("error: flag --ipv6 requires the urllib3 package")
exit(1)
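# record that `filename` is expected to have the given sha256/size (as listed
# in a Release/Packages/Sources index) and queue it for processing; duplicate
# entries that disagree are tolerated while scanning the existing mirror but
# are fatal during the download phase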
def addfilefromdebarchive(filestoverify,filequeue,filename,sha256,size):
size = int(size)
sha256andsize = [sha256,size,'M']
if filename in filestoverify:
if (sha256andsize[0:2] != filestoverify[filename][0:2]):
if stage == 'scanexisting':
print('warning: same file with different hash/size during scanexisting phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
#find existing sha1/size of file on disk if it exists
if os.path.isfile(filename):
f = open(filename,'rb')
data = f.read()
f.close()
sha256hash = hashlib.sha256(data)
sha256hashed = sha256hash.hexdigest().encode('ascii')
sha256 = sha256hashed
size = len(data)
else:
#otherwise we have no idea
sha256 = None
size = None
filestoverify[filename] = [sha256,size,'M']
else:
print('error: same file with different hash/size during downloadnew phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
sys.exit(1)
else:
filestoverify[filename] = sha256andsize
addtofilequeue(filequeue,filename)
def addtofilequeue(filequeue,filename):
    filenamesplit = filename.split(b'/')
    if b'dists' in filenamesplit:
        if filename.endswith(b'.gz'):
            # process gz files with high priority so they can be used as substitutes for their uncompressed counterparts
            heappush(filequeue,(10,filename))
        else:
            heappush(filequeue,(20,filename))
    else:
        heappush(filequeue,(30,filename))
#regex used for filename sanity checks
pfnallowed = re.compile(b'[a-z0-9A-Z\-_:\+~\.]+',re.ASCII)
shaallowed = re.compile(b'[a-z0-9]+',re.ASCII)
def ensuresafepath(path):
    pathsplit = path.split(b'/')
    if path[:1] == b'/':
        print("path must be relative")
        sys.exit(1)
    for component in pathsplit:
        if not pfnallowed.fullmatch(component):
            print("component "+ascii(component)+" in path "+ascii(path)+" contains unexpected characters")
            sys.exit(1)
        elif component[:1] == b'.':
            print("filenames starting with a dot are not allowed")
            sys.exit(1)
def geturl(fileurl):
if use_urllib3:
response = dlmanager.request("GET", fileurl.decode('ascii'))
ts = getts(fileurl, response)
return (response.data, ts)
else:
with urllib.request.urlopen(fileurl.decode('ascii')) as response:
data = response.read()
ts = getts(fileurl, response)
return (data, ts)
def getts(fileurl, response):
if fileurl[:7] == b'file://':
ts = os.path.getmtime(fileurl[7:])
else:
dt = parsedate_to_datetime(response.getheader('Last-Modified'))
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
ts = dt.timestamp()
return ts
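# name for the temporary copy of `path`: "<path>.new" alongside the file, or,
# when --tmpdir is given, the same name flattened (/ replaced by ~) inside tmpdir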
def makenewpath(path):
if args.tmpdir is None:
return path+b'.new'
else:
return os.path.join(args.tmpdir.encode('ascii'),(path+b'.new').replace(b'/',b'~'))
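# fetch `path` so that it matches the expected sha256 and size, trying in
# order: an existing "<path>.new" file, the current on-disk file, hardlinks
# from --sourcepool, decompressing a gzipped sibling already in the file list,
# the mirrordirector (pool files only), the main/base URL, and finally the
# last-resort hash pool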
def getfile(path,sha256,size):
ensuresafepath(path)
if not shaallowed.fullmatch(sha256):
print('invalid character in sha256 hash')
sys.exit(1)
#hashfn = b'../hashpool/' + sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256
#if os.path.isfile(hashfn):
# if os.path.getsize(hashfn) != size:
# print('size mismatch on existing file in hash pool')
# sys.exit(1)
#else:
# secondhashfn = None
# if args.secondpool is not None:
# secondhashfn = os.path.join(args.secondpool.encode('ascii'),sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256)
# #print(secondhashfn)
# if not os.path.isfile(secondhashfn):
# secondhashfn = None
# if secondhashfn is None:
# else:
# print('copying '+path.decode('ascii')+' with hash '+sha256.decode('ascii')+' from secondary pool')
# f = open(secondhashfn,'rb')
# data = f.read()
# f.close()
# ts = os.path.getmtime(secondhashfn)
# sha256hash = hashlib.sha256(data)
# sha256hashed = sha256hash.hexdigest().encode('ascii')
# if (sha256 != sha256hashed):
# #print(repr(filesize))
# #print(repr(sha256))
# #print(repr(sha256hashed))
# print('hash mismatch while downloading file '+path.decode('ascii')+' '+sha256.decode('ascii')+' '+sha256hashed.decode('ascii'));
# sys.exit(1)
# if len(data) != size:
# print('size mismatch while downloading file')
# sys.exit(1)
# hashdir = os.path.dirname(hashfn)
# os.makedirs(hashdir,exist_ok=True)
# f = open(hashfn,'wb')
# f.write(data)
# f.close()
#
# os.utime(hashfn,(ts,ts))
if len(os.path.dirname(path)) > 0:
os.makedirs(os.path.dirname(path),exist_ok=True)
havenewfile = os.path.isfile(makenewpath(path))
if havenewfile: # "new" file already exists, lets check the hash
fn = makenewpath(path)
sha256hashed, tl = getfilesha256andsize(fn)
if (sha256 == sha256hashed) and (size == tl):
print('existing file '+path.decode('ascii')+' matched by hash and size')
fileupdates.add(path)
return # no download needed but rename is
if os.path.isfile(path): # file already exists
if (size == os.path.getsize(path)): #no point reading the data and calculating a hash if the size does not match
if (not args.repair) and (path in oldknownfiles) and (not havenewfile):
#shortcut exit if file is unchanged, we skip this if a "new" file was detected because
#that means some sort of update was going on to the file and may need to be finished/cleaned up.
oldsha256,oldsize,oldstatus = oldknownfiles[path]
if (oldsha256 == sha256) and (oldsize == size) and (oldstatus != 'F'):
return # no update needed
sha256hashed, tl = getfilesha256andsize(path)
if (sha256 == sha256hashed) and (size == tl):
print('existing file '+path.decode('ascii')+' matched by hash and size')
if havenewfile:
#if file is up to date but a "new" file exists and is bad
#(we wouldn't have got this far if it was good)
#schedule the "new" file for removal by adding it to "basefiles"
basefiles.add(makenewpath(path))
return # no update needed
if os.path.isfile(path): # file already exists
fileupdates.add(path)
if os.path.isfile(makenewpath(path)):
os.remove(makenewpath(path))
outputpath = makenewpath(path)
else:
outputpath = path
pathsplit = path.split(b'/')
if (pathsplit[1:2] == [b'pool']) and (args.debugskippool):
print('skipping download of '+path.decode('ascii')+' because --debugskippool was specified')
return
if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
else:
fileurl = baseurl + b'/' + path
data = None
if args.sourcepool is not None:
for sourcepool in args.sourcepool:
#print(repr(args.sourcepool))
#print(repr(sourcepool))
sourcepool = sourcepool.encode('ascii')
if (len(pathsplit) > 1) and (pathsplit[1] == b'pool'):
spp = os.path.join(sourcepool,b'/'.join(pathsplit[2:]))
if os.path.isfile(spp) and (size == os.path.getsize(spp)):
print('trying file from sourcepool '+spp.decode('ascii'))
ts = os.path.getmtime(spp)
f = open(spp,'rb')
data = f.read()
f.close()
sha256hash = hashlib.sha256(data)
sha256hashed = sha256hash.hexdigest().encode('ascii')
if (sha256 != sha256hashed):
#print(repr(filesize))
#print(repr(sha256))
#print(repr(sha256hashed))
print('hash mismatch while trying file from sourcepool, ignoring file');
data = None
continue
try:
    os.link(spp,outputpath)
    print('successfully hardlinked file to source pool')
except OSError:
    print('file in source pool was good but hard linking failed, copying file instead')
    break
fdownloads.write(outputpath+b'\n')
fdownloads.flush()
return
if data is None:
if path+b'.gz' in knownfiles:
if path+b'.gz' in fileupdates:
gzfile = makenewpath(path+b'.gz')
else:
gzfile = path+b'.gz'
print('uncompressing '+gzfile.decode('ascii')+' with hash '+sha256.decode('ascii')+' to '+outputpath.decode('ascii'))
f = gzip.open(gzfile)
data = f.read()
f.close()
ts = os.path.getmtime(gzfile)
if not checkdatahash(data, sha256, 'hash mismatch while uncompressing file ', path, ''):
sys.exit(1)
if len(data) != size:
print('size mismatch while uncompressing file')
sys.exit(1)
#use slicing so we don't error if pathsplit only has one item
if (data is None) and (mdurl is not None) and (pathsplit[1:2] == [b'pool']):
fileurl = mdurl + b'/' + path
#fileurl = mdurl + b'/' + b'/'.join(pathsplit[1:])
data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, ' from mirrordirector',' trying main server instead')
if data is None:
if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
elif (args.debugfdistsurl is not None) and (stage == 'downloadnew') and (b'dists' in pathsplit):
fileurl = args.debugfdistsurl.encode('ascii') + b'/' + path
else:
fileurl = baseurl + b'/' + path
data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '','')
if data is None:
if (stage == 'downloadnew') and (b'dists' not in pathsplit):
print('continuing despite download failure of '+path.decode('ascii')+', may revisit later')
global dlerrorcount
dlerrorcount += 1
knownfiles[path][2] = 'F'
return
if (data is None) and (hpurl is not None):
print('failed to get '+path.decode('ascii')+' from normal sources, trying hash pool')
ensuresafepath(sha256)
fileurl = hpurl + b'/' + sha256[0:2] + b'/' + sha256[0:4] + b'/' + sha256
data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '', '')
if data is None:
print('failed to get '+path.decode('ascii')+' aborting')
sys.exit(1)
if data is not ...: #... is used to indicate that the file has been downloaded directly to disk and we don't
# need to write it out here.
f = open(outputpath,'wb')
f.write(data)
f.close()
os.utime(outputpath,(ts,ts))
fdownloads.write(outputpath+b'\n')
fdownloads.flush()
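# hash a file in bs-sized chunks (so large packages never need to fit in
# memory at once), returning (hex sha256 digest as ascii bytes, total length)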
def getfilesha256andsize(fn):
sha256hash = hashlib.sha256()
f = open(fn, 'rb')
l = bs
tl = 0
while l == bs:
data = f.read(bs)
l = len(data)
tl += l
sha256hash.update(data)
f.close()
sha256hashed = sha256hash.hexdigest().encode('ascii')
return sha256hashed, tl
bs = 16 * 1024 * 1024
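# stream fileurl to disk while hashing; on success returns (..., ts), where
# Ellipsis flags "already written to disk", and on failure cleans up the
# partial file and returns (None, None)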
def getandcheckfile(fileurl, sha256, size, path, outputpath, errorfromstr, errorsuffix):
f = None
try:
sha256hash = hashlib.sha256()
if path == outputpath:
writepath = makenewpath(path)
viamsg = ' via '+writepath.decode('ascii')
else:
writepath = outputpath
viamsg = ''
print(
'downloading ' + fileurl.decode('ascii') + ' with hash ' + sha256.decode(
'ascii') + ' to ' + outputpath.decode(
'ascii') + viamsg)
f = open(writepath, 'wb')
if use_urllib3:
response = dlmanager.request("GET", fileurl.decode('ascii'), preload_content=False)
ts = getts(fileurl, response)
tl = 0
for data in response.stream(bs):
tl += len(data)
f.write(data)
sha256hash.update(data)
response.release_conn()
else:
with urllib.request.urlopen(fileurl.decode('ascii')) as response:
l = bs
tl = 0
while l == bs:
data = response.read(bs)
f.write(data)
l = len(data)
tl += l
sha256hash.update(data)
ts = getts(fileurl, response)
data = ... #used as a flag to indicate that the data is written to disk rather than stored in memory
f.close()
if not testandreporthash(sha256hash, sha256, 'hash mismatch while downloading file' + errorfromstr + ' ', path,
errorsuffix):
data = None
elif tl != size:
print('size mismatch while downloading file' + errorfromstr + '.' + errorsuffix)
data = None
except Exception as e:
print('exception ' + str(e) + ' while downloading file' + errorfromstr + '.' + errorsuffix)
if f is not None:
f.close()
data = None
ts = None
if data is not None:
#success
if writepath != outputpath:
os.rename(writepath, outputpath)
else:
#failure, cleanup writepath if necessary
if os.path.exists(writepath):
os.remove(writepath)
return data, ts
def checkdatahash(data, sha256, errorprefix, path, errorsuffix):
sha256hash = hashlib.sha256(data)
return testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix)
def testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix):
sha256hashed = sha256hash.hexdigest().encode('ascii')
if (sha256 != sha256hashed):
# print(repr(filesize))
# print(repr(sha256))
# print(repr(sha256hashed))
print(errorprefix + path.decode('ascii') + ' ' + sha256.decode('ascii') + ' ' + sha256hashed.decode(
'ascii') + errorsuffix);
return False
return True
if (args.mdurl is None) or (args.mdurl.upper() == 'NONE'):
mdurl = None
else:
mdurl = args.mdurl.encode('ascii')
if (args.hpurl is None) or (args.hpurl.upper() == 'NONE'):
hpurl = None
else:
hpurl = args.hpurl.encode('ascii')
if args.baseurl is None:
baseurl = b'https://archive.raspbian.org'
mdurl = b'http://mirrordirector.raspbian.org'
hpurl = b'http://snapshot.raspbian.org/hashpool'
else:
baseurl = args.baseurl.encode('ascii')
symlinkupdates = list()
fileupdates = set()
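# open `filepath` for reading, preferring a pending ".new" replacement and
# falling back to a gzipped sibling (decompressed transparently) if needed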
def opengu(filepath):
#print('in opengu')
#print('filepath = '+repr(filepath))
#print('fileupdates = '+repr(fileupdates))
f = None
if (filepath in fileupdates):
print((b'opening '+makenewpath(filepath)+b' for '+filepath).decode('ascii'))
f = open(makenewpath(filepath),'rb')
elif (filepath+b'.gz' in fileupdates):
print((b'opening '+makenewpath(filepath+b'.gz')+b' for '+filepath).decode('ascii'))
f = gzip.open(makenewpath(filepath+b'.gz'),'rb')
elif os.path.exists(filepath):
print((b'opening '+filepath+b' for '+filepath).decode('ascii'))
f = open(filepath,'rb')
elif os.path.exists(filepath+b'.gz'):
print((b'opening '+filepath+b'.gz for '+filepath).decode('ascii'))
f = gzip.open(filepath+b'.gz','rb')
return f
oldsymlinks = set()
newsymlinks = set()
fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"ab")
dlerrorcount = 0;
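# up to three passes: "scanexisting" inventories the current tree from the old
# snapshotindex.txt, "downloadnew" fetches the new index and downloads what
# changed, and "finalize" repeats the download only if stage 2 had failures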
for stage in ("scanexisting","downloadnew","finalize"):
if stage == "finalize":
if dlerrorcount == 0:
print('skipping stage 3 as there were no download failures in stage 2')
#we can finish now.
break
print('stage 3, download final updates')
oldknownfiles = knownfiles
oldsymlinks |= newsymlinks
newsymlinks = set()
if stage == "downloadnew":
print('stage 2, main download')
oldknownfiles = knownfiles
basefiles = set(oldknownfiles.keys())
if stage == "scanexisting":
print('stage 1, scan existing')
else:
if args.internal is not None:
fileurl = args.internal.encode('ascii') + b'/snapshotindex.txt'
else:
fileurl = baseurl +b'/snapshotindex.txt'
if (stage == "downloadnew") and (args.debugfif is not None):
fileurl = args.debugfif.encode('ascii')
(filedata,ts) = geturl(fileurl)
f = open(makenewpath(b'snapshotindex.txt'),'wb')
if (args.tlwhitelist is None) and (args.distswhitelist is None):
f.write(filedata)
else:
lines = filedata.split(b'\n')
if lines[-1] == b'':
del(lines[-1])
if args.tlwhitelist is not None:
tlwhitelist = set(args.tlwhitelist.encode('ascii').split(b','))
linesnew = []
for line in lines:
linesplit = line.split(b'/')
if linesplit[0] in tlwhitelist:
linesnew.append(line)
lines = linesnew
if args.distswhitelist is not None:
distswhitelist = set(args.distswhitelist.encode('ascii').split(b','))
founddists = set()
foundesdists = set()
linesnew = []
for line in lines:
path, sizeandsha = line.split(b' ')
pathsplit = path.split(b'/')
#print(pathsplit)
#print(len(pathsplit))
if (len(pathsplit) > 2) and (pathsplit[1] == b'dists'):
if sizeandsha[0:2] == b'->': #symlink
target = sizeandsha[2:]
if target in distswhitelist:
linesnew.append(line)
elif pathsplit[2] in distswhitelist:
linesnew.append(line)
founddists.add((pathsplit[0],pathsplit[2]))
if (len(pathsplit) > 3) and (pathsplit[3] == b'extrasources'):
foundesdists.add((pathsplit[0],pathsplit[2]))
elif (len(pathsplit) > 1) and pathsplit[1] == b'pool':
pass
else:
linesnew.append(line)
lines = linesnew
if founddists == set():
print('none of the whitelisted distributions were found in the index file')
sys.exit(1)
missingesdists = founddists - foundesdists
if missingesdists != set():
for toplevel,distribution in missingesdists:
print((b'missing extra sources file for '+toplevel+b'/dists/'+distribution).decode('ascii'))
sys.exit(1)
for line in lines:
f.write(line+b'\n')
f.close()
os.utime(makenewpath(b'snapshotindex.txt'),(ts,ts))
knownfiles = OrderedDict()
filequeue = []
if stage == "scanexisting":
if os.path.isfile(b'snapshotindex.txt'):
f = open(b'snapshotindex.txt','rb')
else:
continue
else:
f = open(makenewpath(b'snapshotindex.txt'),'rb')
for line in f:
line = line.strip()
filepath, sizeandsha = line.split(b' ')
if sizeandsha[:2] == b'->':
symlinktarget = sizeandsha[2:]
ensuresafepath(filepath)
ensuresafepath(symlinktarget)
if len(os.path.dirname(filepath)) > 0:
os.makedirs(os.path.dirname(filepath),exist_ok=True)
if stage == "scanexisting":
oldsymlinks.add(filepath)
else:
if os.path.islink(filepath):
if os.readlink(filepath) != symlinktarget:
symlinkupdates.append((filepath,symlinktarget))
else:
print('creating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
os.symlink(symlinktarget,filepath)
newsymlinks.add(filepath)
else:
size,sha256 = sizeandsha.split(b':')
size = int(size)
knownfiles[filepath] = [sha256,size,'R']
addtofilequeue(filequeue,filepath)
f.close()
extrasources = {}
while filequeue:
(priority, filepath) = heappop(filequeue)
#print('processing '+filepath.decode('ascii'))
sha256,size,status = knownfiles[filepath]
if (stage != "scanexisting") and ((filepath+b'.gz' not in knownfiles) or (status == 'R') or os.path.exists(filepath)):
getfile(filepath,sha256,size)
pathsplit = filepath.split(b'/')
#print(pathsplit[-1])
#if (pathsplit[-1] == b'Packages'):
# print(repr(pathsplit))
if (pathsplit[-1] == b'Release') and (pathsplit[-3] == b'dists'):
distdir = b'/'.join(pathsplit[:-1])
f = opengu(filepath)
if f is None:
if stage == 'scanexisting':
print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
insha256 = False;
for line in f:
#print(repr(line[0]))
if (line == b'SHA256:\n'):
insha256 = True
elif ((line[0] == 32) and insha256):
linesplit = line.split()
filename = distdir+b'/'+linesplit[2]
#if filename in knownfiles:
# if files
#print(filename)
addfilefromdebarchive(knownfiles,filequeue,filename,linesplit[0],linesplit[1]);
else:
insha256 = False
f.close()
elif (pathsplit[-1] == b'Packages') and ((pathsplit[-5] == b'dists') or ((pathsplit[-3] == b'debian-installer') and (pathsplit[-6] == b'dists'))):
if pathsplit[-5] == b'dists':
toplevel = b'/'.join(pathsplit[:-5])
else:
toplevel = b'/'.join(pathsplit[:-6])
print('found packages file: '+filepath.decode('ascii'))
pf = opengu(filepath)
if pf is None:
if stage == 'scanexisting':
print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
filename = None
size = None
sha256 = None
for line in pf:
linesplit = line.split()
if (len(linesplit) == 0):
if (filename != None):
addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size);
filename = None
size = None
sha256 = None
elif (linesplit[0] == b'Filename:'):
filename = toplevel+b'/'+linesplit[1]
elif (linesplit[0] == b'Size:'):
size = linesplit[1]
elif (linesplit[0] == b'SHA256:'):
sha256 = linesplit[1]
pf.close()
elif (pathsplit[-1] == b'Sources') and (pathsplit[-5] == b'dists'):
print('found sources file: '+filepath.decode('ascii'))
toplevel = b'/'.join(pathsplit[:-5])
pf = opengu(filepath)
if pf is None:
if stage == 'scanexisting':
print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
filesfound = [];
directory = None
insha256p = False;
for line in pf:
linesplit = line.split()
if (len(linesplit) == 0):
for ls in filesfound:
#print(repr(ls))
addfilefromdebarchive(knownfiles,filequeue,toplevel+b'/'+directory+b'/'+ls[2],ls[0],ls[1]);
filesfound = [];
directory = None
insha256p = False
elif ((line[0] == 32) and insha256p):
filesfound.append(linesplit)
elif (linesplit[0] == b'Directory:'):
insha256p = False
directory = linesplit[1]
elif (linesplit[0] == b'Checksums-Sha256:'):
insha256p = True
else:
insha256p = False
pf.close()
elif (args.distswhitelist is not None) and (pathsplit[-1] == b'extrasources') and (pathsplit[-3] == b'dists'):
print('found extrasources file: '+filepath.decode('ascii'))
esf = opengu(filepath)
if esf is None:
if stage == 'scanexisting':
print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
for line in esf:
line = line.strip()
filename , shaandsize = line.split(b' ')
size , sha256 = shaandsize.split(b':')
addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size)
extrasources[filename] = shaandsize
#print(line)
fdownloads.close()
fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"rb")
for line in fdownloads:
basefiles.add(line.strip())
fdownloads.close()
def throwerror(error):
raise error
if args.cleanup:
towalk = os.walk('.', True, throwerror, False)
for (dirpath, dirnames, filenames) in towalk:
for filename in (filenames + dirnames): # os.walk seems to regard symlinks to directories as directories.
filepath = os.path.join(dirpath, filename)[2:].encode('ascii') # [2:] is to strip the ./ prefix
# print(filepath)
if os.path.islink(filepath):
oldsymlinks.add(filepath)
for filename in filenames:
filepath = os.path.join(dirpath, filename)[2:].encode('ascii') # [2:] is to strip the ./ prefix
if not os.path.islink(filepath) and not filepath.startswith(b'snapshotindex.txt') and not filepath.startswith(b'raspbmirrordownloads.txt'):
basefiles.add(filepath)
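# promote ".new" files into place, retarget changed symlinks, then delete
# whatever remains on disk that the new index no longer references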
print('stage 4, moves and deletions')
for filepath in fileupdates:
print((b'renaming '+makenewpath(filepath)+b' to '+filepath).decode('ascii'))
os.replace(makenewpath(filepath),filepath)
for (filepath,symlinktarget) in symlinkupdates:
print('updating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
os.remove(filepath)
os.symlink(symlinktarget,filepath)
removedfiles = (basefiles | oldsymlinks) - (set(knownfiles.keys()) | newsymlinks)
def isemptydir(dirpath):
#scandir would be significantly more efficient, but needs python 3.6 or above
#which is not reasonable to expect at this time.
#return os.path.isdir(dirpath) and ((next(os.scandir(dirpath), None)) is None)
return os.path.isdir(dirpath) and (len(os.listdir(dirpath)) == 0)
if args.tmpdir is None:
tmpdir = None
else:
tmpdir = args.tmpdir.encode('ascii')
if not tmpdir.endswith(b'/'):
tmpdir += b'/'
for filepath in removedfiles:
#file may not actually exist, either due to earlier updates gone-wrong
#or due to the file being a non-realised uncompressed version of
#a gzipped file.
if os.path.exists(filepath):
checkpath = filepath
#if the path points into the temporary directory we only check the part of it
#that is relative to the temporary directory.
if tmpdir is not None and filepath.startswith(tmpdir):
checkpath = filepath[len(tmpdir):]
ensuresafepath(checkpath)
print('removing '+filepath.decode('ascii'))
os.remove(filepath)
#clean up empty directories.
dirpath = os.path.dirname(filepath)
while (len(dirpath) != 0) and isemptydir(dirpath):
print('removing empty dir '+dirpath.decode('ascii'))
os.rmdir(dirpath)
dirpath = os.path.dirname(dirpath)
f = open(makenewpath(b'snapshotindex.txt'),'ab')
for filename, shaandsize in extrasources.items():
f.write(filename+b' '+shaandsize+b'\n')
f.close()
os.rename(makenewpath(b'snapshotindex.txt'),b'snapshotindex.txt')
os.remove(makenewpath(b'raspbmirrordownloads.txt'))