diff --git a/.gitignore b/.gitignore index 4cca111..647a291 100644 --- a/.gitignore +++ b/.gitignore @@ -1,17 +1,21 @@ !.gitignore +*.swp .git_old/ /dead.letter /*.bz2 +/.aptitude/ +/.bash_history +/.bashrc +/.cache/ +/.config/ +/.lesshst +/.ssh/ +/.vim/ +/.vimrc +/.viminfo +/.wget-hsts +/.zcompdump +/go/ +/quick-fedora-mirror/timefile* /passwords/ /tmp/ -/.bash_history -/.lesshst -/.vim/ -/.viminfo -/.zcompdump -/.aptitude/ -/.config/ -/.cache/ -/.ssh/ -/go/ -/.bashrc diff --git a/merlin/config/config.go b/merlin/config/config.go index cf1e8e6..fbb7b63 100644 --- a/merlin/config/config.go +++ b/merlin/config/config.go @@ -18,6 +18,7 @@ const ( TWICE_HOURLY = HOURLY / 2 BI_HOURLY = HOURLY * 2 TRI_HOURLY = HOURLY * 3 + QUAD_HOURLY = HOURLY * 4 TEN_MINUTELY = 600 FIVE_MINUTELY = 300 MINUTELY = 60 @@ -43,6 +44,7 @@ var frequencies = map[string]int{ "twice-hourly": TWICE_HOURLY, "bi-hourly": BI_HOURLY, "tri-hourly": TRI_HOURLY, + "quad-hourly": QUAD_HOURLY, "ten-minutely": TEN_MINUTELY, "five-minutely": FIVE_MINUTELY, "minutely": MINUTELY, @@ -262,7 +264,7 @@ func LoadConfig(configPath string, doneChan chan SyncResult, stopChan chan struc panic("Missing local download location for " + repo.Name) } else if repo.SyncType == "csc-sync-ftpsync" && repo.FtpsyncArchive == "" { panic("Missing ftpsync archive for " + repo.Name) - } else if repo.SyncType != "csc-sync-ftpsync" && repo.RsyncHost == "" { + } else if repo.SyncType != "csc-sync-ftpsync" && repo.SyncType != "csc-sync-fedora" && repo.RsyncHost == "" { panic("Missing rsync host for " + repo.Name) } diff --git a/merlin/logger/logger.go b/merlin/logger/logger.go index 6ff8f66..84f05b1 100644 --- a/merlin/logger/logger.go +++ b/merlin/logger/logger.go @@ -28,8 +28,9 @@ var levels = map[int]string{ ERROR: "[ERROR]", } -var outLogger = log.New(os.Stdout, "", log.LstdFlags) -var errLogger = log.New(os.Stderr, "", log.LstdFlags) +// We don't need to log the date/time because journald takes care of that for us +var outLogger = 
log.New(os.Stdout, "", 0) +var errLogger = log.New(os.Stderr, "", 0) // log to stdout func OutLog(v ...interface{}) { @@ -77,7 +78,7 @@ func (logger *Logger) Debug(v ...interface{}) { // write information to the logfile and to stdout func (logger *Logger) Info(v ...interface{}) { - OutLog(append([]interface{}{"[" + logger.name + "]"}, v...)) + OutLog(append([]interface{}{"[" + logger.name + "]"}, v...)...) logger.log(INFO, v...) } @@ -88,6 +89,6 @@ func (logger *Logger) Warning(v ...interface{}) { // write errors to the logfile and to stderr func (logger *Logger) Error(v ...interface{}) { - ErrLog(append([]interface{}{"[" + logger.name + "]"}, v...)) + ErrLog(append([]interface{}{"[" + logger.name + "]"}, v...)...) logger.log(ERROR, v...) } diff --git a/merlin/merlin-config.ini b/merlin/merlin-config.ini index ae781c5..fbfb7ce 100644 --- a/merlin/merlin-config.ini +++ b/merlin/merlin-config.ini @@ -14,7 +14,7 @@ sock_path = /mirror/merlin/run/merlin.sock [debian] sync_type = csc-sync-ftpsync -frequency = bi-hourly +frequency = quad-hourly ftpsync_archive = debian [ubuntu] @@ -111,14 +111,6 @@ local_dir = CTAN rsync_host = rsync.dante.ctan.org rsync_dir = CTAN -[fedora-epel] -verbose = true -sync_type = csc-sync-standard -frequency = bi-hourly -local_dir = fedora/epel -rsync_host = mirrors.kernel.org -rsync_dir = fedora-epel - [cygwin] sync_type = csc-sync-standard frequency = twice-daily @@ -284,14 +276,11 @@ local_dir = FreeBSD rsync_host = ftp2.uk.freebsd.org rsync_dir = ftp.freebsd.org/pub/FreeBSD/ -[fedora-enchilada] -verbose = true -; csc-sync-standard fedora/linux mirrors.kernel.org fedora-enchilada/linux/ --ignore-errors && ~/bin/report_mirror >/dev/null -sync_type = csc-sync-standard +; This handles both fedora/linux and fedora/epel +; See ~/quick-fedora-mirror/quick-fedora-mirror.conf +[fedora] +sync_type = csc-sync-fedora frequency = bi-hourly -local_dir = fedora/linux -rsync_host = mirrors.kernel.org -rsync_dir = fedora-enchilada/linux/ 
[ubuntu-ports-releases] sync_type = csc-sync-standard diff --git a/merlin/merlin-go.service b/merlin/merlin-go.service index c783973..844713c 100644 --- a/merlin/merlin-go.service +++ b/merlin/merlin-go.service @@ -3,11 +3,12 @@ Description=Manages synchronization of mirrored projects After=network.target [Service] -ExecStart=/home/mirror/merlin/merlin --config=/home/mirror/merlin/merlin-config.ini +ExecStart=/home/mirror/merlin/merlin --config=./merlin-config.ini WorkingDirectory=/home/mirror/merlin User=mirror Group=mirror SyslogIdentifier=merlin +ExecReload=kill -HUP $MAINPID [Install] WantedBy=multi-user.target diff --git a/merlin/sync/sync.go b/merlin/sync/sync.go index c096c9f..0959b83 100644 --- a/merlin/sync/sync.go +++ b/merlin/sync/sync.go @@ -55,6 +55,8 @@ func getSyncCommand(repo *config.Repo) (cmds [][]string) { return append(cmds, cscSyncDebianStep1(repo), cscSyncDebianStep2(repo)) case "csc-sync-debian-cd": return append(cmds, cscSyncDebianCD(repo)) + case "csc-sync-fedora": + return append(cmds, cscSyncFedora(repo)) case "csc-sync-ftpsync": return append(cmds, cscSyncFtpsync(repo)) case "csc-sync-s3": @@ -291,6 +293,16 @@ func cscSyncDebianCD(repo *config.Repo) []string { return args } +func cscSyncFedora(repo *config.Repo) []string { + // Make sure that repo.RepoLogFile (default: ~/merlin/log/fedora.log) + // is the same as the LOGFILE setting in ~/quick-fedora-mirror/quick-fedora-mirror.conf + return []string{ + "sh", "-c", + "truncate --size=0 " + repo.RepoLogFile + "; " + + "cd ~/quick-fedora-mirror && ./quick-fedora-mirror > " + repo.RsyncLogFile, + } +} + func cscSyncFtpsync(repo *config.Repo) []string { // ftpsync configs are in /home/mirror/ftpsync/ return []string{"ftpsync", "sync:archive:" + repo.FtpsyncArchive} diff --git a/quick-fedora-mirror/quick-fedora-mirror b/quick-fedora-mirror/quick-fedora-mirror new file mode 100755 index 0000000..514bfc7 --- /dev/null +++ b/quick-fedora-mirror/quick-fedora-mirror @@ -0,0 +1,1679 @@ +#!/bin/zsh 
+# Simple script to grab the file list from Fedora and rsync everything that's +# changed since the last time we pulled. +# +# Originally written by Jason Tibbitts in 2016. +# Donated to the public domain. If you require a statement of license, please +# consider this work to be licensed as "CC0 Universal", any version you choose. + +# Variables in upper case are user configurables. + +# ZSHISM? Turn on empty globs +set -G +export LANG=C +# ZSHISM? newline for IFS. +IFS=$'\n' + +# Declare globals +typeset -A tcounts # Transfer counts + +# Do this very early +starttime=$(date +%s) + +# Debug output; +# Level 0: nothing except errors. +# Level 1: lvl0 unless there is a tranfer, and then basic info and times. +# Output goes to a file which may be spit out at the end of the run. +# Level >= 2: Always some info, output to the terminal. +db1 () { + if (( VERBOSE >= 2 )); then + echo $* + elif (( VERBOSE >= 1 )); then + echo $* >> $outfile + fi + # Otherwise output nothing.... +} +db1f () { db1 $(printf $*); } + +db2 () { (( VERBOSE >= 2 )) && echo $*} +db2f () { (( VERBOSE >= 2 )) && printf $*} +db3 () { (( VERBOSE >= 3 )) && echo '>>' $*} +db4 () { (( VERBOSE >= 4 )) && echo '>>>>' $*} +sep () { (( VERBOSE >= 2 )) && echo '============================================================'} + +logwrite () { + # Send logging info to the right place + if [[ -n $LOGJOURNAL ]]; then + echo $* >&3 + elif [[ -n $LOGFILE && -w $LOGFILE ]]; then + echo $(date '+%b %d %T') $* >> $LOGFILE + fi +} + +logit () { + # Basic logging function + local item=$1 + shift + local err='' + [[ $item == 'E' ]] && err='ERR:' + [[ $item == 'e' ]] && err='Err:' + + if [[ $LOGITEMS =~ $item || $LOGITEMS =~ '@' ]]; then + logwrite $err $* + fi + if (( VERBOSE >= 3 )); then + db3 Log: $err $* + fi + + # XXX Consider sending errors to stdout + #if [[ -n $err ]]; then + # (>&2 echo $*) + #fi + +} + +retcheck () { + local ret=$1 + local prg='' + [[ -n $2 ]] && prg="$2 " + + if [[ $ret -ne 0 ]]; then + db1 
"${prg}failed at $functrace[1]: with return $ret" + logit E "${prg}call failed at $functrace[1]: with return $ret" + fi +} + +lock () { + eval "exec 9>>$1" + flock -n 9 && return 0 + return 1 +} + +save_state () { + # Doing an mv here actually undoes the locking. Could use cp instead. + # Currently the unlocking is a good thing because it allows the checkin to + # proceed without the next run waiting. But this should be audited. + if [[ -z $skiptimestamp ]]; then + db2 Saving mirror time to $TIMEFILE + if [[ -e $TIMEFILE ]]; then + mv $TIMEFILE $TIMEFILE.prev + fi + echo LASTTIME=$starttime > $TIMEFILE + + if (( ? != 0 )); then + (>&2 echo Problem saving timestamp file $TIMEFILE) + logit E "Failed to update timestamp file" + exit 1 + fi + else + db2 Skipping timestamp save. + fi +} + +append_state () { + # Think about how to save extra state in the timestamp file or some + # associated file. Should we even do this? + # Should this be saved to a separate status file instead? + + + # Cannot rewrite the file or else the locking breaks. Updating it should + # be OK. + # Save things in a format that can be sourced (VAR=value). + # Repeated uses (VAR=value2) are OK and overwrite the previous value when the file is sourced. + + # What would use this? A separate status program or some other monitor? + # + # Save data about the current transfer: + # The current point in the process ( + # Counts + # The current tempdir + # Important transfer list files + # The current rsync output file (for tailing and counting) since this is random. + +} + +finish () { + # Finish up. + # + # Takes two optional arguments. The first is the return value; the script + # will exit with that value and will dump the output file to stdout if the + # value is nonzero. If the second is nonempty, the output will be dumped + # regardless of the return value. + local ret=$1 + local out=$2 + db1 "=========================" + db1 "Mirror finished: $(date) ($ret)" + logit R "Run end; exiting $ret." 
+ [[ $ret -gt 0 || -n $out ]] && cat $outfile
+ exit $ret
+}
+
+filter () {
+ # Client-side file list filtering.
+ if [[ -n $FILTEREXP ]]; then
+ db4 filtering $1
+ sed --in-place=-prefilter -r -e "\,$FILTEREXP,d" $1
+ fi
+}
+
+hr_b () {
+ # Produce human-readable byte counts
+ # Yes, this has a bug at 1024EB
+ typeset -F2 out
+
+ if [[ $1 -lt 1024 ]]; then
+ echo ${1}B
+ return
+ fi
+
+ out=$(( $1 / 1024. ))
+ for unit in KB MB GB TB PB EB; do
+ (( $out < 1024 )) && break
+ out=$(( out / 1024. ))
+ done
+
+ echo ${out}${unit}
+}
+
+hr_s () {
+ # Produce human-readable second counts
+ typeset -F2 out=$1
+
+ if [[ $1 -lt 60 ]]; then
+ echo ${1}s
+ return
+ fi
+
+ out=$(( $1 / 60. ))
+ if [[ $out -lt 60 ]]; then
+ echo ${out}m
+ return
+ fi
+
+ out=$(( $out / 60. ))
+ echo ${out}h
+}
+
+parse_rsync_stats () {
+ # Parse some of the statistics that rsync gives us.
+ # Takes an rsync output log (stdout) as an argument.
+ # No return value, but will set several global variables:
+ # rsfilestransferred
+ # rsfilesize
+ # rstotalbytesreceived
+ # rstotalbytessent
+ # rsfilelistgentime
+ # rsfilelisttransfertime
+ # rstransferspeed
+ # rsspeedup
+ # These will all be unset if not present in the given log.
+ # + # Here's the full block of info that rsync provides: + # + # rsync[30399] (receiver) heap statistics: + # arena: 311296 (bytes from sbrk) + # ordblks: 2 (chunks not in use) + # smblks: 1 + # hblks: 2 (chunks from mmap) + # hblkhd: 532480 (bytes from mmap) + # allmem: 843776 (bytes from sbrk + mmap) + # usmblks: 0 + # fsmblks: 48 + # uordblks: 178272 (bytes used) + # fordblks: 133024 (bytes free) + # keepcost: 131200 (bytes in releasable chunk) + # + # rsync[30394] (generator) heap statistics: + # arena: 311296 (bytes from sbrk) + # ordblks: 2 (chunks not in use) + # smblks: 1 + # hblks: 2 (chunks from mmap) + # hblkhd: 532480 (bytes from mmap) + # allmem: 843776 (bytes from sbrk + mmap) + # usmblks: 0 + # fsmblks: 48 + # uordblks: 178208 (bytes used) + # fordblks: 133088 (bytes free) + # keepcost: 131200 (bytes in releasable chunk) + # + # Number of files: 11,140 (reg: 9,344, dir: 1,796) + # Number of created files: 1,329 (reg: 1,327, dir: 2) + # Number of deleted files: 0 + # Number of regular files transferred: 1,182 + # Total file size: 165,405,056,029 bytes + # Total transferred file size: 3,615,178,247 bytes + # Literal data: 3,229,943,512 bytes + # Matched data: 385,234,735 bytes + # File list size: 468,791 + # File list generation time: 0.217 seconds + # File list transfer time: 0.000 seconds + # Total bytes sent: 1,249,286 + # Total bytes received: 3,231,373,895 + # + # sent 1,249,286 bytes received 3,231,373,895 bytes 81,838,561.54 bytes/sec + # total size is 165,405,056,029 speedup is 51.17 + + local log=$1 + + # Number of regular files transferred: 1 + unset rsfilestransferred + rsfilestransferred=$(awk '/^Number of regular files transferred:/ {print $6; exit}' $log) + + # Total file size: 10,174,746 bytes + unset rsfilesize + rsfilesize=$(awk '/^Total file size: (.*) bytes/ {print $4; exit}' $log | sed -e 's/,//g') + + # Total bytes received: 2,425,728 + unset rstotalbytesreceived + rstotalbytesreceived=$(awk '/^Total bytes received: (.*)/ {print 
$4; exit}' $log | sed -e 's/,//g') + + # Total bytes sent: 384,602 + unset rstotalbytessent + rstotalbytessent=$(awk '/^Total bytes sent: (.*)/ {print $4; exit}' $log | sed -e 's/,//g') + + # File list generation time: 0.308 seconds + unset rsfilelistgentime + rsfilelistgentime=$(awk '/^File list generation time: (.*) seconds/ {print $5; exit}' $log) + + # File list transfer time: 0.000 seconds + unset rsfilelisttransfertime + rsfilelisttransfertime=$(awk '/^File list transfer time: (.*) seconds/ {print $5; exit}' $log) + + # sent 71 bytes received 2,425,728 bytes 156,503.16 bytes/sec + unset rstransferspeed + rstransferspeed=$(awk '/^sent .* bytes .* received .* bytes (.*) bytes\/sec$/ {print $7; exit}' $log \ + | sed -e 's/,//g') + + # total size is 10,174,746 speedup is 4.19 + unset rsspeedup + rsspeedup=$(awk '/^total size is .* speedup is (.*)$/ {print $7; exit}' $log) +} + +do_rsync () { + # The main function to do a transfer + # Accepts four options: + # 1) The source repository + # 2) The destination directory + # 3) The list of files + # 4) The name of an array containing additional rsync options + # + # This may sleep and retry when receiving specific errors. + # Returns the rsync return code (where 0 indicates full success, but other + # values may indicate a finished copy). + + local src=$1 dest=$2 files=$3 opts=$4 + local runcount=0 + local log=$(mktemp -p . rsync-out-XXXXXX.log) + local errlog=$(mktemp -p . rsync-err-XXXXXX.log) + local sleep rr rvbash rvzsh + local rsyncto="--timeout=$RSYNCTIMEOUT" + + local -a verboseopts flopts allopts + + # These add to the default rsync verbosity + (( VERBOSE >= 7 )) && verboseopts+=(--progress) + (( VERBOSE >= 5 )) && verboseopts+=(-v) + (( VERBOSE >= 4 )) && verboseopts+=(-v) + + # Usually we won't want to see this. 
+ (( VERBOSE <= 3 )) && verboseopts+=(--no-motd) + + flopts=("--files-from=$files") + allopts=($rsyncto $RSYNCOPTS $verboseopts $flopts ${(P)opts} $src $dest) + + while true; do + runcount=$(( runcount+1 )) + # ZSHISM: (P) flag to act on a variable by name. Sadly, bash has + # broken array handling. bash 4.3 has local -n for this. Older bash + # needs hacks, or eval. More info: + # https://stackoverflow.com/questions/1063347/passing-arrays-as-parameters-in-bash + # Or just use a freaking global. + + # We have to do this separately because you can't redirect to /dev/stderr when running under sudo. + # ZSHISM Teeing both stderr and stdout while keeping the return code is + # easy in zsh with multios but seems to be terribly difficult under bash. + db3 Calling $RSYNC $allopts + logit c calling $RSYNC $allopts + if (( VERBOSE >= 5 )); then + $RSYNC $allopts 1>&1 2>&2 >> $log 2>> $errlog + elif (( VERBOSE >= 2 )); then + $RSYNC $allopts >> $log 2>&2 2>> $errlog + else + $RSYNC $allopts >> $log 2>> $errlog + fi + rr=$? + + # Check return values + if (( rr == 0 )); then + logit C rsync call completed succesfully with return $rr + parse_rsync_stats $log + return 0 + + elif (( rr == 24 )); then + # 24: Partial transfer due to vanished source files + logit e "rsync says source files vanished." + return $rr + + elif (( rr == 5 || rr == 10 || rr == 23 || rr == 30 || rr == 35 )); then + # Most of these are retryable network issues + # 5: Error starting client-server protocol + # 10: Error in socket I/O + # 30: Timeout in data send/receive + # 35: Timeout waiting for daemon connection + # 23: Partial transfer due to error + # (could be a file list problem) + if [[ $rr -eq 23 && -f $errlog ]] ; then + # See if it we tried to tranfer files that don't exist + grep -q '^rsync: link_stat .* failed: No such file or directory (2)$' $errlog + if (( ? == 0 )); then + logit e "Looks like the file list is outdated." 
+ (>&2 echo "Looks like the file list is outdated.") + [[ -f $errlog ]] && (>&2 cat $errlog) + return $rr + fi + fi + + # It's not one of those special 23 errors, so we may retry. First + # see if we've already tried too many times. + if (( runcount >= MAXRETRIES )); then + logit E rsync from $REMOTE failed + (>&2 echo "Could not sync from $REMOTE") + [[ -f $errlog ]] && (>&2 cat $errlog) + return $rr + fi + + # Then sleep for a bit + sleep=$(( 2 ** runcount )) + logit e "rsync returned $rr (retryable), sleeping for $sleep" + db2 rsync failed: sleeping for $sleep + sleep $sleep + continue + fi + + # We only get here if we got a return we didn't expect + logit E "rsync returned $rr, which was not expected." + (>&2 echo "rsync returned $rr, which was not expected." + [[ -f $errlog ]] && cat $errlog + ) + return $rr + done +} + +usage () { + cat < 0 ]]; do + opt=$1 + case $opt in + -a | --alwayscheck) + alwayscheck=1 + ;; + -c | --config) + cfgfile=$2 + shift + if [[ ! -r $cfgfile ]]; then + (>&2 echo Cannot read $cfgfile) + exit 1 + fi + ;; + -d) # Debugging + verboseopt=$2 + shift + ;; + -h | --help) + usage + exit 1 + ;; + -n | --dry-run) + rsyncdryrun=1 + skipdelete=1 + skiptimestamp=1 + ;; + -N | --transfer-only) + skipdelete=1 + skiptimestamp=1 + ;; + -t ) + backdate=$2 + alwayscheck=1 + shift + ;; + -T | --backdate) + backdate=$(date -d "$2" +%s) + alwayscheck=1 + shift + ;; + --checkin-only) + skiptransfer=1 + skipdelete=1 + skiptimestamp=1 + forcecheckin=1 + ;; + --dir-times) + updatealldirtimes=1 + alwayscheck=1 + ;; + --refresh) + skipdelete=1 + skiptimestamp=1 + skipcheckin=1 + refreshpattern=$2 + shift + ;; + --dump-mm-checkin) + # Just for the test suite; dump the raw payload to the given + # filename with the module name appended. 
+ dumpmmcheckin=$2 + shift + ;; + --no-paranoia) + # Don't backdate the last mirrortime + noparanoia=1 + ;; + *) + (>&2 echo "Unrecognized argument.") + exit 1 + ;; + esac + shift + done +} + +read_config () { + # Load up the configuration file from any of a number of locations + local file + for file in \ + $cfgfile \ + /etc/quick-fedora-mirror.conf \ + ~/.config/quick-fedora-mirror.conf \ + $(dirname $0)/quick-fedora-mirror.conf \ + ./quick-fedora-mirror.conf; \ + do + if [[ -r $file ]]; then + source $file + cfgfile=$file + break + fi + done + + # Override some settings with previously parsed command-line options + [[ -n $verboseopt ]] && VERBOSE=$verboseopt + + # Check that the required parameters were provided + if [[ -z $DESTD ]]; then + (>&2 echo "You must define DESTD in your configuration file ($cfgfile).") + fi + if [[ -z $TIMEFILE ]]; then + (>&2 echo "You must define TIMEFILE in your configuration file ($cfgfile).") + fi + + # Set some other general variables based on the value of provided + # configuration settings + [[ -z $CHECKIN_SITE ]] && skipcheckin=1 + [[ -z $MAXCHECKINRETRIES ]] && MAXCHECKINRETRIES=$MAXRETRIES +} + +set_default_vars () { + # Set various defaults before the configuration file is loaded. 
+ + # Mapping from module names to directories under fedora-buffet + # ZSHISM (initialize associative array) + typeset -g -A MODULEMAPPING + typeset -g -A MIRRORMANAGERMAPPING + typeset -g -A MIRRORMANAGERMODULEMAPPING + + MODULEMAPPING=( + fedora-alt alt + fedora-archive archive + fedora-enchilada fedora + fedora-epel epel + fedora-secondary fedora-secondary + ) + + MIRRORMANAGERMAPPING=( + fedora-alt 'fedora other' + fedora-archive 'fedora archive' + fedora-enchilada 'fedora linux' + fedora-epel 'fedora epel' + fedora-secondary 'fedora secondary arches' + ) + + # Mirrormanager has a weird prefix for "fedora-enchilada", so copy the + # existing module mapping and alter it + MIRRORMANAGERMODULEMAPPING=(${(kv)MODULEMAPPING}) + MIRRORMANAGERMODULEMAPPING[fedora-enchilada]="fedora/linux" + + # Default arguments; override in quick-fedora-mirror.conf + VERBOSE=0 + LOGITEMS=aeElrR + + DESTD= + TIMEFILE= + + CHECKIN_HOST=$(hostname) + CURL=/usr/bin/curl + FILELIST='fullfiletimelist-$mdir' + EXTRAFILES=(fullfilelist imagelist-\$mdir) + MIRRORMANAGER=https://admin.fedoraproject.org/mirrormanager/xmlrpc + REMOTE=rsync://dl.fedoraproject.org + RSYNC=/usr/bin/rsync + RSYNCTIMEOUT=$((60 * 10)) + WARNDELAY=$((60 * 60 * 24)) + MAXRETRIES=10 + + rsyncver=$(rsync --version | head -1 | awk '{print $3}') + if [[ $rsyncver == 3.1.3 ]]; then + # 3.1.3 has broken support for --preallocate and -S (--sparse) together + RSYNCOPTS=(-aSH -f 'R .~tmp~' --stats --delay-updates --out-format='@ %i %10l %n%L') + elif [[ $rsyncver == 3.1* ]]; then + RSYNCOPTS=(-aSH -f 'R .~tmp~' --stats --preallocate --delay-updates --out-format='@ %i %10l %n%L') + else + RSYNCOPTS=(-aSH -f 'R .~tmp~' --stats --delay-updates --out-format='@ %i %10l %n%L') + fi + + MASTERMODULE=fedora-buffet + MODULES=(fedora-enchilada fedora-epel) +} + +check_file_list_version () { + # Look at the file list to see if we can handle it + # + # Takes the file list name. + # Returns 0 if we can handle it, 1 if we can't. 
local max_fl_version=3
+ local fl=$1
+
+ if [[ ! -f $fl ]]; then
+ (>&2 echo "Cannot check file list \"$fl\". Exiting.")
+ exit 1
+ fi
+
+ local flversion=$(awk -F '\t' '/^\[Version/ {s=1; next} /^$/ {exit} {if (s) print $0}' < $fl)
+ if [[ "$flversion" -le $max_fl_version ]]; then
+ return
+ fi
+
+ # Either it is too new or we just can't parse it, so quit.
+ (>&2 echo "File list from the mirror cannot be processed by this script. Exiting.")
+ exit 1
+}
+
+clean_all_transfer_temps () {
+ # Delete temporary transfer files, but not any log files.
+ # Be sure to add any extra generated temporaries here.
+ # XXX Is it OK that this doesn't delete the file lists? They will just get
+ # copied over.
+ rm -f *.old
+ for i in ${(v)MODULEMAPPING} alldirs allfiles allfilesizes changedpaths \
+ changes checksumfailed checksums deletedirs deletefiles flist \
+ localdirs localfiles localfilesizes localfulllist master missingdirs \
+ missingfiles newdirs newfiles staletmpdirs staletmpfiles \
+ transferlist updatedfiles updatetimestamps; do
+ rm -f $i-*
+ done
+}
+
+clean_stale_rsync_temps () {
+ # Clean up temporaries left over from a previous aborted rsync run.
+ local mod=$1
+
+ db2 Possibly aborted rsync run. Cleaning up.
+ logit a "cleaning up previous aborted run: $(wc -l < staletmpfiles-$mod) file(s)."
+
+ # Move the files in those tmpdirs a level up if a file with the
+ # same name doesn't exist. We don't update the file lists because
+ # we want rsync to re-check those files and possibly fix up the
+ # permissions. The dirs will be cleaned up later.
+ # Note that this _may_ leave a few files around which should not be
+ # there. They will of course be cleaned up at the next run.
+ # XXX We could do better by comparing the stale files against the
+ # to-be-transferred list, but it's probably not worth it.
+ for dir in $(cat staletmpdirs-$mod); do
+ pushd $DESTD/$dir
+ for file in *; do
+ if [[ ! -f ../$file ]]; then
+ logit A Saving previous download $file
+ db3 Saving previous download: $file
+ mv $file ..
+ fi
+ done
+ popd
+ done
+}
+
+fetch_file_lists () {
+ # Download the file list for each configured module
+ # Will set the global variable "checksums" containing the checksum of the
+ # file list of each module that exists on the client at the beginning of the transfer.
+
+ local extra flname module rsyncreturn
+
+ sep
+ logit o Remote file list download start
+ db2 Downloading file lists
+ # ZSHISM (declare associative array)
+ typeset -g -A checksums
+ checksums=()
+ for module in $MODULES; do
+ # ZSHISM? (associative array indexing)
+ moduledir=$MODULEMAPPING[$module]
+ mkdir $moduledir
+ flname=${FILELIST/'$mdir'/$moduledir}
+ if [[ -f $DESTD/$moduledir/$flname ]]; then
+ cp -p $DESTD/$moduledir/$flname $moduledir
+ ln $moduledir/$flname $moduledir/$flname.old
+ # ZSHISM (assign assoc. array value)
+ checksums[$module]=$(sha1sum $DESTD/$moduledir/$flname | cut -d' ' -f1)
+ fi
+
+ echo $moduledir/$flname >> filelist-transferlist
+ done
+
+ extra=(--no-dirs --relative --compress)
+ do_rsync $REMOTE/$MASTERMODULE/ . filelist-transferlist extra
+ rsyncreturn=$?
+ if [[ $rsyncreturn -ne 0 ]]; then
+ (>&2 echo "rsync finished with nonzero exit status.\nCould not retrieve file lists.")
+ logit E Aborting due to rsync failure while retrieving file lists
+ finish 1
+ fi
+
+ # Log very basic stats
+ logit s "File list download: $(hr_b $rstotalbytesreceived) received, $(hr_b $rstransferspeed)/s"
+
+ # Check that we can handle the downloaded lists
+ for module in $MODULES; do
+ moduledir=$MODULEMAPPING[$module]
+ flname=${FILELIST/'$mdir'/$moduledir}
+ check_file_list_version $moduledir/$flname
+ done
+
+ # rsync won't transfer those files to the current directory, so move them and
+ # clean up.
+ mv */* .
+ rmdir * 2> /dev/null + logit o Remote file list download: end +} + +checkin_build_inner_payload () { + # Build the inner json payload + # Takes the module name and the name of the output file to use + local module=$1 + local mm=$2 + local checkinhost=$3 + + local moduledir=$MIRRORMANAGERMODULEMAPPING[$module] + local mmcheckin=$MIRRORMANAGERMAPPING[$module] + + cat >$mm <>$mm + done + + # The data sent by report_mirror always includes a blank directory; add it + # manually here which conveniently means we don't have to deal with the + # trailing comma. And after that, the various parameters mirrormanager + # wants. + cat >>$mm < $in.bz2.b64 + + # change '+' to '-' and '/' to '_' + tr '+/' '-_' < $in.bz2.b64 > $out + + rm $in.bz2 $in.bz2.b64 +} + +checkin_build_outer_payload () { + # Wrap the encoded payload in just the right xml + # Takes input and output filenames + + local in=$1 + local out=$2 + + cat >>$out < + +checkin + + +EOF + echo -n "" >>$out + + cat <$in >>$out + + cat >>$out < + + + +EOF +} + +checkin_upload_payload () { + # Now actually upload the payload + # We have to remove the Expect: header that curl sends but which mirrormanager cannot handle + local payload=$1 + local module=$2 + local -a curlopts + local curlret + + logit M "Making xmlrpc call for $module" + curlopts=(--silent) + curl --help | grep -q http1\.1 + (( ? == 0 )) && curlopts+=(--http1.1) + (( VERBOSE >= 4 )) && curlopts=(--verbose) + db3 "$CURL $curlopts -H \"Expect:\" -H \"Content-Type: text/xml\" --data @$mx $MIRRORMANAGER" + $CURL $curlopts -H "Expect:" -H "Content-Type: text/xml" --data @$mx $MIRRORMANAGER > curl.out + curlret=$? + if [[ $curlret -ne 0 ]]; then + logit e "Checkin failure: curl returned $curlret" + (>&2 echo "Checkin failure: curl returned $curlret") + return 2 + fi + + # Parse the output to see if we got any useful return + # The sed call attempts to strip xml tags. Easily fooled but we don't expect + # any complicated return from mirrormanager. 
+ sed -e 's/<[^>]*>//g' curl.out > curl.noxml + grep -q -i successful curl.noxml + + if [[ $? -ne 0 ]]; then + db1 "Mirrormanager checkin for $module did not appear to succeed." + logit e "Doesn't look like we got a good return from mirrormanager." + logit e $(cat curl.noxml) + return 1 + fi + return 0 +} + +checkin_module () { + # Perform the mirrormanager checkin for a particular module + local module=$1 + + local mm=mirrormanager-payload-$module + local mx=mirrormanager-xmlrpc-$module + local moduledir=$MODULEMAPPING[$module] + + if [[ ! -f alldirs-$module ]]; then + # We were asked to check in a module that we hadn't previously + # processed, which should not happen. + logit E "Cannot perform checkin for $module; no directory list exists." + return + fi + + # Determine the "mirrormanager hostname" to use for this checkin. + # Different modules can be set up under different "hosts" in mirrormanager, + # even though these might all be on the same machine. This works around + # problems mirrormanager has when crawling machines which mirror + # everything. + # ZSHISM: This uses "(P)"; the equivalent in bash is "!". + local checkinhost=$CHECKIN_HOST + local hostspecificvar=CHECKIN_HOST_${module//-/_} + if [[ -n ${(P)hostspecificvar} ]]; then + checkinhost=${(P)hostspecificvar} + fi + + db3 "Performing mirrormanager checkin for $module (in $moduledir) as $checkinhost" + logit M "Processing $module (in $moduledir) as $checkinhost" + + # Construct the checkin payload + checkin_build_inner_payload $module $mm $checkinhost + checkin_encode_inner_payload $mm $mm.enc + checkin_build_outer_payload $mm.enc $mx + + # For the test suite, just dump the checkin info and bail + if [[ -n $dumpmmcheckin ]]; then + cat $mx > $dumpmmcheckin-$module + return + fi + + # Try to check in until we've retried too often. + local retries=1 + while true; do + checkin_upload_payload $mx $module + + if [[ $? 
-eq 0 ]]; then + break + fi + + if (( retries >= MAXRETRIES )); then + logit E "Could not complete checkin after $MAXCHECKINRETRIES tries." + break + fi + + logit e "Checkin attempt $retries failed. Will retry." + retries=$(( retries +1 )) + sleep $(( 2*retries )) + done + + logit M "Processing $module: end" +} + +awk_extract_file_list () { + local inf=$1 + local outf=$inf.flist + [[ -n $2 ]] && outf=$2 + + awk ' \ + /^\[Files/ {s=1;next} + /^$/ {if (s==1) exit} + s {print}' \ + < $inf > $outf + retcheck $? awk +} + +awk_extract_paths_from_file_list_restricted () { + local inf=$1 + local outf=$2 + local mdir=$3 + + # We can just ignore the type and permissions completely + awk -F '\t' "{print \"$mdir/\" \$4}" < $inf > $outf + retcheck $? awk +} + +awk_extract_paths_from_file_list_norestricted () { + local inf=$1 + local outf=$2 + local mdir=$3 + + awk -F '\t' " \ + { if (\$2 == \"d\" || \$2 == \"f\" || \$2 == \"l\") \ + print \"$mdir/\" \$4 \ + }" < $inf > $outf + retcheck $? awk +} + +awk_extract_newer_dirs_restricted () { + local inf=$1 + local outf=$2 + local mdir=$3 + + local last=0 + [[ -n $4 ]] && last=$4 + + awk -F '\t' " \ + /\\[Files/ {s=1;next} + /^\$/ {s=0;next} + { if (s && \$1 >= $last \ + && (\$2 == \"d\" || \$2 == \"d-\" || \$2 == \"d*\")) \ + print \"$mdir/\" \$4 \ + }" \ + < $inf > $outf + retcheck $? awk +} + +awk_extract_newer_dirs_no_restricted () { + local inf=$1 + local outf=$2 + local mdir=$3 + + local last=0 + [[ -n $4 ]] && last=$4 + + awk -F '\t' " \ + /\\[Files/ {s=1;next} \ + /^\$/ {s=0;next} \ + { if (s && \$1 >= $last \ + && (\$2 == \"d\")) \ + print \"$mdir/\" \$4 \ + }" \ + < $inf > $outf + retcheck $? 
awk +} + +awk_extract_newer_files_restricted () { + local inf=$1 + local outf=$2 + local mdir=$3 + + local last=0 + [[ -n $4 ]] && last=$4 + + awk -F '\t' "/\\[Files/ {s=1;next} \ + /^\$/ {s=0;next} \ + {if (s && \$1 >= $last && \ + (\$2 == \"f\" || \$2 == \"f-\" || \$2 == \"f*\" \ + || \$2 == \"l\" || \$2 == \"l-\" || \$2 == \"l*\" \ + )) \ + print \"$mdir/\" \$4 \"\t\" \$3 \ + } \ + " $inf > $outf + retcheck $? awk +} + +awk_extract_newer_files_no_restricted () { + local inf=$1 + local outf=$2 + local mdir=$3 + + local last=0 + [[ -n $4 ]] && last=$4 + + awk -F '\t' "/\\[Files/ {s=1;next} \ + /^\$/ {s=0;next} \ + {if (s && \$1 >= $last && \ + (\$2 == \"f\" \ + || \$2 == \"l\" \ + )) \ + print \"$mdir/\" \$4 \"\t\" \$3 \ + } \ + " $inf > $outf + retcheck $? awk +} + +process_file_list_diff () { + # Extract and then diff the old and new file lists for a module + # Creates changedfiles-$module file + + local fl=$1 + local mod=$2 + local mdir=$3 + + local oldflist=flist-old-$mod + local newflist=flist-new-$mod + + logit l "Generating database diff start: $mod" + + # Extract the file list part of old and new file lists. + awk_extract_file_list $fl.old flist-old-$mod + awk_extract_file_list $fl flist-new-$mod + + # sort each by path + sort -t$'\t' -k4 $oldflist > $oldflist.sorted + sort -t$'\t' -k4 $newflist > $newflist.sorted + + # compute the changes + diff --changed-group-format='%>' --unchanged-group-format='' $oldflist.sorted $newflist.sorted > changes-$mod + + # Extract path from changes + if [[ -n $PREBITFLIP ]]; then + awk_extract_paths_from_file_list_restricted changes-$mod changedpaths-$mod $mdir + else + awk_extract_paths_from_file_list_norestricted changes-$mod changedpaths-$mod $mdir + fi + + # We must filter here so that files we don't want to transfer won't appear + # to have changed. 
+ filter changedpaths-$mod + + logit l "Generating database diff end: $mod" +} + +compute_file_list_stats () { + # Calculate and log counts of the various generated lists + local mod=$1 + local -a stats + stats=(allfiles alldirs newfiles newdirs changedpaths localfiles \ + localdirs deletefiles deletedirs missingfiles missingdirs \ + updatedfiles updatetimestamps checksumfailed) + + for i in stats; do + counts[$i]=0 + [[ -f $i-$mod ]] && counts[$i]=$(wc -l < $i-$mod) + done + + counts[totaltransfer]=$(wc -l transferlist-$mod) + + # Until the rest of the code is fixed up + counts[extrafiles]=$counts[deletefiles] + counts[extradirs]=$counts[deletedirs] + counts[sizechanged]=$counts[updatedfiles] + counts[allserverfiles]=$counts[allfiles] + counts[allserverdirs]=$counts[alldirs] + counts[newserverfiles]=$counts[newfiles] + counts[newserverdirs]=$counts[newdirs] + + # Previously these two were printed before generating the local file lists + db2f "Total on server: %7d files, %4d dirs.\n" $cntallserverfiles $cntallserverdirs + db2f "New on server: %7d files, %4d dirs.\n" $cntnewserverfiles $cntnewserverdirs + + db2f "Total on client: %7d files, %4d dirs.\n" $counts[localfiles $counts[localdirs] + db2f "Not present on server: %7d files, %4d dirs.\n" $counts[extrafiles] $counts[extradirs] + db2f "Missing on client: %7d files, %4d dirs.\n" $counts[missingfiles] $counts[missingdirs] + db2f "Size Changed: %7d files.\n" $counts[sizechanged] + db2f "Timestamps to restore: %7d files.\n" $counts[updatetimestamps] + db2f "Checksum Failed: %7d files.\n" $counts[checksumfailed] + db2f "Filelist changes: %7d paths.\n" $counts[changedpaths] + db2f "Total to transfer: %7d paths.\n" $counts[totaltransfer] + + logit L "Counts for $mod: Svr:$counts[allserverfiles]/$counts[allserverdirs] Loc:$counts[localfiles]/$counts[localdirs] Diff:$counts[changedpaths] New:$counts[newserverfiles]/$counts[newserverdirs] Xtra:$counts[extrafiles]/$counts[extradirs] 
Miss:$counts[missingfiles]/$counts[missingdirs] Size:$counts[sizechanged] Csum:$counts[checksumfailed] Dtim:$counts[updatetimestamps]" + +} + +generate_local_file_list () { + # Generate lists of what the client has. + local mod=$1 + local mdir=$2 + + db3 Generating local file/dir list + logit l "Generating local file list start: $mod" + + # Traverse the filesystem only once + pushd $DESTD + find $mdir/* -printf '%y\t%p\t%s\n' > $tempd/localfulllist-$mod + popd + + # Now extract file and dir lists from that + awk -F '\t' '{if ($1 == "d") {print $2}}' < localfulllist-$mod > localdirs-$mod + awk -F '\t' '{if ($1 == "f" || $1 == "l") {print $2}}' < localfulllist-$mod > localfiles-$mod + awk -F '\t' '{if ($1 == "f" || $1 == "l") {print $2 "\t" $3}}' < localfulllist-$mod > localfilesizes-$mod + + # Look for stray .~tmp~ dirs + if [[ -z $NORSYNCRECOVERY ]]; then + grep '\.~tmp~' localdirs-$mod > staletmpdirs-$mod + grep '\.~tmp~' localfiles-$mod > staletmpfiles-$mod + fi + + logit l "Generating local file list end: $mod" +} + +process_local_file_list () { + # Compare what the client has to what the server has, and generate more + # lists based on that. + # Generates the fillowing file lists: + # deletefiles-$module + # deletedirs-$module + # updatetimestamps-$module + # missingfiles-$module + # missingdirs-$module + # updatedfiles-$module + # checksumfailed-$module + + # XXX Don't do any master transferlist manipulation here. + local mod=$1 + local mdir=$2 + + # Find files on the client which don't exist on the server + sort allfiles-$mod allfiles-$mod localfiles-$mod \ + | uniq -u > deletefiles-$mod + remove_filelists_from_file deletefiles-$mod $mdir + + # Find dirs on the client which don't exist on the server + sort alldirs-$mod alldirs-$mod localdirs-$mod \ + | uniq -u > deletedirs-$mod + + # Extract dirnames of every file and dir in the delete lists, and all of their parents. 
+ if [[ -n $updatealldirtimes ]]; then + echo $mdir > updatetimestamps-$mod + cat alldirs-$mod >> updatetimestamps-$mod + else + awk '{dn($0)} function dn(p) { while (sub(/\/[^\/]*\]?$/, "", p)) print p }' \ + deletefiles-$mod deletedirs-$mod \ + | sort -u > updatetimestamps-$mod + fi + + # Find files on the server which are missing on the client + sort localfiles-$mod localfiles-$mod allfiles-$mod \ + | uniq -u > missingfiles-$mod + + # Find dirs on the server which are missing on the client + sort localdirs-$mod localdirs-$mod alldirs-$mod \ + | uniq -u > missingdirs-$mod + + # Find files which have changed size + sort allfilesizes-$mod localfilesizes-$mod \ + | uniq -u | awk -F '\t' '{print $1}' \ + | uniq -d > updatedfiles-$mod + + # Extract and verify checksums + awk -F '\t' "/^\[Checksums/ {s=1; next} /^$/ {s=0; next} {if (s) print \$1 \" $mdir/\" \$2}" $fl > checksums-$mod + pushd $DESTD > /dev/null 2>&1 + sha1sum --check --quiet $tempd/checksums-$mod 2> /dev/null \ + | grep -i 'failed$' \ + | awk -F: '{print $1}' > $tempd/checksumfailed-$mod + popd > /dev/null 2>&1 +} + +process_remote_file_list () { + # Extract various file and directory lists from the master file list + # + # This will also handle ignoring restricted or pre-bitflip content if + # necessary. + # + # Will create the following files: + # allfilesizes-$module + # allfiles-$module + # alldirs-$module + # newdirs-$module + + local fl=$1 + local module=$2 + local moduledir=$3 + + db3 Extracting file and directory lists for $module. 
+ + if [[ -n $PREBITFLIP ]]; then + db4 "Directories (pre-bitflip included)" + awk_extract_newer_dirs_restricted $fl alldirs-$module $moduledir + + db4 "New dirs (pre-bitflip included)" + awk_extract_newer_dirs_restricted $fl newdirs-$module $moduledir $LASTTIME + + db4 "Files (pre-bitflip included)" + awk_extract_newer_files_restricted $fl allfilesizes-$module $moduledir + + db4 "New files (pre-bitflip included)" + awk_extract_newer_files_restricted $fl newfilesizes-$module $moduledir $LASTTIME + else + # All dirs, unrestricted only + db4 "Directories (pre-bitflip excluded)" + awk_extract_newer_dirs_no_restricted $fl alldirs-$module $moduledir + + db4 "New dirs (pre-bitflip excluded)" + awk_extract_newer_dirs_no_restricted $fl newdirs-$module $moduledir $LASTTIME + + db4 "Files (pre-bitflip excluded)" + awk_extract_newer_files_no_restricted $fl allfilesizes-$module $moduledir + + db4 "New files (pre-bitflip excluded)" + awk_extract_newer_files_no_restricted $fl newfilesizes-$module $moduledir $LASTTIME + fi + + # Filter the lists if needed + filter alldirs-$module + filter newdirs-$module + filter allfilesizes-$module + filter newfilesizes-$module + + # Produce the file lists without sizes. + awk -F '\t' '{print $1}' allfilesizes-$module > allfiles-$module; retcheck $? awk + awk -F '\t' '{print $1}' newfilesizes-$module > newfiles-$module; retcheck $? awk +} + +update_master_file_lists () { + # Simply append various per-module lists to the master lists + cat deletefiles-$module >> master-deletefiles + cat deletedirs-$module >> master-deletedirs + cat updatetimestamps-$module >> master-updatetimestamps + cat missingfiles-$module >> transferlist-$module + cat missingdirs-$module >> transferlist-$module + cat updatedfiles-$module >> transferlist-$module + cat checksumfailed-$module >> transferlist-$module +} + +remove_filelists_from_file () { + # Remove the file from $FILELIST and anything given by $EXTRAFILES. 
+ # Takes: + # file to modify + # directory of current module (for substituting $mdir) + # Modifies the file directly + # Calls egrep -v in a loop. Generally this is called on files of no more + # than a few thousand lines, so performance shouldn't be an issue. + + local f=$1 + local moduledir=$2 + local tmp=$f.rfff + local fl + + for fl in $FILELIST $EXTRAFILES; do + fl=${fl/'$mdir'/$moduledir} + egrep -v "^[^/]*/$fl" $f > $tmp + mv $tmp $f + done + + rm -f $tmp +} + +process_module () { + # Determine what needs to be transferred and removed from a single module. + # + # Takes the name of the module to process, returns nothing. + # + # Sets the following globals: + # changed_modules + # + # Will leave the following lists in the temporary dir for use by other + # functions: (all of them; currently deletes nothing) + # + # May leave other files, but don't depend on them. + # + # The various status variables, for logging: + # cntallserverfiles/cntallserverdirs - total files/dirs on server. + # cntnewserverfiles/cntnewserverdirs - new files/dirs on server (since last mirror time) + # cntlocalfiles/cntlocaldirs - total files/dirs on client. + # cntextrafiles/cntextradirs - files/dirs on client but not server. + # cntmissingfiles/cntmissingdirs - files/dirs on server but not client. + # cntsizechanged - files where size differs between server/client. + # cntupdatetimestamps - dir timestamps to restore + # cntchecksumfailed - files where checksum differs between server/client. + # cntchangedpaths - count of all differences between file lists. + + local module=$1 + # ZSHISM? 
(associative array indexing) + local moduledir=$MODULEMAPPING[$module] + + local fl=${FILELIST/'$mdir'/$moduledir} + local cntallserverfiles cntallserverdirs cntnewserverfiles cntnewserverdirs + local cntchangedpaths cntlocalfiles cntlocaldirs cntextrafiles cntextradirsi + local cntmissingfiles cntmissingdirs cntsizechanged cntupdatetimestamps cntchecksumfailed + local extra + + if [[ -z $alwayscheck && \ + -n $checksums[$module] && \ + $(sha1sum $fl | cut -d' ' -f1) == $checksums[$module] ]]; then + logit N No change in file list for $module + db2 No change in file list checksum. Skipping $module. + continue + fi + + sep + logit P Processing start: $module + db2 Processing $module + changed_modules+=$module + + # Make sure the list is complete. + tail -2 $fl | grep -q '^\[End\]$' + if (( ? != 0 )); then + logit e "Invalid file list; skipping $module" + (>&2 echo "No end marker. Corrupted file list?" + echo Skipping $module.) + return + fi + + process_remote_file_list $fl $module $moduledir + + cntallserverfiles=$(wc -l < allfiles-$module) + cntallserverdirs=$(wc -l < alldirs-$module) + db2f "Total on server: %7d files, %4d dirs.\n" $cntallserverfiles $cntallserverdirs + + cntnewserverfiles=$(wc -l < newfiles-$module) + cntnewserverdirs=$(wc -l < newdirs-$module) + db2f "New on server: %7d files, %4d dirs.\n" $cntnewserverfiles $cntnewserverdirs + + # Add extra files to the transfer list + echo $moduledir/$fl >> newfiles-$module + for extra in $EXTRAFILES; do + extra=${extra/'$mdir'/$moduledir} + echo $moduledir/$extra >> newfiles-$module + done + cat newfiles-$module >> transferlist-$module + cat newdirs-$module >> transferlist-$module + + if [[ -d $DESTD/$moduledir ]]; then + db3 Finding file list changes since last run + process_file_list_diff $fl $module $moduledir + cat changedpaths-$module >> transferlist-$module + + generate_local_file_list $module $moduledir + + if [[ -s staletmpdirs-$module ]]; then + clean_stale_rsync_temps $module + fi + + # Find files 
on the client which don't exist on the server + process_local_file_list $module $moduledir + update_master_file_lists $module + + # Count some things we want to use for stats later. + cntchangedpaths=$(wc -l < changedpaths-$module) + cntlocalfiles=$(wc -l < localfiles-$module) + cntlocaldirs=$(wc -l < localdirs-$module) + cntextrafiles=$(wc -l < deletefiles-$module) + cntextradirs=$(wc -l < deletedirs-$module) + cntmissingfiles=$(wc -l < missingfiles-$module) + cntmissingdirs=$(wc -l < missingdirs-$module) + cntsizechanged=$(wc -l < updatedfiles-$module) + cntupdatetimestamps=$(wc -l < updatetimestamps-$module) + cntchecksumfailed=$(wc -l < checksumfailed-$module) + + db2f "Total on client: %7d files, %4d dirs.\n" $cntlocalfiles $cntlocaldirs + db2f "Not present on server: %7d files, %4d dirs.\n" $cntextrafiles $cntextradirs + db2f "Missing on client: %7d files, %4d dirs.\n" $cntmissingfiles $cntmissingdirs + db2f "Size Changed: %7d files.\n" $cntsizechanged + db2f "Timestamps to restore: %7d files.\n" $cntupdatetimestamps + db2f "Checksum Failed: %7d files.\n" $cntchecksumfailed + db2f "Filelist changes: %7d paths.\n" $cntchangedpaths + fi + + sort -u transferlist-$module >> transferlist-sorted-$module + cat transferlist-sorted-$module >> master-transferlist + local cnttotaltransfer=$(wc -l < transferlist-sorted-$module) + db2f "Total to transfer: %7d paths.\n" $cnttotaltransfer + + logit L "Counts for $module: Svr:$cntallserverfiles/$cntallserverdirs Loc:$cntlocalfiles/$cntlocaldirs Diff:$cntchangedpaths New:$cntnewserverfiles/$cntnewserverdirs Xtra:$cntextrafiles/$cntextradirs Miss:$cntmissingfiles/$cntmissingdirs Size:$cntsizechanged Csum:$cntchecksumfailed Dtim:$cntupdatetimestamps" + logit P Processing end: $module + db2 Finished processing $module. + + # Some basic info about the transfer. 
+ db1 Changes in $module: $cnttotaltransfer files/dirs + if (( cnttotaltransfer <= 5 )); then + for i in $(cat transferlist-sorted-$module); do + db1 " $i" + done + fi + + # XXX We should clean some things up at this point, but we also need some + # files for the checkin later. + # Should be able to delete all *-$module, except for the dirlists, to give + # the current mirrormanager versions the things it needs. + #if (( VERBOSE <= 4 )); then + # rm *-$module + #fi +} + + +# Main program execution +# ====================== +parse_args "$@" +set_default_vars +read_config +# XXX check_dependencies + +# Paranoia; give us a few extra seconds. +[[ -z $noparanoia ]] && starttime=$(($starttime-5)) + +# Find the previous mirror time, and backdate if necessary +LASTTIME=0 +if [[ -r $TIMEFILE ]]; then + source $TIMEFILE +fi +if [[ -n $backdate ]]; then + LASTTIME=$backdate +fi + +# Make a temp dir and clean it up unless we're doing a lot of debugging +if [[ -z $TMPDIR ]]; then + tempd=$(mktemp -d -t quick-mirror.XXXXXXXXXX) +else + tempd=$(mktemp -d -p $TMPDIR -t quick-mirror.XXXXXXXXXX) +fi + +if [[ $? -ne 0 ]]; then + (>&2 echo "Creating temporary directory failed?") + exit 1 +fi +if (( VERBOSE <= 8 )); then + trap "rm -rf $tempd" EXIT +fi + +# Set up a FIFO for logging. Just calling systemd-cat repeatedly just gives us +# a different PID every time, which is annoying. +if [[ -n $LOGJOURNAL ]]; then + logfifo=$tempd/journal.fifo + mkfifo $logfifo + systemd-cat -t quick-fedora-mirror < $logfifo & + exec 3>$logfifo +fi + +outfile=$tempd/output +touch $outfile + +cd $tempd + +# At this point we can acquire the lock +lock $TIMEFILE +if (( ? != 0 )); then + db4 Could not acquire lock. + logit k lock contention + # Maybe we haven't been able to mirror for some time.... + delay=$(( starttime - LASTTIME )) + if [[ -n $backdate || $LASTTIME -eq 0 ]]; then + delay=0 + fi + + if (( delay > WARNDELAY )); then + (>&2 echo No completed run since $(date -d @$LASTTIME ).) 
+ logit E No completed run since $(date -d @$LASTTIME ). + fi + exit 1 +fi + +db1 "Mirror starting: $(date)" +logit r Run start: cfg $cfgfile, tmp $tempd + +if (( VERBOSE >= 6 )); then + echo Times: + echo LASTTIME=$LASTTIME + echo starttime=$starttime + echo TIMEFILE=$TIMEFILE + echo Dirs: + echo tempd=$tempd + echo DESTD=$DESTD + echo Rsync: + echo REMOTE=$REMOTE + echo MASTERMODULE=$MASTERMODULE + echo RSYNC=$RSYNC + echo RSYNCOPTS=$RSYNCOPTS + echo Modules: + echo MODULES=$MODULES + echo MODULEMAPPING=$MODULEMAPPING + echo Misc: + echo VERBOSE=$VERBOSE +fi + +(( VERBOSE >= 8 )) && set -x + +if [[ -n $MIRRORBUFFET ]]; then + # We want to mirror everything, so save the admin from listing the + # individual modules. + # ZSHISM (get keys from an associative array with (k)) + MODULES=(${(k)MODULEMAPPING}) + # BASHEQ MODULES=${!MODULEMAPPING[@]} + # bash3 equivalent is terrible +fi + +fetch_file_lists + +logit p Processing start +changed_modules=() +for module in $MODULES; do + process_module $module +done + +if [[ ! -e master-transferlist ]]; then + logit n No changes to synchronize + db2 No changed files. + finish 0 +fi + +if [[ -n $MIRRORBUFFET ]]; then + echo DIRECTORY_SIZES.txt >> master-transferlist +fi + +# The actual transfer +# =================== +sort -u master-transferlist > master-transferlist.sorted +linecount=$(wc -l < master-transferlist.sorted) +sep; sep +db2 Transferring $linecount files. +# XXX send total count to log as well + +# Now we have a list of everything which has changed recently in every module +# we want, pass that to rsync (non recursive mode!) and it should transfer just +# the changed files without having to pull the entire huge file list. +extra=() +if [[ -n $rsyncdryrun ]]; then + extra+=(-n) +fi +do_rsync $REMOTE/$MASTERMODULE/ $DESTD master-transferlist.sorted extra +if (( ? != 0 )); then + (>&2 echo "rsync failed; aborting run.\nWill not check in or delete anything.") + logit "E Skipping further operations due to rsync failure." 
+ finish 1 +fi + +# Total downloaded file count, bytes received, transfer speed +logit s "stat: downloaded $rsfilestransferred files" +logit s "stat: received $(hr_b $rstotalbytesreceived)" +logit s "stat: transfer speed $(hr_b $rstransferspeed)/s" + +# Everything we can extract from rsync +logit S "stat: sent $(hr_b $rstotalbytessent)" +logit S "stat: speedup: $rsspeedup" +logit S "stat: total size of transferred files: $(hr_b $rsfilesize)" +logit S "stat: file list gen time $(hr_s $rsfilelistgentime)" +logit S "stat: file list transfer time $(hr_s $rsfilelisttransfertime)" + +db1 "=========================" +db1 "Main transfer statistics:" +db1 " Downloaded files: $rsfilestransferred" +db1 " Total size of those files: $(hr_b $rsfilesize)" +db1 " Received: $(hr_b $rstotalbytesreceived)" +db1 " Sent: $(hr_b $rstotalbytessent)" +db1 " Speedup: $rsspeedup" +db1 " Trasfer speed: $(hr_b $rstransferspeed)/s" +db1 " File list generation time: $(hr_s $rsfilelistgentime)" +db1 " File list transfer time: $(hr_s $rsfilelisttransfertime)" + +# Local dir/file deletion +# ======================= +if [[ -s master-deletedirs ]]; then + linecount=$(wc -l < master-deletedirs) + + if [[ -n $skipdelete && $VERBOSE -ge 2 ]]; then + logit d Directory deletion skipped + echo "Not deleting $linecount directories. Delete list is:" + cat master-deletedirs + echo + else + logit d Directory deletion start: $linecount directories + db2 Removing $linecount stale directories. + for nuke in $(cat master-deletedirs); do + if [[ -d "$DESTD/$nuke" ]]; then + logit D Deleting directory $nuke + db4 Removing $nuke + rm -rf "$DESTD/$nuke" + deletedsomething=1 + fi + done + logit d Directory deletion end + fi +else + db2 No stale directories to delete. +fi + +if [[ -s master-deletefiles ]]; then + linecount=$(wc -l < master-deletefiles) + + if [[ -n $skipdelete ]]; then + logit d File deletion skipped + echo Not deleting $linecount stale files. 
Delete list is: + cat master-deletefiles + echo + else + logit d File deletion begin: $linecount files + db2 Removing $linecount stale files. + # xopts=() + # (( VERBOSE >= 4 )) && xopts=(-t) + tr '\n' '\0' < master-deletefiles \ + | (pushd $DESTD; xargs $xopts -0 rm -f ; popd) + # for nuke in $(cat master-deletefiles); do + # logit D Deleting file $nuke + # rm -f "$DESTD/$nuke" + # done + deletedsomething=1 + logit d File deletion end + fi +else + db2 No stale files to delete. +fi + +if [[ ( -n $KEEPDIRTIMES || -n $updatealldirtimes ) && -s master-updatetimestamps ]]; then + extra=() + if [[ -n $rsyncdryrun ]]; then + extra+=(-n) + fi + logit d "Updating timestamps on $(wc -l < master-updatetimestamps) dirs" + do_rsync $REMOTE/$MASTERMODULE/ $DESTD master-updatetimestamps extra +fi + +# We've completed a run, so save the timestamp +save_state + +# Mirrormanager Checkin and Callout +# ================================= +# At this point we know that we had a clean run with no complaints from rsync, +# and as far as we're concerned the run is now complete and recorded. +# +# So for each module we mirrored, the filtered file list is correct. This +# means that the alldirs-$module file is accurate and we can simply report its +# contents to mirrormanager. +if [[ -z $skipcheckin || -n $dumpmmcheckin ]]; then + db2 Performing mirrormanager checkin + logit m "mirrormanager checkin start" + + # Check in just the changed modules + for module in $changed_modules; do + checkin_module $module + done + + logit m "mirrormanager checkin end" +fi +finish 0 yes diff --git a/quick-fedora-mirror/quick-fedora-mirror.conf b/quick-fedora-mirror/quick-fedora-mirror.conf new file mode 100644 index 0000000..3b6827f --- /dev/null +++ b/quick-fedora-mirror/quick-fedora-mirror.conf @@ -0,0 +1,196 @@ +# Configuration file for quick-fedora-mirror +# This file is sourced by the shell and must be in valid sh syntax. 
+ +#### Required settings +# Required: The directory holding your copy of all of the modules you +# mirror. Does not include any module name +DESTD=/mirror/root/fedora + +# Required: The file in which to store the last mirror time. +# Note: this really should not be in the repository itself. +TIMEFILE=/home/mirror/quick-fedora-mirror/timefile + +# Other settings +# The remote host to rsync from, not including a module name +REMOTE=rsync://mirrors.rit.edu + +# The master module, which holds the other modules +# Note that the mirror you pull from MUST have this master module. You cannot +# use quick-fedora-mirror to pull from a host which does not have a master +# module containing the other modules. +# MASTERMODULE=fedora-buffet + +# Tier 1 Fedora mirrors should uncomment the following to get the proper +# pre-bitflip content. +# MASTERMODULE=fedora-buffet0 +# PREBITFLIP=1 + +# Define if the entire repository (all modules under fedora-buffet) should be +# mirrored. If set, MODULES (below) is ignored +# MIRRORBUFFET= + +# An array containing the modules to be mirrored +# MODULES=(fedora-enchilada fedora-epel) +MODULES=(fedora fedora-epel) + +# The name of the file holding the file list on the mirror host. Note: the +# string '$mdir' will be replaced with the name of the current module directory +# in context, and so the '$' must be escaped or the string quoted. +# FILELIST='fullfiletimelist-$mdir' + +# An array of extra file lists to be transferred. They won't be processed, but +# will implicitly be included in every transfer because file lists can't be +# included in the file lists. '$mdir' is substituted as above. +# Note that if you change this, you will want to do a run with -a to pick up +# those extra files even in unchanged modules.
+# EXTRAFILES=(fullfilelist imagelist-\$mdir) + +# Mapping of MODULES to directory names, as an associative array +# MODULEMAPPING=(fedora-alt alt fedora-archive archive +# fedora-enchilada fedora fedora-epel epel) +MODULEMAPPING=(fedora fedora fedora-epel epel) + +# curl binary (only if MirrorManager checkins are enabled; see below). +# CURL=/usr/bin/curl + +# rsync binary +# RSYNC=/usr/bin/rsync + +# Rsync timeout value, in seconds +# Will be passed to rsync via --timeout. +# RSYNCTIMEOUT=600 + +# Array of default options to pass to rsync +# Will be modified automatically according to VERBOSITY level; no need to set +# -q, -v or --info here. +# +# You can add excludes here, but the script will always detect those files as +# missing and will add them back to the file list. This may generate +# complaints from rsync, but should not cause any problems, though it is almost +# certainly better to use the provided FILTEREXP to do exclusion instead. +# +# Note that some of these options may be required for proper operation of the +# script. You really should be careful if you change things here, as the +# default value is carefully crafted and rsync can react in unexpected ways to +# seemingly compatible sets of arguments. +# RSYNCOPTS=(-aSH -f 'R .~tmp~' --stats --preallocate --delay-updates --out-format='@ %i %n%L') + +# By default quick-fedora-mirror will try to detect and recover from an aborted +# rsync run by moving any already downloaded files into place before +# processing. Define NORSYNCRECOVERY (to anything) to prevent this. +# NORSYNCRECOVERY= + +# Define KEEPDIRTIMES (to anything) to make a third rsync call which restores +# the timestamps of any directories which were modified after file removal. +# This won't ensure that timestamps are always up to date, but it's good enough +# if you don't modify your repository locally. Maintaining directory +# timestamps isn't important for mirroring in any case.
+# KEEPDIRTIMES= + +# DEFINE CHECKIN_SITE and CHECKIN_PASSWORD to perform a mirrormanager checkin +# at the completion of the run if rsync succeeded without error. These values +# should match the master site name and site password you have configured in +# mirrormanager. +# CHECKIN_HOST will default to the output of the hostname command; you must set +# it manually if that does not match the hostname you have configured in +# mirrormanager. +# The "curl", "bzip2" and "base64" commands must be installed in order to +# perform mirrormanager checkins. +# CHECKIN_SITE= +# CHECKIN_PASSWORD= +# CHECKIN_HOST= + +# If you have configured multiple mirrormanager hostnames to virtually +# spread the modules/categories you mirror between them, then you can configure +# per-module checkin hosts with variables like the following. +# If a specific CHECKIN_HOST_* setting is not configured, then CHECKIN_HOST +# above will be used. +# Note that the module name is used, not the mirrormanager category, and that +# '-' in the module name is replaced by '_' to make a legal variable name. +# Most sites will not need this; it is only necessary when you mirror too many +# modules for mirrormanager to handle in a single checkin or crawl. +# CHECKIN_HOST_fedora_archive= +# CHECKIN_HOST_fedora_alt= + +# Verbosity levels - info sent to stdout; usually this gets mailed to root when +# being run by cron +# 0 - quiet +# 1 - quiet if no changes, otherwise basic transfer info +# 2 - no -q to rsync +# 3 - very slightly more verbosity +# 4 - One -v to rsync +# 5 - Another -v to rsync +# 6 - Output of all settings +# 7 - Add --progress to rsync +# 8 - Shell script trace +# VERBOSE=0 +VERBOSE=4 + +# Define (to anything) to enable logging to the systemd journal (via +# systemd-cat). The identifier "quick-fedora-mirror" is used, so logs can be +# retrieved with: journalctl -t quick-fedora-mirror +# LOGJOURNAL= + +# Define to a full path to enable logging to that file.
+# The provided file must already exist and be writable. +# Is only considered if LOGJOURNAL above is not defined. +# LOGFILE= +LOGFILE=/home/mirror/merlin/log/fedora.log + +# Logging fields - Each character selects a piece of information to log. +# @ - Absolutely everything. +# a - aborted run recovery +# A - each recovered file from an aborted run +# c - rsync calls +# C - rsync call completions +# d - File/directory deletion start/end +# D - all file/directory deletes +# e - minor errors +# E - serious errors +# F - all transferred files (not impl) +# g - file list generation start/end +# k - lock contention +# l - per-module local file list generation (recursive find) start/end +# L - file list generation counts +# m - mirrormanager checkin +# M - mirrormanager checkin detail +# n - lack of updates in a run +# N - lack of updates in a module +# o - remote file list download start/end +# p - module processing start/end +# P - per-module module processing start/end +# r - run start +# R - run end +# s - Basic transfer statistics +# S - Detailed transfer statistics +# t - directory time updates (not impl) +# LOGITEMS=aeElrRs + +# When q-f-m fails to run because it is already running, it checks the time +# since the last successful run. If that is larger than this value, it logs a +# serious error. This helps to detect a hung run or issues with slow +# transfers. +# WARNDELAY=$((60 * 60 * 24)) # One day + +# When q-f-m encounters an error calling rsync, it may (depending on the error) +# sleep and retry. It will always sleep with exponential backoff. Set +# MAXRETRIES to limit the number of times it retries. +# MAXRETRIES=10 + +# mktemp will be called after this file is sourced to make a temporary +# directory. This directory can contain a large amount of data, and that data +# is specified by the server. If your /tmp is small and you are concerned +# about the server potentially sending extra-huge files and filling things up, +# you can set TMPDIR here.
+# TMPDIR= + +# A regular expression used to filter the file lists. It must be quoted (or +# very carefully escaped). Entries matching this expression will not be synced +# and are expected not to be present locally. They will also be ignored by +# quick-fedora-hardlink. Cannot contain commas. Run against the file list that +# includes sizes (by quick-fedora-mirror) and the fullfiletimelist (by +# quick-fedora-hardlink), so don't use expressions that would match those +# metadata (which are digit strings and single characters). Example is a heavy +# filter which gives you an x86_64-only mirror with source packages, debuginfo +# packages, Alpha and Beta releases, and most image files excluded. +# FILTEREXP='(/i386|/armhfp|/source|/SRPMS|/debug/|\.iso|\.img|\.qcow2|\.raw\.xz|\.box|/releases/test)'