ansitest/proxmox/proxmox-backup-zfs-bootroot.sh
kneutron 2063a16a2e
Add files via upload
extra notes, add sshfs serverkeepalive
2024-06-10 00:40:17 -06:00

206 lines
6.7 KiB
Bash

#!/bin/bash
# 2024.Jun kneutron
# REF: https://github.com/kneutron/ansitest/blob/master/proxmox/HOWTO-make-a-file-backed-backup-of-proxmox-rpool.docx
#
# Objective: bare-metal backup of the proxmox boot/root ZFS partition and EFI.
#
# NOTE - this is a PROOF OF CONCEPT script and is NOT meant to be run blindly!
# You need to EDIT the variables below before running! And make sure the
# destination disk has sufficient free space!

# TODO EDITME before running!
# Local directory where the sshfs destination will be mounted on this server
destdir="/mnt/macpro-sgtera2"

# Destination directory on the remote side
# TODO add a subdir here for different systems
sshfsmountthis="/Volumes/sgtera2"

# Login id for sshfs
loginid="dave"

# Destination hostname or IP
destserver="macpro-static"

# NOTE this **needs** to be the correct disk!
zfsroot="sdb"

# Existing rpool member to mirror from; obtain from /dev/disk/by-id
# and from ' zpool status -v ', e.g.:
#   pool: rpool
#  state: ONLINE
#   scan: resilvered 3.48G in 00:04:03 with 0 errors on Fri Jun 7 22:16:20 2024
#  config:
#   NAME                                                STATE  READ WRITE CKSUM
#   rpool                                               ONLINE    0     0     0
#     ata-Samsung_Portable_SSD_T5_S49WNV0MC04217F-part3 ONLINE    0     0     0  *** this one
disktomir="ata-Samsung_Portable_SSD_T5_S49WNV0MC04217F-part3"

# Disk size in GB will be determined automatically by the script below
# failexit.mrg
# Print an error message and abort the script.
#   $1 - numeric exit code
#   $2 - optional description
# Message goes to stderr so it is not lost when stdout is redirected;
# ${2-} keeps the function safe under ' set -u ' when called with one arg.
function failexit () {
  echo '! Something failed! Code: '"$1 ${2-}" >&2 # code # (and optional description)
  exit "$1"
}
set -u # abort if var undefined (catches typos in the EDITME variables above)
# Install a package via apt-get only if the command is not already available.
# Takes only 1 arg (the command/package name); use in a loop if multiple.
# ' command -v ' is a builtin and more reliable than parsing ' which ' output.
function install-if-missing () {
  command -v "$1" >/dev/null 2>&1 || apt-get install -y "$1"
}
# Required tools: sshfs for the network mount, bc for math.
install-if-missing sshfs
install-if-missing bc

mkdir -pv "$destdir"

# Mount the sshfs destination only if it is not already mounted there.
if [ "$(df -T |grep sshfs |grep -c "$destdir")" -eq 0 ]; then
  echo "Mounting $destdir on $destserver"
  # TODO change this to samba, nfs, whatever works for you
  # TODO ssh-copy-id for passwordless access
  sshfs -o Ciphers=chacha20-poly1305@openssh.com \
    -o ServerAliveInterval=30 \
    -o ServerAliveCountMax=1 \
    "$loginid@$destserver:$sshfsmountthis" "$destdir"
fi
# hopefully by adding the ServerAlive options, sshfs will drop the connection
# if it doesn't hear back from the server (network issues)

# Abort early if the destination is still not mounted - everything below
# writes backup data into $destdir.
if [ "$(df -T |grep -c "$destdir")" -eq 0 ]; then
  failexit 40 "$destdir is not mounted, cannot proceed"
fi
# If this cd fails, everything below would run in the wrong directory - abort.
cd "$destdir" || failexit 30 "Cannot cd to $destdir"

# Keep a copy of the source disk's partition table alongside the backup.
fdisk -l "/dev/$zfsroot" >fdisk-l-pve-zfs-bootroot.txt

# We don't actually need to allocate the whole disk size, just the
# partition 3 (ZFS) size, e.g.:
#   # fdisk -l /dev/sdb |grep '^/dev/sdb3'
#   /dev/sdb3  1050624  88080384  87029761  41.5G  Solaris /usr & Apple ZFS
#       1          2         3         4      5
gigs=$(fdisk -l "/dev/$zfsroot" |grep "^/dev/${zfsroot}3" |awk '{print $5}') # e.g. 41.5G

# truncate needs an INTEGER byte count, e.g.:
#   echo "41.5G" |numfmt --round=up --from=iec   ->   44560285696
gigactual=$(echo "$gigs" |numfmt --round=up --from=iec)

# NOTE always starting over with a new one, back it up or move it somewhere
# if you want to keep previous versions!
mirfile="proxmox-rpool-mirror-${gigs}-zfs-efi.disk"

# Refuse to clobber a mirror file that is still attached to the pool.
if [ "$(zpool status rpool -v |grep -c "$mirfile")" -gt 0 ]; then
  failexit 202 "rpool still has $mirfile attached - you need to detach it first before running another backup!!"
fi
[ -e "$mirfile" ] && rm -fv "$destdir/$mirfile"

# Create the sparse backing file for the file-backed mirror.
truncate -s "$gigactual" "$mirfile" || failexit 99 "Failed to create sparsefile $mirfile $gigs"
ls -lh "$mirfile"

echo "$(date) - Backing up the EFI partition" # to a lightly gzipped file (for speed):
time dd if=/dev/${zfsroot}2 bs=1M status=progress |gzip -1 >dd-efi-part2-rpool-mirror-${gigs}-zfs.dd.gz

echo "$(date) - Beginning mirror process"
time zpool attach rpool "$disktomir" "$PWD/$mirfile"
# Watch a resilver/scrub on rpool until it completes, refreshing every 5s.
# Prints start/end timestamps and a final blank-line-stripped pool status.
function watchresilver () {
  local sdate ndate
  sdate=$(date)
  # loop until 'in progress' no longer appears in zpool status
  # (original used ' break 2 ', but there is only one loop level here)
  while :; do
    clear
    echo "Pool: rpool - NOW: $(date) -- Watchresilver started: $sdate"
    zpool status rpool |grep -A 2 'in progress' || break
    zpool iostat -v rpool #2 3 &
    # zpool iostat -T d -v rpool 2 3 & # with timestamp
    sleep 5
    date
  done
  ndate=$(date)
  zpool status -v rpool |awk 'NF>0' # skip blank lines
  echo "o Resilver watch rpool start: $sdate // Completed: $ndate"
}
# Wait for the initial resilver onto the file-backed mirror to finish.
watchresilver
#echo "At this point, you may want to run a ' zpool scrub ' and a ' zpool clear rpool ' afterward"
sleep 3

# Verify the freshly-written mirror with a scrub, then watch it complete.
echo "$(date) - commencing automatic scrub to verify backup"
zpool scrub rpool
watchresilver

echo ''
echo "$(date) - Check results of scrub and ^C to abort, or Enter to offline the file-backed mirror"
echo "(Waiting for input)"
read -r

# Take the file-backed mirror offline so the sshfs destination can be
# unmounted without wedging the pool.
zpool offline rpool "$destdir/$mirfile"
sleep 1
zpool status -v rpool |awk 'NF>0'
echo ''
echo "$(date) - At this point you can reboot (or if possible, unmount the backup / sshfs destination)"
echo " and (as long as the target containing the rpool*efi.disk isn't mounted)"
echo " you can detach the efi.disk mirror copy from the pool without issues, the data on the backup file will stay intact."
echo "TODO: # zpool detach rpool $destdir/$mirfile"
exit
EXPERIMENTAL: In order to update the mirror file occasionally, you might leave the sshfs mounted and simply:
# zpool online rpool /mnt/macpro-sgtera2/proxmox-rpool-mirror-41.5G-zfs-efi.disk
And allow it to resilver. Then bring it offline again.
# zpool offline rpool /mnt/macpro-sgtera2/proxmox-rpool-mirror-41.5G-zfs-efi.disk
I have NO IDEA how "stable" this would be, but would definitely NOT recommend allowing
the 2nd disk in a zfs rpool mirror to run over an sshfs network mount for
any length of time - beyond what it takes to update the file-based backup.
Unstable behavior could result if you have a network outage / need to bounce a switch.
You can test this in a VM restore - start a GNU ' screen ' session, then disconnect the primary network interface
while the sshfs is mounted and the file-backed mirror is still attached to the rpool.
Now try to dd /dev/random to a file, bs=1M, count=100 and issue a ' sync '
You will start seeing "task blocked for more than X seconds" RIP messages.
ZFS will still think everything is fine, ' zpool status ' still responds, but comms to the mirror disk are hanging.
' df ' will hang. Bash tab-completion may hang. Load average on a single-cpu instance shot up to (15).
Even trying to take the mirror disk offline while comms were down hung up for almost 3 minutes,
but finally completed. Once the dead mirror disk finally went offline from the command, the several
hung commands also finally completed.
Reconnecting the vNIC after 10-15 minutes finally dropped the sshfs mount. (The Samba mounts recovered.)