Files
home-server/talos/scripts/longhorn-gpt-recovery.sh

64 lines
2.4 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Recover from GPT on Longhorn data disk:
#   apply wipe-phase config → wipe → restore Longhorn talconfig.
#
# Prereq: talos/talconfig.yaml is the WIPE phase (no userVolumes longhorn);
#         talhelper genconfig -o out already run.
#
# Environment:
#   TALOSCONFIG  path of the talosconfig file; exported to child processes
#                (default: <talos root>/out/talosconfig).
#   DISK         name of the data disk to wipe (default: sdb).
set -euo pipefail

# Talos root = parent of the directory that holds this script.
# CDPATH is blanked for this one `cd`: when a relative script path (e.g.
# "scripts/x.sh") is resolved through an inherited CDPATH, `cd` may land in a
# different directory and prints it on stdout, which would end up inside
# TALOS_ROOT. The `--` guards keep a path starting with "-" from being parsed
# as an option.
TALOS_ROOT="$(CDPATH='' cd -- "$(dirname -- "$0")/.." && pwd)"
export TALOSCONFIG="${TALOSCONFIG:-$TALOS_ROOT/out/talosconfig}"
DISK="${DISK:-sdb}"

# One "<node IP>:<machine config file name under $TALOS_ROOT/out>" per node.
NODES=(
  "192.168.50.10:noble-helium.yaml"
  "192.168.50.20:noble-neon.yaml"
  "192.168.50.30:noble-argon.yaml"
  "192.168.50.40:noble-krypton.yaml"
)
# Report a fatal error and stop the script.
# All arguments are joined by "$*" into one message, written to stderr with an
# "error: " prefix; the shell then exits with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}
# Stop right away unless TALOSCONFIG names an existing regular file; the
# message tells the operator how to generate it.
if [[ ! -f "$TALOSCONFIG" ]]; then
  die "missing $TALOSCONFIG — run: cd $TALOS_ROOT && talhelper genconfig -o out"
fi
# Phase 1: apply the WIPE-phase machine config to every node listed in NODES.
# Globals:   NODES (read), TALOS_ROOT (read)
# Arguments: none
# Outputs:   progress lines and next-step instructions on stdout
# Exits (via die) before touching any node if a machine config file is missing.
phase_apply_wipe() {
  local entry ip file

  # Pre-flight: verify every config file first, so a missing one cannot leave
  # the cluster with only some of the nodes switched to the wipe-phase config.
  for entry in "${NODES[@]}"; do
    file="${entry##*:}"
    [[ -f "$TALOS_ROOT/out/$file" ]] ||
      die "missing $TALOS_ROOT/out/$file — run: cd $TALOS_ROOT && talhelper genconfig -o out"
  done

  echo "=== Phase 1: apply WIPE-phase machine config to every node (releases u-longhorn) ==="
  for entry in "${NODES[@]}"; do
    # Entries are "<ip>:<file>": text before the first ':' / after the last ':'.
    ip="${entry%%:*}"
    file="${entry##*:}"
    echo "Applying $file to $ip ..."
    talosctl apply-config -n "$ip" --file "$TALOS_ROOT/out/$file"
  done

  echo "Reboot all Talos nodes now (or wait for volume controller), then confirm u-longhorn is gone:"
  echo " talosctl get volumestatus -n 192.168.50.20"
  echo "When wipe would succeed, run: $0 phase2"
}
# Phase 2 + 3: wipe the data disk on every node, then copy
# talconfig.with-longhorn.yaml over talconfig.yaml, regenerate the machine
# configs with talhelper and apply them to every node.
# Globals:   NODES (read), DISK (read), TALOS_ROOT (read)
# Arguments: none
# Outputs:   progress lines on stdout
# Exits (via die) before wiping anything if talconfig.with-longhorn.yaml is
# missing.
phase_wipe_disks() {
  local entry ip file
  local restore_src="$TALOS_ROOT/talconfig.with-longhorn.yaml"

  # Pre-flight: the restore step needs this file. Checking it only after the
  # wipe would mean destroying the disks and then failing on the copy.
  [[ -f "$restore_src" ]] ||
    die "missing $restore_src — refusing to wipe $DISK (Longhorn talconfig could not be restored afterwards)"

  echo "=== Phase 2: wipe data disk $DISK on each node (must NOT be 'in use by volume u-longhorn') ==="
  for entry in "${NODES[@]}"; do
    ip="${entry%%:*}"
    echo "Wiping $DISK on $ip ..."
    talosctl wipe disk "$DISK" -n "$ip"
  done

  echo "=== Phase 3: restore Longhorn talconfig, genconfig, apply to all nodes ==="
  cp -f -- "$restore_src" "$TALOS_ROOT/talconfig.yaml"
  # Subshell: run talhelper from the Talos root without changing the caller's cwd.
  (cd "$TALOS_ROOT" && talhelper genconfig -o out)
  for entry in "${NODES[@]}"; do
    # Entries are "<ip>:<file>": text before the first ':' / after the last ':'.
    ip="${entry%%:*}"
    file="${entry##*:}"
    echo "Applying restored $file to $ip ..."
    talosctl apply-config -n "$ip" --file "$TALOS_ROOT/out/$file"
  done

  echo "Done. Reboot nodes if Longhorn volume does not come up clean."
}
# Command-line dispatch on the first argument:
#   (none / empty)   print usage
#   phase1 | apply   run phase_apply_wipe
#   phase2 | wipe    run phase_wipe_disks
#   anything else    fatal "unknown arg" error via die
case "${1:-}" in
  '')
    printf '%s\n' \
      'Usage: cd talos && talhelper genconfig -o out && export TALOSCONFIG="$(pwd)/out/talosconfig"' \
      " $0 phase1 # apply WIPE machine config to all nodes — reboot if u-longhorn lingers" \
      " DISK=vdb $0 phase2 # wipe disk, restore Longhorn talconfig, genconfig, apply all" \
      'Env DISK defaults to sdb.'
    ;;
  phase1 | apply)
    phase_apply_wipe
    ;;
  phase2 | wipe)
    phase_wipe_disks
    ;;
  *)
    die "unknown arg: $1"
    ;;
esac