#!/usr/bin/env bash
# Recover from GPT on Longhorn data disk: apply wipe-phase config → wipe → restore Longhorn talconfig.
# Prereq: talos/talconfig.yaml is the WIPE phase (no userVolumes longhorn); talhelper genconfig -o out already run.
#
# Usage:
#   <script> phase1            # apply the WIPE-phase machine config to every node
#   DISK=vdb <script> phase2   # wipe $DISK on every node, restore Longhorn talconfig, re-apply
#
# Environment:
#   TALOSCONFIG  talosctl client config (default: <talos root>/out/talosconfig)
#   DISK         data disk to wipe on each node (default: sdb)
set -euo pipefail

# Talos directory = parent of the directory this script lives in.
TALOS_ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
export TALOSCONFIG="${TALOSCONFIG:-$TALOS_ROOT/out/talosconfig}"
DISK="${DISK:-sdb}"

# One "<node ip>:<generated machine-config file under out/>" entry per node.
NODES=(
  "192.168.50.10:noble-helium.yaml"
  "192.168.50.20:noble-neon.yaml"
  "192.168.50.30:noble-argon.yaml"
  "192.168.50.40:noble-krypton.yaml"
)

# Print an error message to stderr and exit with status 1.
die() { echo "error: $*" >&2; exit 1; }

# Abort unless the talosctl client config exists.
# Called from the phases (not at top level) so a bare invocation can still print usage.
require_talosconfig() {
  [[ -f "$TALOSCONFIG" ]] || die "missing $TALOSCONFIG — run: cd $TALOS_ROOT && talhelper genconfig -o out"
}

# Abort unless command $1 can be resolved by the shell.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"
}

# Apply each node's generated machine config from $TALOS_ROOT/out.
# Arguments: $1 - progress-message prefix (e.g. "Applying")
apply_all_configs() {
  local prefix=$1
  local entry ip file
  for entry in "${NODES[@]}"; do
    ip="${entry%%:*}"
    file="${entry##*:}"
    echo "$prefix $file to $ip ..."
    talosctl apply-config -n "$ip" --file "$TALOS_ROOT/out/$file"
  done
}

# Phase 1: push the WIPE-phase machine config to every node, then print the
# manual follow-up steps (reboot, verify, run phase2).
phase_apply_wipe() {
  require_talosconfig
  require_cmd talosctl
  echo "=== Phase 1: apply WIPE-phase machine config to every node (releases u-longhorn) ==="
  apply_all_configs "Applying"
  echo "Reboot all Talos nodes now (or wait for volume controller), then confirm u-longhorn is gone:"
  echo " talosctl get volumestatus -n 192.168.50.20"
  echo "When wipe would succeed, run: $0 phase2"
}

# Phase 2 + 3: wipe $DISK on every node, then copy talconfig.with-longhorn.yaml
# over talconfig.yaml, regenerate configs with talhelper and apply them.
phase_wipe_disks() {
  local restore_src="$TALOS_ROOT/talconfig.with-longhorn.yaml"
  local entry ip

  # Pre-flight everything phase 3 needs BEFORE the destructive wipe, so a
  # missing file or tool cannot leave the nodes wiped but unrestored.
  require_talosconfig
  require_cmd talosctl
  require_cmd talhelper
  [[ -f "$restore_src" ]] || die "missing $restore_src — refusing to wipe without a Longhorn talconfig to restore"

  echo "=== Phase 2: wipe data disk $DISK on each node (must NOT be 'in use by volume u-longhorn') ==="
  for entry in "${NODES[@]}"; do
    ip="${entry%%:*}"
    echo "Wiping $DISK on $ip ..."
    talosctl wipe disk "$DISK" -n "$ip"
  done

  echo "=== Phase 3: restore Longhorn talconfig, genconfig, apply to all nodes ==="
  cp -f -- "$restore_src" "$TALOS_ROOT/talconfig.yaml"
  # Subshell keeps the caller's working directory untouched.
  (cd -- "$TALOS_ROOT" && talhelper genconfig -o out)
  apply_all_configs "Applying restored"
  echo "Done. Reboot nodes if Longhorn volume does not come up clean."
}

# Dispatch on the first CLI argument; with no argument, print usage.
main() {
  case "${1:-}" in
    phase1|apply)
      phase_apply_wipe
      ;;
    phase2|wipe)
      phase_wipe_disks
      ;;
    "")
      echo "Usage: cd talos && talhelper genconfig -o out && export TALOSCONFIG=\"\$(pwd)/out/talosconfig\""
      echo " $0 phase1 # apply WIPE machine config to all nodes — reboot if u-longhorn lingers"
      echo " DISK=vdb $0 phase2 # wipe disk, restore Longhorn talconfig, genconfig, apply all"
      echo "Env DISK defaults to sdb."
      ;;
    *)
      die "unknown arg: $1"
      ;;
  esac
}

main "$@"