#!/usr/bin/env bash
#
# Safe Talos rolling upgrade script.
#
# Walks every node through each version listed in UPGRADE_VERSIONS, control
# planes first and workers last, checking cluster health before the first hop
# and again after every hop. Nodes already at or above the hop version are
# skipped, so the script can be re-run after a partial rollout.
#
# Intended upgrade path: 1.8.4 -> 1.9.5 -> 1.10.7 -> 1.11.6 -> 1.12.5
# Order: cp-1, cp-2, cp-3, worker-1
#
# NOTE(review): UPGRADE_VERSIONS below contains only v1.12.5, not the full
# path above - presumably the earlier hops were already applied. Confirm
# before running against nodes that are more than one minor release behind,
# and add the intermediate versions to UPGRADE_VERSIONS if they are not.
#
# Environment overrides:
#   TALOSCONFIG_PATH  talosconfig file handed to every talosctl call
#                     (default: <repo>/talos/clusterconfig/talosconfig)
#   ENDPOINT          Talos API endpoint used for the health checks
#                     (default: 192.168.50.230)
#
# kubectl is invoked without an explicit kubeconfig or context.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
readonly SCRIPT_DIR REPO_ROOT

readonly TALOSCONFIG_PATH="${TALOSCONFIG_PATH:-${REPO_ROOT}/talos/clusterconfig/talosconfig}"
readonly ENDPOINT="${ENDPOINT:-192.168.50.230}"

readonly -a CONTROL_PLANES=("192.168.50.20" "192.168.50.30" "192.168.50.40")
readonly -a WORKERS=("192.168.50.10")
readonly -a UPGRADE_VERSIONS=("v1.12.5")

#######################################
# Print each argument on its own line to stderr and exit with status 1.
# Arguments: $@ - message lines
#######################################
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

#######################################
# Abort unless the named executable can be found on PATH.
# Arguments: $1 - command name
#######################################
require_command() {
  command -v "$1" >/dev/null 2>&1 \
    || die "Required command not found in PATH: $1"
}

#######################################
# Run talosctl against the configured talosconfig.
# Globals:   TALOSCONFIG_PATH (read)
# Arguments: $@ - passed through to talosctl unchanged
# Returns:   talosctl's exit status
#######################################
run_talosctl() {
  talosctl --talosconfig "${TALOSCONFIG_PATH}" "$@"
}

#######################################
# Strip one leading "v" from a version string.
# Arguments: $1 - version, e.g. "v1.12.5"
# Outputs:   the version without the prefix, e.g. "1.12.5"
#######################################
normalize_version() {
  local version="$1"
  printf '%s\n' "${version#v}"
}

#######################################
# Test whether one version is greater than or equal to another.
# The comparison is delegated to `sort -V`: the left version qualifies when
# it sorts last (or ties) against the right one.
# Arguments: $1 - left version, $2 - right version (leading "v" optional)
# Returns:   0 if left >= right, 1 otherwise
#######################################
version_ge() {
  local left
  local right
  local highest

  left="$(normalize_version "$1")"
  right="$(normalize_version "$2")"
  highest="$(printf '%s\n%s\n' "${left}" "${right}" | sort -V | tail -n1)"

  [[ "${highest}" == "${left}" ]]
}

#######################################
# Report the Talos version tag a node's server side is running.
# Arguments: $1 - node IP
# Outputs:   the tag (e.g. "v1.11.6") on stdout when it can be determined
# Returns:   0 when a tag was found, 1 otherwise
#######################################
get_node_talos_version() {
  local node_ip="$1"
  local output
  local node_tag

  # Failures and stderr are swallowed on purpose: an unreachable node simply
  # yields no tag, and the caller decides what to do with the non-zero return.
  output="$(run_talosctl -n "${node_ip}" version 2>/dev/null || true)"

  # Prefer the server tag for the requested node from the NODE/Tag block:
  # take the first "Tag:" line that follows the "NODE: <ip>" line.
  node_tag="$(
    printf '%s\n' "${output}" | awk -v node="${node_ip}" '
      $1=="NODE:" && $2==node { seen=1; next }
      seen && $1=="Tag:" { print $2; exit }
    '
  )"

  [[ -n "${node_tag}" ]] || return 1
  printf '%s\n' "${node_tag}"
}

#######################################
# Run the Talos health check via ENDPOINT against the first control plane,
# then list the Kubernetes nodes.
# Globals:   ENDPOINT, CONTROL_PLANES (read)
# Returns:   non-zero (aborting the script under `set -e`) if either fails
#######################################
check_cluster_ready() {
  echo "Checking cluster health via endpoint ${ENDPOINT}..."
  run_talosctl -e "${ENDPOINT}" -n "${CONTROL_PLANES[0]}" health
  kubectl get nodes -o wide
}

#######################################
# Upgrade a single node to the given Talos version, then verify health.
# Skips the node when its reported version is already at or above the target.
# Globals:   ENDPOINT, CONTROL_PLANES (read)
# Arguments: $1 - node IP
#            $2 - target version tag, used as the installer image tag
# Returns:   0 on success or skip; non-zero if the upgrade or a check fails
#######################################
upgrade_node_to_version() {
  local node_ip="$1"
  local version="$2"
  local image="ghcr.io/siderolabs/installer:${version}"
  local current_version=""

  echo
  echo "=== Upgrading node ${node_ip} to ${version} ==="

  if current_version="$(get_node_talos_version "${node_ip}")"; then
    echo "Current Talos version on ${node_ip}: ${current_version}"
    if version_ge "${current_version}" "${version}"; then
      echo "Node ${node_ip} already at or above ${version}; skipping upgrade/reboot."
      return 0
    fi
  else
    # Best effort: an unreadable version does not block the rollout.
    echo "Could not determine current server version for ${node_ip}; continuing with upgrade."
  fi

  # `talosctl upgrade` reboots the node into the new image as part of the
  # upgrade itself (see the Talos upgrade documentation), so no separate
  # `talosctl reboot` follows it - that would restart the node a second time.
  run_talosctl -n "${node_ip}" upgrade --image "${image}"

  echo "Waiting for cluster and node health after ${node_ip} reboot..."
  run_talosctl -e "${ENDPOINT}" -n "${CONTROL_PLANES[0]}" health
  run_talosctl -n "${node_ip}" version
  kubectl get nodes -o wide
}

#######################################
# Entry point: validate prerequisites, then run every hop over every node.
#######################################
main() {
  local version
  local node

  if [[ ! -f "${TALOSCONFIG_PATH}" ]]; then
    die "Talos config not found: ${TALOSCONFIG_PATH}" \
      "Set TALOSCONFIG_PATH=/absolute/path/to/talosconfig and retry."
  fi

  # Fail before touching the cluster rather than halfway through a rollout.
  require_command talosctl
  require_command kubectl

  echo "Using TALOSCONFIG: ${TALOSCONFIG_PATH}"
  echo "Control planes: ${CONTROL_PLANES[*]}"
  echo "Workers: ${WORKERS[*]}"
  echo "Upgrade hops: ${UPGRADE_VERSIONS[*]}"
  echo

  check_cluster_ready

  for version in "${UPGRADE_VERSIONS[@]}"; do
    echo
    echo "##### Starting upgrade hop ${version} #####"

    for node in "${CONTROL_PLANES[@]}"; do
      upgrade_node_to_version "${node}" "${version}"
    done

    for node in "${WORKERS[@]}"; do
      upgrade_node_to_version "${node}" "${version}"
    done

    echo "Completed hop ${version}. Verifying cluster state..."
    check_cluster_ready
  done

  echo
  echo "All upgrade hops complete."
  run_talosctl version
  kubectl get nodes -o wide
}

main "$@"