diff --git a/clusters/noble/apps/kustomization.yaml b/clusters/noble/apps/kustomization.yaml index fbd73fb..219efed 100644 --- a/clusters/noble/apps/kustomization.yaml +++ b/clusters/noble/apps/kustomization.yaml @@ -4,6 +4,7 @@ resources: - argocd/application.yaml - cilium/application.yaml - kube-vip/application.yaml + - longhorn/application.yaml - monitoring-kube-prometheus/application.yaml - monitoring-loki/application.yaml diff --git a/clusters/noble/apps/longhorn/application.yaml b/clusters/noble/apps/longhorn/application.yaml new file mode 100644 index 0000000..f96ac35 --- /dev/null +++ b/clusters/noble/apps/longhorn/application.yaml @@ -0,0 +1,27 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: longhorn + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "1" +spec: + project: default + destination: + server: https://kubernetes.default.svc + namespace: longhorn-system + sources: + - repoURL: https://charts.longhorn.io + chart: longhorn + targetRevision: "*" + helm: + valuesObject: + defaultSettings: + createDefaultDiskLabeledNodes: false + defaultDataPath: /var/mnt/longhorn + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/clusters/noble/apps/monitoring-kube-prometheus/application.yaml b/clusters/noble/apps/monitoring-kube-prometheus/application.yaml index d4978d1..6d77476 100644 --- a/clusters/noble/apps/monitoring-kube-prometheus/application.yaml +++ b/clusters/noble/apps/monitoring-kube-prometheus/application.yaml @@ -4,7 +4,7 @@ metadata: name: monitoring-kube-prometheus namespace: argocd annotations: - argocd.argoproj.io/sync-wave: "1" + argocd.argoproj.io/sync-wave: "2" spec: project: default destination: @@ -20,12 +20,26 @@ spec: prometheusSpec: retention: 15d storageSpec: - emptyDir: {} + volumeClaimTemplate: + spec: + storageClassName: longhorn + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi alertmanager: alertmanagerSpec: 
retention: 120h storage: - emptyDir: {} + volumeClaimTemplate: + spec: + storageClassName: longhorn + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi kubeEtcd: enabled: false kubeScheduler: diff --git a/clusters/noble/apps/monitoring-loki/application.yaml b/clusters/noble/apps/monitoring-loki/application.yaml index c475731..bb5e2e1 100644 --- a/clusters/noble/apps/monitoring-loki/application.yaml +++ b/clusters/noble/apps/monitoring-loki/application.yaml @@ -4,7 +4,7 @@ metadata: name: monitoring-loki namespace: argocd annotations: - argocd.argoproj.io/sync-wave: "1" + argocd.argoproj.io/sync-wave: "2" spec: project: default destination: @@ -19,7 +19,9 @@ spec: loki: enabled: true persistence: - enabled: false + enabled: true + storageClassName: longhorn + size: 20Gi promtail: enabled: true grafana: diff --git a/talos/README.md b/talos/README.md index dc46bcd..b66d403 100644 --- a/talos/README.md +++ b/talos/README.md @@ -125,3 +125,117 @@ After syncing manifests, create a Pi-hole DNS A record: - `argo.noble.lab.pcenicni.dev` -> `192.168.50.231` +## 10) Longhorn storage and extra disks + +Longhorn is deployed from: + +- `clusters/noble/apps/longhorn/application.yaml` + +Monitoring apps are configured to use `storageClassName: longhorn`, so you can +persist Prometheus/Alertmanager/Loki data once Longhorn is healthy. + +### Extra drive layout (this cluster) + +Each node uses: + +- `/dev/sda` — Talos install disk (`installDisk` in `talconfig.yaml`) +- `/dev/sdb` — dedicated Longhorn data disk + +`talconfig.yaml` includes a global patch that partitions `/dev/sdb` and mounts it +at `/var/mnt/longhorn`, which matches Longhorn `defaultDataPath` in the Argo +Helm values. + +After editing `talconfig.yaml`, regenerate and apply configs: + +```bash +cd talos +talhelper genconfig +# apply each node’s YAML from clusterconfig/ with talosctl apply-config +``` + +Then reboot each node once so the new disk layout is applied. 
+ +### `talosctl` TLS errors (`unknown authority`, `Ed25519 verification failure`) + +`talosctl` **does not** automatically use `talos/clusterconfig/talosconfig`. If you +omit it, the client falls back to **`~/.talos/config`**, which is usually a +**different** cluster CA — you then get TLS handshake failures against the noble +nodes. + +**Always** set this in the shell where you run `talosctl` (use an absolute path +if you change directories): + +```bash +cd talos +export TALOSCONFIG="$(pwd)/clusterconfig/talosconfig" +export ENDPOINT=192.168.50.230 +``` + +Sanity check (should print Talos and Kubernetes versions, not TLS errors): + +```bash +talosctl -e "${ENDPOINT}" -n 192.168.50.20 version +``` + +Then use the same shell for `apply-config`, `reboot`, and `health`. + +If it **still** fails after `TALOSCONFIG` is set, the running cluster was likely +bootstrapped with **different** secrets than the ones in your current +`talsecret.sops.yaml` / regenerated `clusterconfig/`. In that case you need the +**original** `talosconfig` that matched the cluster when it was created, or you +must align secrets and cluster state (recovery / rebuild is a larger topic). + +Keep **`talosctl`** roughly aligned with the node Talos version (for example +`v1.12.x` clients for `v1.12.5` nodes). + +**Paste tip:** run **one** command per line. Pasting `...cp-3.yaml` and +`talosctl` on the same line breaks the filename and can confuse the shell. + +### More than one extra disk per node + +If you add a third disk later, extend `machine.disks` in `talconfig.yaml` (for +example `/dev/sdc` → `/var/mnt/longhorn-disk2`) and register that path in +Longhorn as an additional disk for that node. 
+ +Recommended: + +- use one dedicated filesystem per Longhorn disk path +- avoid using the Talos system disk for heavy Longhorn data +- spread replicas across nodes for resiliency + +## 11) Upgrade Talos to `v1.12.x` + +This repo now pins: + +- `talosVersion: v1.12.5` in `talconfig.yaml` + +### Regenerate configs + +From `talos/`: + +```bash +talhelper genconfig +``` + +### Rolling upgrade order + +Upgrade one node at a time, waiting for it to return healthy before moving on. Talos supports upgrading at most one minor release per hop, so from `v1.8.4` step through `v1.9.x`, `v1.10.x`, and `v1.11.x` before `v1.12.5` — `upgrade-talos-1.8.4-to-1.12.5.sh` automates these hops; do not jump straight to `v1.12.5`. + +1. Control plane nodes (`noble-cp-1`, then `noble-cp-2`, then `noble-cp-3`) +2. Worker node (`noble-worker-1`) + +Example commands (adjust node IP per step): + +```bash +talosctl --talosconfig ./clusterconfig/talosconfig -n 192.168.50.20 upgrade --image ghcr.io/siderolabs/installer:v1.12.5 +talosctl --talosconfig ./clusterconfig/talosconfig -n 192.168.50.20 reboot +talosctl --talosconfig ./clusterconfig/talosconfig -n 192.168.50.20 health +``` + +After all nodes are upgraded, verify: + +```bash +talosctl --talosconfig ./clusterconfig/talosconfig version +kubectl get nodes -o wide +``` + diff --git a/talos/talconfig.yaml b/talos/talconfig.yaml index a3c5ef0..a944186 100644 --- a/talos/talconfig.yaml +++ b/talos/talconfig.yaml @@ -1,6 +1,6 @@ clusterName: noble endpoint: https://192.168.50.230:6443 -talosVersion: v1.8.4 +talosVersion: v1.12.5 kubernetesVersion: v1.31.1 allowSchedulingOnControlPlanes: true @@ -13,6 +13,16 @@ clusterPodNets: clusterSvcNets: - 10.96.0.0/12 +# Secondary disk on every node (OS stays on installDisk: /dev/sda). +# Mount matches Longhorn defaultDataPath in clusters/noble/apps/longhorn/application.yaml. 
+patches: + - |- + machine: + disks: + - device: /dev/sdb + partitions: + - mountpoint: /var/mnt/longhorn + nodes: - hostname: noble-cp-1 ipAddress: 192.168.50.20 diff --git a/talos/upgrade-talos-1.8.4-to-1.12.5.sh b/talos/upgrade-talos-1.8.4-to-1.12.5.sh new file mode 100755 index 0000000..7236f8c --- /dev/null +++ b/talos/upgrade-talos-1.8.4-to-1.12.5.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Safe Talos rolling upgrade script: +# 1.8.4 -> 1.9.5 -> 1.10.7 -> 1.11.6 -> 1.12.5 +# Order: cp-1, cp-2, cp-3, worker-1 + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +TALOSCONFIG_PATH="${TALOSCONFIG_PATH:-${REPO_ROOT}/talos/clusterconfig/talosconfig}" +ENDPOINT="${ENDPOINT:-192.168.50.230}" + +CONTROL_PLANES=("192.168.50.20" "192.168.50.30" "192.168.50.40") +WORKERS=("192.168.50.10") + +UPGRADE_VERSIONS=("v1.9.5" "v1.10.7" "v1.11.6" "v1.12.5") + +if [[ ! -f "${TALOSCONFIG_PATH}" ]]; then + echo "Talos config not found: ${TALOSCONFIG_PATH}" + echo "Set TALOSCONFIG_PATH=/absolute/path/to/talosconfig and retry." + exit 1 +fi + +run_talosctl() { + talosctl --talosconfig "${TALOSCONFIG_PATH}" "$@" +} + +normalize_version() { + local version="$1" + echo "${version#v}" +} + +version_ge() { + local left + local right + left="$(normalize_version "$1")" + right="$(normalize_version "$2")" + [[ "$(printf "%s\n%s\n" "${left}" "${right}" | sort -V | tail -n1)" == "${left}" ]] +} + +get_node_talos_version() { + local node_ip="$1" + local output + + output="$(run_talosctl -n "${node_ip}" version 2>/dev/null || true)" + + # Prefer the server tag for the requested node from the NODE/Tag block. 
+ local node_tag + node_tag="$( + printf "%s\n" "${output}" | awk -v node="${node_ip}" ' + $1=="NODE:" && $2==node { seen=1; next } + seen && $1=="Tag:" { print $2; exit } + ' + )" + + if [[ -n "${node_tag}" ]]; then + echo "${node_tag}" + return 0 + fi + + return 1 +} + +check_cluster_ready() { + echo "Checking cluster health via endpoint ${ENDPOINT}..." + run_talosctl -e "${ENDPOINT}" -n "${CONTROL_PLANES[0]}" health + kubectl get nodes -o wide +} + +upgrade_node_to_version() { + local node_ip="$1" + local version="$2" + local image="ghcr.io/siderolabs/installer:${version}" + local current_version="" + + echo + echo "=== Upgrading node ${node_ip} to ${version} ===" + if current_version="$(get_node_talos_version "${node_ip}")"; then + echo "Current Talos version on ${node_ip}: ${current_version}" + if version_ge "${current_version}" "${version}"; then + echo "Node ${node_ip} already at or above ${version}; skipping upgrade/reboot." + return 0 + fi + else + echo "Could not determine current server version for ${node_ip}; continuing with upgrade." + fi + + run_talosctl -n "${node_ip}" upgrade --image "${image}" + run_talosctl -n "${node_ip}" reboot || true # NOTE(review): 'upgrade' already stages a reboot, so the node may drop mid-call; tolerate that instead of aborting under set -e + + echo "Waiting for cluster and node health after ${node_ip} reboot..." + run_talosctl -e "${ENDPOINT}" -n "${CONTROL_PLANES[0]}" health + run_talosctl -n "${node_ip}" version + kubectl get nodes -o wide +} + +echo "Using TALOSCONFIG: ${TALOSCONFIG_PATH}" +echo "Control planes: ${CONTROL_PLANES[*]}" +echo "Workers: ${WORKERS[*]}" +echo "Upgrade hops: ${UPGRADE_VERSIONS[*]}" +echo + +check_cluster_ready + +for version in "${UPGRADE_VERSIONS[@]}"; do + echo + echo "##### Starting upgrade hop ${version} #####" + + for node in "${CONTROL_PLANES[@]}"; do + upgrade_node_to_version "${node}" "${version}" + done + + for node in "${WORKERS[@]}"; do + upgrade_node_to_version "${node}" "${version}" + done + + echo "Completed hop ${version}. Verifying cluster state..." 
+ check_cluster_ready +done + +echo +echo "All upgrade hops complete." +run_talosctl version +kubectl get nodes -o wide