Remove deprecated Argo CD application configurations and related files for noble cluster, including root-application.yaml, kustomization.yaml, and individual application manifests for argocd, cilium, longhorn, kube-vip, and monitoring components. Update kube-vip daemonset.yaml to enhance deployment strategy and environment variables for improved configuration.
This commit is contained in:
@@ -1,23 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: argocd
|
||||
namespace: argocd
|
||||
annotations:
|
||||
argocd.argoproj.io/sync-wave: "-2"
|
||||
spec:
|
||||
project: default
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: argocd
|
||||
source:
|
||||
repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git
|
||||
targetRevision: HEAD
|
||||
path: clusters/noble/bootstrap/argocd
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
|
||||
34
clusters/noble/apps/cilium/README.md
Normal file
34
clusters/noble/apps/cilium/README.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# Cilium — noble (Talos)
|
||||
|
||||
Talos uses **`cluster.network.cni.name: none`**; you must install Cilium (or another CNI) before nodes become **Ready** and before **MetalLB** / most workloads. See `talos/CLUSTER-BUILD.md` ordering.
|
||||
|
||||
## 1. Install (phase 1 — required)
|
||||
|
||||
Uses **`values.yaml`**: IPAM **kubernetes**, **`k8sServiceHost` / `k8sServicePort`** pointing at **KubePrism** (`127.0.0.1:7445`, Talos default), Talos cgroup paths, **drop `SYS_MODULE`** from agent caps, **`bpf.masquerade: false`** ([Talos Cilium](https://www.talos.dev/latest/kubernetes-guides/network/deploying-cilium/), [KubePrism](https://www.talos.dev/latest/kubernetes-guides/configuration/kubeprism/)). Without this, host-network CNI clients may **`dial tcp <VIP>:6443`** and fail if the VIP path is unhealthy.
|
||||
|
||||
From **repository root**:
|
||||
|
||||
```bash
|
||||
helm repo add cilium https://helm.cilium.io/
|
||||
helm repo update
|
||||
helm upgrade --install cilium cilium/cilium \
|
||||
--namespace kube-system \
|
||||
--version 1.16.6 \
|
||||
-f clusters/noble/apps/cilium/values.yaml \
|
||||
--wait
|
||||
```
|
||||
|
||||
Verify:
|
||||
|
||||
```bash
|
||||
kubectl -n kube-system rollout status ds/cilium
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
When nodes are **Ready**, continue with **MetalLB** (`clusters/noble/apps/metallb/README.md`) and other Phase B items. **kube-vip** for the Kubernetes API VIP is separate (L2 ARP); it can run after the API is reachable.
|
||||
|
||||
## 2. Optional: kube-proxy replacement (phase 2)
|
||||
|
||||
To replace **`kube-proxy`** with Cilium entirely, use **`values-kpr.yaml`** and **`cluster.proxy.disabled: true`** in Talos on every node (see comments inside `values-kpr.yaml`). Follow the upstream [Deploy Cilium CNI](https://www.talos.dev/latest/kubernetes-guides/network/deploying-cilium/) section *without kube-proxy*.
|
||||
|
||||
Do **not** skip phase 1 unless you already know your cluster matches the “bootstrap window” flow from the Talos docs.
|
||||
@@ -1,46 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: cilium
|
||||
namespace: argocd
|
||||
annotations:
|
||||
argocd.argoproj.io/sync-wave: "0"
|
||||
spec:
|
||||
project: default
|
||||
# Argo SSA vs CLI helm: ignore generated TLS and fields Argo commonly owns so
|
||||
# RespectIgnoreDifferences can skip fighting Helm on sync.
|
||||
ignoreDifferences:
|
||||
- group: ""
|
||||
kind: Secret
|
||||
name: hubble-server-certs
|
||||
namespace: kube-system
|
||||
jqPathExpressions:
|
||||
- .data
|
||||
- group: apps
|
||||
kind: Deployment
|
||||
name: cilium-operator
|
||||
namespace: kube-system
|
||||
jsonPointers:
|
||||
- /spec/replicas
|
||||
- /spec/strategy/rollingUpdate/maxUnavailable
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: kube-system
|
||||
sources:
|
||||
- repoURL: https://helm.cilium.io/
|
||||
chart: cilium
|
||||
targetRevision: 1.16.6
|
||||
helm:
|
||||
valueFiles:
|
||||
- $values/clusters/noble/apps/cilium/helm-values.yaml
|
||||
- repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git
|
||||
targetRevision: HEAD
|
||||
ref: values
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
- RespectIgnoreDifferences=true
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
# Same settings as the Argo CD Application (keep in sync).
|
||||
# Used for manual `helm install` before Argo when Talos uses cni: none.
|
||||
#
|
||||
# operator.replicas: chart default is 2 with required pod anti-affinity. If fewer
|
||||
# than two nodes can schedule (e.g. NotReady / taints), `helm --wait` never finishes.
|
||||
k8sServiceHost: 192.168.50.20
|
||||
k8sServicePort: 6443
|
||||
cgroup:
|
||||
autoMount:
|
||||
enabled: false
|
||||
hostRoot: /sys/fs/cgroup
|
||||
ipam:
|
||||
operator:
|
||||
clusterPoolIPv4PodCIDRList:
|
||||
- 10.244.0.0/16
|
||||
securityContext:
|
||||
capabilities:
|
||||
ciliumAgent:
|
||||
- CHOWN
|
||||
- KILL
|
||||
- NET_ADMIN
|
||||
- NET_RAW
|
||||
- IPC_LOCK
|
||||
- SYS_ADMIN
|
||||
- SYS_RESOURCE
|
||||
- DAC_OVERRIDE
|
||||
- FOWNER
|
||||
- SETGID
|
||||
- SETUID
|
||||
cleanCiliumState:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
- SYS_RESOURCE
|
||||
|
||||
operator:
|
||||
replicas: 1
|
||||
49
clusters/noble/apps/cilium/values-kpr.yaml
Normal file
49
clusters/noble/apps/cilium/values-kpr.yaml
Normal file
@@ -0,0 +1,49 @@
|
||||
# Optional phase 2: kube-proxy replacement via Cilium + KubePrism (Talos apid forwards :7445 → :6443).
|
||||
# Prerequisites:
|
||||
# 1. Phase 1 Cilium installed and healthy; nodes Ready.
|
||||
# 2. Add to Talos machine config on ALL nodes:
|
||||
# cluster:
|
||||
# proxy:
|
||||
# disabled: true
|
||||
# (keep cluster.network.cni.name: none). Regenerate, apply-config, reboot as needed.
|
||||
# 3. Remove legacy kube-proxy objects if still present:
|
||||
# kubectl delete ds -n kube-system kube-proxy --ignore-not-found
|
||||
# kubectl delete cm -n kube-system kube-proxy --ignore-not-found
|
||||
# 4. helm upgrade cilium ... -f values-kpr.yaml
|
||||
#
|
||||
# Ref: https://www.talos.dev/latest/kubernetes-guides/network/deploying-cilium/
|
||||
|
||||
ipam:
|
||||
mode: kubernetes
|
||||
|
||||
kubeProxyReplacement: "true"
|
||||
|
||||
k8sServiceHost: localhost
|
||||
k8sServicePort: "7445"
|
||||
|
||||
securityContext:
|
||||
capabilities:
|
||||
ciliumAgent:
|
||||
- CHOWN
|
||||
- KILL
|
||||
- NET_ADMIN
|
||||
- NET_RAW
|
||||
- IPC_LOCK
|
||||
- SYS_ADMIN
|
||||
- SYS_RESOURCE
|
||||
- DAC_OVERRIDE
|
||||
- FOWNER
|
||||
- SETGID
|
||||
- SETUID
|
||||
cleanCiliumState:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
- SYS_RESOURCE
|
||||
|
||||
cgroup:
|
||||
autoMount:
|
||||
enabled: false
|
||||
hostRoot: /sys/fs/cgroup
|
||||
|
||||
bpf:
|
||||
masquerade: false
|
||||
44
clusters/noble/apps/cilium/values.yaml
Normal file
44
clusters/noble/apps/cilium/values.yaml
Normal file
@@ -0,0 +1,44 @@
|
||||
# Cilium on Talos — phase 1: bring up CNI while kube-proxy still runs.
|
||||
# See README.md for install order (before MetalLB scheduling) and optional kube-proxy replacement.
|
||||
#
|
||||
# Chart: cilium/cilium — pin version in helm command (e.g. 1.16.6).
|
||||
# Ref: https://www.talos.dev/latest/kubernetes-guides/network/deploying-cilium/
|
||||
|
||||
ipam:
|
||||
mode: kubernetes
|
||||
|
||||
kubeProxyReplacement: "false"
|
||||
|
||||
# Host-network components cannot use kubernetes.default ClusterIP; Talos KubePrism (enabled by default)
|
||||
# on 127.0.0.1:7445 proxies to healthy apiservers and avoids flaky dials to cluster.controlPlane.endpoint (VIP).
|
||||
# Ref: https://www.talos.dev/latest/kubernetes-guides/configuration/kubeprism/
|
||||
k8sServiceHost: "127.0.0.1"
|
||||
k8sServicePort: "7445"
|
||||
|
||||
securityContext:
|
||||
capabilities:
|
||||
ciliumAgent:
|
||||
- CHOWN
|
||||
- KILL
|
||||
- NET_ADMIN
|
||||
- NET_RAW
|
||||
- IPC_LOCK
|
||||
- SYS_ADMIN
|
||||
- SYS_RESOURCE
|
||||
- DAC_OVERRIDE
|
||||
- FOWNER
|
||||
- SETGID
|
||||
- SETUID
|
||||
cleanCiliumState:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
- SYS_RESOURCE
|
||||
|
||||
cgroup:
|
||||
autoMount:
|
||||
enabled: false
|
||||
hostRoot: /sys/fs/cgroup
|
||||
|
||||
# Workaround: Talos host DNS forwarding + bpf masquerade can break CoreDNS; see Talos Cilium guide "Known issues".
|
||||
bpf:
|
||||
masquerade: false
|
||||
@@ -1,23 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: kube-vip
|
||||
namespace: argocd
|
||||
annotations:
|
||||
argocd.argoproj.io/sync-wave: "-1"
|
||||
spec:
|
||||
project: default
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: kube-system
|
||||
source:
|
||||
repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git
|
||||
targetRevision: HEAD
|
||||
path: clusters/noble/apps/kube-vip
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
|
||||
@@ -3,4 +3,3 @@ kind: Kustomization
|
||||
resources:
|
||||
- vip-rbac.yaml
|
||||
- vip-daemonset.yaml
|
||||
|
||||
|
||||
@@ -4,6 +4,11 @@ metadata:
|
||||
name: kube-vip-ds
|
||||
namespace: kube-system
|
||||
spec:
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 1
|
||||
maxSurge: 0
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kube-vip-ds
|
||||
@@ -13,6 +18,9 @@ spec:
|
||||
app.kubernetes.io/name: kube-vip-ds
|
||||
spec:
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
priorityClassName: system-node-critical
|
||||
terminationGracePeriodSeconds: 90
|
||||
serviceAccountName: kube-vip
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/control-plane: ""
|
||||
@@ -32,6 +40,12 @@ spec:
|
||||
args:
|
||||
- manager
|
||||
env:
|
||||
# Leader election identity must be the Kubernetes node name (hostNetwork
|
||||
# hostname is not always the same; without this, no leader → no VIP).
|
||||
- name: vip_nodename
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
- name: vip_arp
|
||||
value: "true"
|
||||
- name: address
|
||||
@@ -41,29 +55,29 @@ spec:
|
||||
# Physical uplink from `talosctl -n <cp-ip> get links` (this cluster: ens18).
|
||||
- name: vip_interface
|
||||
value: "ens18"
|
||||
# Must include "/" — kube-vip does netlink.ParseAddr(address + subnet); "32" breaks (192.168.50.x32).
|
||||
- name: vip_subnet
|
||||
value: "32"
|
||||
value: "/32"
|
||||
- name: vip_leaderelection
|
||||
value: "true"
|
||||
- name: cp_enable
|
||||
value: "true"
|
||||
- name: cp_namespace
|
||||
value: "kube-system"
|
||||
# Control-plane VIP only until stable: with svc_enable=true the services leader-election
|
||||
# path calls log.Fatal on many failures / leadership moves → CrashLoopBackOff on all CP nodes.
|
||||
# Re-enable "true" after pods are 1/1; if they loop again, capture: kubectl logs -n kube-system -l app.kubernetes.io/name=kube-vip-ds --previous --tail=100
|
||||
- name: svc_enable
|
||||
value: "true"
|
||||
# Env is svc_election (not servicesElection); see pkg/kubevip/config_envvar.go
|
||||
- name: svc_election
|
||||
value: "true"
|
||||
value: "false"
|
||||
- name: vip_leaseduration
|
||||
value: "5"
|
||||
value: "15"
|
||||
- name: vip_renewdeadline
|
||||
value: "3"
|
||||
value: "10"
|
||||
- name: vip_retryperiod
|
||||
value: "1"
|
||||
value: "2"
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- NET_ADMIN
|
||||
- NET_RAW
|
||||
- SYS_TIME
|
||||
|
||||
|
||||
@@ -10,14 +10,20 @@ metadata:
|
||||
name: kube-vip-role
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["services", "services/status", "nodes", "endpoints"]
|
||||
resources: ["services", "services/status", "endpoints"]
|
||||
verbs: ["get", "list", "watch", "update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["get", "list", "watch", "update", "patch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["events"]
|
||||
verbs: ["create", "patch", "update"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resources: ["leases"]
|
||||
verbs: ["get", "list", "watch", "create", "update", "patch"]
|
||||
- apiGroups: ["discovery.k8s.io"]
|
||||
resources: ["endpointslices"]
|
||||
verbs: ["get", "list", "watch", "update"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
@@ -31,4 +37,3 @@ subjects:
|
||||
- kind: ServiceAccount
|
||||
name: kube-vip
|
||||
namespace: kube-system
|
||||
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- argocd/application.yaml
|
||||
- cilium/application.yaml
|
||||
- kube-vip/application.yaml
|
||||
- longhorn/application.yaml
|
||||
- monitoring-kube-prometheus/application.yaml
|
||||
- monitoring-loki/application.yaml
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: longhorn
|
||||
namespace: argocd
|
||||
annotations:
|
||||
argocd.argoproj.io/sync-wave: "1"
|
||||
spec:
|
||||
project: default
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: longhorn-system
|
||||
sources:
|
||||
- repoURL: https://charts.longhorn.io
|
||||
chart: longhorn
|
||||
targetRevision: "1.11.1"
|
||||
helm:
|
||||
skipCrds: false
|
||||
valuesObject:
|
||||
defaultSettings:
|
||||
createDefaultDiskLabeledNodes: false
|
||||
defaultDataPath: /var/mnt/longhorn
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
retry:
|
||||
limit: 5
|
||||
backoff:
|
||||
duration: 20s
|
||||
factor: 2
|
||||
maxDuration: 3m
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
- PruneLast=true
|
||||
4
clusters/noble/apps/longhorn/kustomization.yaml
Normal file
4
clusters/noble/apps/longhorn/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
10
clusters/noble/apps/longhorn/namespace.yaml
Normal file
10
clusters/noble/apps/longhorn/namespace.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
# Longhorn Manager uses hostPath + privileged; incompatible with Pod Security "baseline".
|
||||
# Apply before or after Helm — merges labels onto existing longhorn-system.
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: longhorn-system
|
||||
labels:
|
||||
pod-security.kubernetes.io/enforce: privileged
|
||||
pod-security.kubernetes.io/audit: privileged
|
||||
pod-security.kubernetes.io/warn: privileged
|
||||
21
clusters/noble/apps/longhorn/values.yaml
Normal file
21
clusters/noble/apps/longhorn/values.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
# Longhorn Helm values — use with Talos user volume + kubelet mounts (see talos/talconfig.yaml).
|
||||
# 1) PSA: `kubectl apply -k clusters/noble/apps/longhorn` (privileged namespace) before or after Helm.
|
||||
# 2) Talos: bind `/var/lib/longhorn` → `/var/mnt/longhorn` in kubelet extraMounts — chart hostPath is fixed to /var/lib/longhorn.
|
||||
# Example (run from home-server repo root so -f path resolves):
|
||||
# kubectl apply -k clusters/noble/apps/longhorn
|
||||
# helm repo add longhorn https://charts.longhorn.io && helm repo update
|
||||
# helm upgrade --install longhorn longhorn/longhorn -n longhorn-system --create-namespace \
|
||||
# -f clusters/noble/apps/longhorn/values.yaml
|
||||
# "helm upgrade --install" needs two arguments: RELEASE_NAME and CHART (e.g. longhorn longhorn/longhorn).
|
||||
#
|
||||
# If you already installed Longhorn without this file: fix Default Settings in the UI or edit each
|
||||
# node's disk path to /var/mnt/longhorn; wrong path → "wrong format" (root fs / overlay).
|
||||
|
||||
defaultSettings:
|
||||
defaultDataPath: /var/mnt/longhorn
|
||||
# Default 30% reserved often makes small data disks look "full" to the scheduler.
|
||||
storageReservedPercentageForDefaultDisk: "10"
|
||||
|
||||
# Pre-upgrade Job waits for healthy managers; disable while fixing Talos image (iscsi-tools) / kubelet binds, then re-enable.
|
||||
preUpgradeChecker:
|
||||
jobEnabled: false
|
||||
52
clusters/noble/apps/metallb/README.md
Normal file
52
clusters/noble/apps/metallb/README.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# MetalLB (layer 2) — noble
|
||||
|
||||
**Prerequisite (Talos + `cni: none`):** install **Cilium** (or your CNI) **before** MetalLB.
|
||||
|
||||
Until the CNI is up, nodes stay **`NotReady`** and carry taints such as **`node.kubernetes.io/network-unavailable`** (and **`not-ready`**). The scheduler then reports **`0/N nodes are available: N node(s) had untolerated taint(s)`** and MetalLB stays **`Pending`** — its chart does not tolerate those taints, by design. **Install Cilium first** (`talos/CLUSTER-BUILD.md` Phase B); when nodes are **`Ready`**, reinstall or rollout MetalLB if needed.
|
||||
|
||||
**Order:** namespace (Pod Security) → **Helm** (CRDs + controller) → **kustomize** (pool + L2).
|
||||
|
||||
If `kubectl apply -k` fails with **`no matches for kind "IPAddressPool"`** / **`ensure CRDs are installed first`**, Helm is not installed yet.
|
||||
|
||||
**Pod Security warnings** (`would violate PodSecurity "restricted"`): MetalLB’s speaker/FRR use `hostNetwork`, `NET_ADMIN`, etc. That is expected unless `metallb-system` is labeled **privileged**. Apply `namespace.yaml` **before** Helm so the namespace is created with the right labels (omit `--create-namespace` on Helm), or patch an existing namespace:
|
||||
|
||||
```bash
|
||||
kubectl apply -f clusters/noble/apps/metallb/namespace.yaml
|
||||
```
|
||||
|
||||
If you already ran Helm with `--create-namespace`, either `kubectl apply -f namespace.yaml` (merges labels) or:
|
||||
|
||||
```bash
|
||||
kubectl label namespace metallb-system \
|
||||
pod-security.kubernetes.io/enforce=privileged \
|
||||
pod-security.kubernetes.io/audit=privileged \
|
||||
pod-security.kubernetes.io/warn=privileged --overwrite
|
||||
```
|
||||
|
||||
Then restart MetalLB pods if they were failing (`kubectl get pods -n metallb-system`; delete stuck pods or `kubectl rollout restart` each `Deployment` / `DaemonSet` in that namespace).
|
||||
|
||||
1. Install the MetalLB chart (CRDs + controller). If you applied `namespace.yaml` above, **skip** `--create-namespace`:
|
||||
|
||||
```bash
|
||||
helm repo add metallb https://metallb.github.io/metallb
|
||||
helm repo update
|
||||
helm upgrade --install metallb metallb/metallb \
|
||||
--namespace metallb-system \
|
||||
--wait
|
||||
```
|
||||
|
||||
2. Apply this folder’s pool and L2 advertisement:
|
||||
|
||||
```bash
|
||||
kubectl apply -k clusters/noble/apps/metallb
|
||||
```
|
||||
|
||||
3. Confirm a test `Service` `type: LoadBalancer` receives an address in `192.168.50.210`–`192.168.50.229`.
|
||||
|
||||
Reserve **one** IP in that range for Argo CD (e.g. `192.168.50.210`) via `spec.loadBalancerIP` or chart values when you expose the server.
|
||||
|
||||
### `Pending` MetalLB pods
|
||||
|
||||
1. `kubectl get nodes` — every node **`Ready`**? If **`NotReady`** or **`NetworkUnavailable`**, finish **CNI** install first.
|
||||
2. `kubectl describe pod -n metallb-system <pod-name>` — read **Events** at the bottom (`0/N nodes are available: …`).
|
||||
3. L2 speaker uses the node’s uplink; kube-vip in this repo expects **`ens18`** on control planes (`clusters/noble/apps/kube-vip/vip-daemonset.yaml`). If your NIC name differs, change `vip_interface` there.
|
||||
19
clusters/noble/apps/metallb/ip-address-pool.yaml
Normal file
19
clusters/noble/apps/metallb/ip-address-pool.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
# Apply after MetalLB controller is installed (Helm chart or manifest).
|
||||
# Namespace must match where MetalLB expects pools (commonly metallb-system).
|
||||
apiVersion: metallb.io/v1beta1
|
||||
kind: IPAddressPool
|
||||
metadata:
|
||||
name: noble-l2
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
addresses:
|
||||
- 192.168.50.210-192.168.50.229
|
||||
---
|
||||
apiVersion: metallb.io/v1beta1
|
||||
kind: L2Advertisement
|
||||
metadata:
|
||||
name: noble-l2
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
ipAddressPools:
|
||||
- noble-l2
|
||||
4
clusters/noble/apps/metallb/kustomization.yaml
Normal file
4
clusters/noble/apps/metallb/kustomization.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ip-address-pool.yaml
|
||||
11
clusters/noble/apps/metallb/namespace.yaml
Normal file
11
clusters/noble/apps/metallb/namespace.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
# Apply before Helm if you do not use --create-namespace, or use this to fix PSA after the fact:
|
||||
# kubectl apply -f clusters/noble/apps/metallb/namespace.yaml
|
||||
# MetalLB speaker needs hostNetwork + NET_ADMIN; incompatible with Pod Security "restricted".
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: metallb-system
|
||||
labels:
|
||||
pod-security.kubernetes.io/enforce: privileged
|
||||
pod-security.kubernetes.io/audit: privileged
|
||||
pod-security.kubernetes.io/warn: privileged
|
||||
10
clusters/noble/apps/metrics-server/values.yaml
Normal file
10
clusters/noble/apps/metrics-server/values.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
# metrics-server — noble (Talos)
|
||||
# Kubelet serving certs are not validated by default; see Talos docs:
|
||||
# https://www.talos.dev/latest/kubernetes-guides/configuration/deploy-metrics-server/
|
||||
#
|
||||
# helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
|
||||
# helm upgrade --install metrics-server metrics-server/metrics-server -n kube-system \
|
||||
# --version 3.13.0 -f clusters/noble/apps/metrics-server/values.yaml --wait
|
||||
|
||||
args:
|
||||
- --kubelet-insecure-tls
|
||||
@@ -1,64 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: monitoring-kube-prometheus
|
||||
namespace: argocd
|
||||
annotations:
|
||||
argocd.argoproj.io/sync-wave: "2"
|
||||
spec:
|
||||
project: default
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: monitoring
|
||||
sources:
|
||||
- repoURL: https://prometheus-community.github.io/helm-charts
|
||||
chart: kube-prometheus-stack
|
||||
targetRevision: "*"
|
||||
helm:
|
||||
valuesObject:
|
||||
prometheus:
|
||||
prometheusSpec:
|
||||
retention: 15d
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
storageClassName: longhorn
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
alertmanager:
|
||||
alertmanagerSpec:
|
||||
retention: 120h
|
||||
storage:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
storageClassName: longhorn
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
kubeEtcd:
|
||||
enabled: false
|
||||
kubeScheduler:
|
||||
enabled: false
|
||||
kubeControllerManager:
|
||||
enabled: false
|
||||
grafana:
|
||||
defaultDashboardsTimezone: browser
|
||||
additionalDataSources:
|
||||
- name: Loki
|
||||
type: loki
|
||||
uid: loki
|
||||
access: proxy
|
||||
url: http://loki-stack.monitoring.svc.cluster.local:3100
|
||||
isDefault: false
|
||||
editable: true
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
@@ -1,42 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: monitoring-loki
|
||||
namespace: argocd
|
||||
annotations:
|
||||
argocd.argoproj.io/sync-wave: "2"
|
||||
spec:
|
||||
project: default
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: monitoring
|
||||
sources:
|
||||
- repoURL: https://grafana.github.io/helm-charts
|
||||
chart: loki-stack
|
||||
targetRevision: "*"
|
||||
helm:
|
||||
valuesObject:
|
||||
loki:
|
||||
enabled: true
|
||||
persistence:
|
||||
enabled: true
|
||||
storageClassName: longhorn
|
||||
size: 20Gi
|
||||
promtail:
|
||||
enabled: true
|
||||
grafana:
|
||||
enabled: false
|
||||
prometheus:
|
||||
enabled: false
|
||||
filebeat:
|
||||
enabled: false
|
||||
fluent-bit:
|
||||
enabled: false
|
||||
logstash:
|
||||
enabled: false
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
@@ -1,16 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: argocd-server-lb
|
||||
namespace: argocd
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
loadBalancerIP: 192.168.50.231
|
||||
selector:
|
||||
app.kubernetes.io/name: argocd-server
|
||||
ports:
|
||||
- name: https
|
||||
protocol: TCP
|
||||
port: 443
|
||||
targetPort: 8080
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: AppProject
|
||||
metadata:
|
||||
name: default
|
||||
namespace: argocd
|
||||
spec:
|
||||
description: Default project for noble cluster apps
|
||||
sourceRepos:
|
||||
- '*'
|
||||
destinations:
|
||||
- namespace: '*'
|
||||
server: '*'
|
||||
clusterResourceWhitelist:
|
||||
- group: '*'
|
||||
kind: '*'
|
||||
namespaceResourceWhitelist:
|
||||
- group: '*'
|
||||
kind: '*'
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: argocd
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- https://raw.githubusercontent.com/argoproj/argo-cd/v2.14.7/manifests/install.yaml
|
||||
# Apply after install.yaml CRDs are Established (see README bootstrap); same file for GitOps retries.
|
||||
- default-appproject.yaml
|
||||
- argocd-server-lb.yaml
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: argocd
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: noble-root
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: argocd
|
||||
source:
|
||||
repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git
|
||||
targetRevision: HEAD
|
||||
path: clusters/noble/apps
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
|
||||
5
talos/.gitignore
vendored
Normal file
5
talos/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
# talhelper / talosctl local output (prefer `talhelper genconfig -o out` → parent ignores talos/out/)
|
||||
clusterconfig/
|
||||
talsecret.yaml
|
||||
talsecret.yml
|
||||
kubeconfig
|
||||
138
talos/CLUSTER-BUILD.md
Normal file
138
talos/CLUSTER-BUILD.md
Normal file
@@ -0,0 +1,138 @@
|
||||
# Noble lab — Talos cluster build checklist
|
||||
|
||||
This document is the **exported TODO** for the **noble** Talos cluster (4 nodes). Commands and troubleshooting live in [`README.md`](./README.md).
|
||||
|
||||
## Current state (2026-03-28)
|
||||
|
||||
- **Talos** v1.12.6 (target) / **Kubernetes** as bundled — four nodes **Ready** unless upgrading; **`talosctl health`**; **`talos/kubeconfig`** for `kubectl` (root `kubeconfig` may still be a stub). **Image Factory (nocloud installer):** `factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6`
|
||||
- **Cilium** Helm **1.16.6** / app **1.16.6** (`clusters/noble/apps/cilium/`, phase 1 values).
|
||||
- **MetalLB** Helm **0.15.3** / app **v0.15.3**; **IPAddressPool** `noble-l2` + **L2Advertisement** — pool **`192.168.50.210`–`192.168.50.229`**.
|
||||
- **kube-vip** DaemonSet **3/3** on control planes; VIP **`192.168.50.230`** on **`ens18`** (`vip_subnet` **`/32`** required — bare **`32`** breaks parsing). **Verified from workstation:** `kubectl config set-cluster noble --server=https://192.168.50.230:6443` then **`kubectl get --raw /healthz`** → **`ok`** (`talos/kubeconfig`; see `talos/README.md`).
|
||||
- **metrics-server** Helm **3.13.0** / app **v0.8.0** — `clusters/noble/apps/metrics-server/values.yaml` (`--kubelet-insecure-tls` for Talos); **`kubectl top nodes`** works.
|
||||
- **Still open:** Longhorn, Traefik, cert-manager, Argo CD, observability — checklist below.
|
||||
|
||||
## Inventory
|
||||
|
||||
| Host | Role | IP |
|
||||
|------|------|-----|
|
||||
| helium | worker | `192.168.50.10` |
|
||||
| neon | control-plane + worker | `192.168.50.20` |
|
||||
| argon | control-plane + worker | `192.168.50.30` |
|
||||
| krypton | control-plane + worker | `192.168.50.40` |
|
||||
|
||||
## Network reservations
|
||||
|
||||
| Use | Value |
|
||||
|-----|--------|
|
||||
| Kubernetes API VIP (kube-vip) | `192.168.50.230` (see `talos/README.md`; align with `talos/talconfig.yaml` `additionalApiServerCertSans`) |
|
||||
| MetalLB L2 pool | `192.168.50.210`–`192.168.50.229` |
|
||||
| Argo CD `LoadBalancer` | **Pick one IP** in the MetalLB pool (e.g. `192.168.50.210`) |
|
||||
| Apps ingress DNS | `*.apps.noble.lab.pcenicni.dev` |
|
||||
| ExternalDNS | Pangolin (map to supported ExternalDNS provider when documented) |
|
||||
| Velero | S3-compatible URL — configure later |
|
||||
|
||||
## Versions
|
||||
|
||||
- Talos: **v1.12.6** — align `talosctl` client with node image
|
||||
- Talos **Image Factory** (iscsi-tools + util-linux-tools): **`factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6`** — same schematic must appear in **`machine.install.image`** after `talhelper genconfig` (bare metal may use `metal-installer/` instead of `nocloud-installer/`)
|
||||
- Kubernetes: **1.35.2** on current nodes (bundled with Talos; not pinned in repo)
|
||||
- Cilium: **1.16.6** (Helm chart; see `clusters/noble/apps/cilium/README.md`)
|
||||
- MetalLB: **0.15.3** (Helm chart; app **v0.15.3**)
|
||||
- metrics-server: **3.13.0** (Helm chart; app **v0.8.0**)
|
||||
|
||||
## Repo paths (this workspace)
|
||||
|
||||
| Artifact | Path |
|
||||
|----------|------|
|
||||
| This checklist | `talos/CLUSTER-BUILD.md` |
|
||||
| Talos quick start + networking + kubeconfig | `talos/README.md` |
|
||||
| talhelper source (active) | `talos/talconfig.yaml` — may be **wipe-phase** (no Longhorn volume) during disk recovery |
|
||||
| Longhorn volume restore | `talos/talconfig.with-longhorn.yaml` — copy to `talconfig.yaml` after GPT wipe (see `talos/README.md` §5) |
|
||||
| Longhorn GPT wipe automation | `talos/scripts/longhorn-gpt-recovery.sh` |
|
||||
| kube-vip (kustomize) | `clusters/noble/apps/kube-vip/` (`vip_interface` e.g. `ens18`) |
|
||||
| Cilium (Helm values) | `clusters/noble/apps/cilium/` — `values.yaml` (phase 1), optional `values-kpr.yaml`, `README.md` |
|
||||
| MetalLB | `clusters/noble/apps/metallb/` — `namespace.yaml` (PSA **privileged**), `ip-address-pool.yaml`, `kustomization.yaml`, `README.md` |
|
||||
| Longhorn Helm values | `clusters/noble/apps/longhorn/values.yaml` |
|
||||
| metrics-server (Helm values) | `clusters/noble/apps/metrics-server/values.yaml` |
|
||||
|
||||
**Git vs cluster:** manifests and `talconfig` live in git; **`talhelper genconfig -o out`**, bootstrap, Helm, and `kubectl` run on your LAN. See **`talos/README.md`** for workstation reachability (lab LAN/VPN), **`talosctl kubeconfig`** vs Kubernetes `server:` (VIP vs node IP), and **`--insecure`** only in maintenance.
|
||||
|
||||
## Ordering (do not skip)
|
||||
|
||||
1. **Talos** installed; **Cilium** (or chosen CNI) **before** most workloads — with `cni: none`, nodes stay **NotReady** / **network-unavailable** taint until CNI is up.
|
||||
2. **MetalLB Helm chart** (CRDs + controller) **before** `kubectl apply -k` on the pool manifests.
|
||||
3. Apply **`clusters/noble/apps/metallb/namespace.yaml`** before the chart install (or merge its labels onto the existing `metallb-system` namespace) so Pod Security admission does not block the speaker pods (see `apps/metallb/README.md`).
|
||||
4. **Longhorn:** Talos user volume + extensions in `talconfig.with-longhorn.yaml` (when restored); Helm **`defaultDataPath`** in `clusters/noble/apps/longhorn/values.yaml`.
|
||||
|
||||
## Prerequisites (before phases)
|
||||
|
||||
- [x] `talos/talconfig.yaml` checked in (VIP, API SANs, `cni: none`, `iscsi-tools` / `util-linux-tools` in schematic) — run `talhelper validate talconfig talconfig.yaml` after edits
|
||||
- [x] Workstation on a **routable path** to node IPs or VIP (same LAN / VPN); `talos/README.md` §3 if `kubectl` hits wrong `server:` or `network is unreachable`
|
||||
- [x] `talosctl` client matches node Talos version; `talhelper` for `genconfig`
|
||||
- [x] Node static IPs (helium, neon, argon, krypton)
|
||||
- [x] DHCP does not lease `192.168.50.210`–`229`, `230`, or node IPs
|
||||
- [x] DNS for API and apps as in `talos/README.md`
|
||||
- [x] Git remote ready for Argo CD (argo-cd)
|
||||
- [x] **`talos/kubeconfig`** from `talosctl kubeconfig` — root repo `kubeconfig` is a stub until populated
|
||||
|
||||
## Phase A — Talos bootstrap + API VIP
|
||||
|
||||
- [x] `talhelper gensecret` → `talhelper genconfig -o out` (re-run `genconfig` after every `talconfig` edit)
|
||||
- [x] `apply-config` all nodes (`talos/README.md` §2 — **no** `--insecure` after nodes join; use `TALOSCONFIG`)
|
||||
- [x] `talosctl bootstrap` once; other control planes and worker join
|
||||
- [x] `talosctl kubeconfig` → working `kubectl` (`talos/README.md` §3 — override `server:` if VIP not reachable from workstation)
|
||||
- [x] **kube-vip manifests** in `clusters/noble/apps/kube-vip`
|
||||
- [x] kube-vip healthy; `vip_interface` matches uplink (`talosctl get links`); VIP reachable where needed
|
||||
- [x] `talosctl health` (e.g. `talosctl health -n 192.168.50.20` with `TALOSCONFIG` set)
|
||||
|
||||
## Phase B — Core platform
|
||||
|
||||
**Install order:** **Cilium** → **metrics-server** → **Longhorn** (Talos disk + Helm) → **MetalLB** (Helm → pool manifests) → ingress / certs / DNS as planned.
|
||||
|
||||
- [x] **Cilium** (Helm **1.16.6**) — **required** before MetalLB if `cni: none` (`clusters/noble/apps/cilium/`)
|
||||
- [x] **metrics-server** — Helm **3.13.0**; values in `clusters/noble/apps/metrics-server/values.yaml`; verify `kubectl top nodes`
|
||||
- [ ] **Longhorn** — Talos: `talconfig.with-longhorn.yaml` + `talos/README.md` §5; Helm: `clusters/noble/apps/longhorn/values.yaml` (`defaultDataPath` `/var/mnt/longhorn`)
|
||||
- [x] **MetalLB** — chart installed; **pool + L2** from `clusters/noble/apps/metallb/` applied (`192.168.50.210`–`229`)
|
||||
- [ ] **`Service` `LoadBalancer`** test — assign an IP from `210`–`229` (e.g. dummy `LoadBalancer` or Traefik)
|
||||
- [ ] **Traefik** `LoadBalancer` for `*.apps.noble.lab.pcenicni.dev`
|
||||
- [ ] **cert-manager** + ClusterIssuer (staging → prod)
|
||||
- [ ] **ExternalDNS** (Pangolin-compatible provider)
|
||||
|
||||
## Phase C — GitOps
|
||||
|
||||
- [ ] **Argo CD** bootstrap (`clusters/noble/bootstrap/argocd`, root app) — path TBD when added
|
||||
- [ ] Argo CD server **LoadBalancer** with dedicated pool IP
|
||||
- [ ] SSO — later
|
||||
|
||||
## Phase D — Observability
|
||||
|
||||
- [ ] **kube-prometheus-stack** (PVCs on Longhorn)
|
||||
- [ ] **Loki** + **Fluent Bit**; Grafana datasource
|
||||
|
||||
## Phase E — Secrets
|
||||
|
||||
- [ ] **Sealed Secrets** (optional Git workflow)
|
||||
- [ ] **Vault** in-cluster on Longhorn + **auto-unseal**
|
||||
- [ ] **External Secrets Operator** + Vault `ClusterSecretStore`
|
||||
|
||||
## Phase F — Policy + backups
|
||||
|
||||
- [ ] **Kyverno** baseline policies
|
||||
- [ ] **Velero** when S3 is ready; backup/restore drill
|
||||
|
||||
## Phase G — Hardening
|
||||
|
||||
- [ ] RBAC, network policies (Cilium), Alertmanager routes
|
||||
- [ ] Runbooks: API VIP, etcd, Longhorn, Vault
|
||||
|
||||
## Quick validation
|
||||
|
||||
- [x] `kubectl get nodes` — all **Ready**
|
||||
- [x] API via VIP `:6443` — **`kubectl get --raw /healthz`** → **`ok`** with kubeconfig **`server:`** `https://192.168.50.230:6443`
|
||||
- [ ] Test `LoadBalancer` receives IP from `210`–`229`
|
||||
- [ ] Sample Ingress + cert + ExternalDNS record
|
||||
- [ ] PVC bound; Prometheus/Loki durable if configured
|
||||
|
||||
---
|
||||
|
||||
*Keep in sync with `talos/README.md` and manifests under `clusters/noble/`.*
|
||||
574
talos/README.md
574
talos/README.md
@@ -1,544 +1,182 @@
|
||||
# Talos deployment (4 nodes)
|
||||
# Talos — noble lab
|
||||
|
||||
This directory contains a `talhelper` cluster definition for a 4-node Talos
|
||||
cluster:
|
||||
- **Cluster build checklist (exported TODO):** [CLUSTER-BUILD.md](./CLUSTER-BUILD.md)
|
||||
|
||||
- 3 hybrid control-plane/worker nodes: `noble-cp-1..3`
|
||||
- 1 worker-only node: `noble-worker-1`
|
||||
- `allowSchedulingOnControlPlanes: true`
|
||||
- CNI: `none` (for Cilium via GitOps)
|
||||
## Versions
|
||||
|
||||
## 1) Update values for your environment
|
||||
Align with [CLUSTER-BUILD.md](./CLUSTER-BUILD.md): Talos **v1.12.6**; `talosctl` client should match installed node image.
|
||||
|
||||
Edit `talconfig.yaml`:
|
||||
## DNS (prerequisites)
|
||||
|
||||
- `endpoint` (Kubernetes API VIP or LB IP)
|
||||
- **`additionalApiServerCertSans`** / **`additionalMachineCertSans`**: must include the
|
||||
**same VIP** (and DNS name, if you use one) that clients and `talosctl` use —
|
||||
otherwise TLS to `https://<VIP>:6443` fails because the cert only lists node
|
||||
IPs by default. This repo sets **`192.168.50.230`** (and
|
||||
**`kube.noble.lab.pcenicni.dev`**) to match kube-vip.
|
||||
- each node `ipAddress`
|
||||
- each node `installDisk` (for example `/dev/sda`, `/dev/nvme0n1`)
|
||||
- `talosVersion` / `kubernetesVersion` if desired
|
||||
| Name | Points to |
|
||||
|------|-----------|
|
||||
| `noble.lab`, `kube.noble.lab` (API SANs) | `192.168.50.230` (kube-vip) |
|
||||
| `*.apps.noble.lab.pcenicni.dev` | Traefik `LoadBalancer` IP from MetalLB pool (`192.168.50.210`–`229`) once ingress is up |
|
||||
|
||||
After changing SANs, run **`talhelper genconfig`**, re-**apply-config** to all
|
||||
**control-plane** nodes (certs are regenerated), then refresh **`talosctl kubeconfig`**.
|
||||
|
||||
## 2) Generate cluster secrets and machine configs
|
||||
## 1. Secrets and generated configs
|
||||
|
||||
From this directory:
|
||||
|
||||
```bash
|
||||
talhelper gensecret > talsecret.sops.yaml
|
||||
talhelper genconfig
|
||||
talhelper gensecret > talsecret.yaml
|
||||
# Encrypt for git if desired: sops -e -i talsecret.sops.yaml (see talhelper docs)
|
||||
|
||||
talhelper genconfig -o out
|
||||
```
|
||||
|
||||
Generated machine configs are written to `clusterconfig/`.
|
||||
`out/` is ignored via repo root `.gitignore` (`talos/out/`). Do not commit `talsecret.yaml` or generated machine configs.
|
||||
|
||||
## 3) Apply Talos configs
|
||||
**After any `talconfig.yaml` edit, run `genconfig` again** before `apply-config`. Stale `out/*.yaml` is easy to apply by mistake. Quick check: `grep -A8 kind: UserVolumeConfig out/noble-neon.yaml` should match what you expect (e.g. Longhorn `volumeType: disk`, not `grow`/`maxSize` on a partition).
|
||||
|
||||
Apply each node file to the matching node IP from `talconfig.yaml`:
|
||||
## 2. Apply machine config
|
||||
|
||||
Order: **§1 `genconfig` → apply all nodes → §3 bootstrap** (not the reverse). Use the same `talsecret` / `out/` generation for the life of the cluster; rotating secrets without reinstalling nodes breaks client trust.
|
||||
|
||||
**A) First install — node still in maintenance mode** (no Talos OS on disk yet, or explicitly in maintenance):
|
||||
|
||||
```bash
|
||||
talosctl apply-config --insecure -n 192.168.50.20 -f clusterconfig/noble-noble-cp-1.yaml
|
||||
talosctl apply-config --insecure -n 192.168.50.30 -f clusterconfig/noble-noble-cp-2.yaml
|
||||
talosctl apply-config --insecure -n 192.168.50.40 -f clusterconfig/noble-noble-cp-3.yaml
|
||||
talosctl apply-config --insecure -n 192.168.50.10 -f clusterconfig/noble-noble-worker-1.yaml
|
||||
talosctl apply-config --insecure -n 192.168.50.20 --file out/noble-neon.yaml
|
||||
# repeat for each node; TALOSCONFIG not required for --insecure maintenance API
|
||||
```
|
||||
|
||||
## 4) Bootstrap the cluster
|
||||
|
||||
After all nodes are up (bootstrap once, from any control-plane node):
|
||||
**B) Node already installed / cluster already bootstrapped** (`tls: certificate required` if you use `--insecure` here):
|
||||
|
||||
```bash
|
||||
talosctl bootstrap -n 192.168.50.20 -e 192.168.50.230
|
||||
talosctl kubeconfig -n 192.168.50.20 -e 192.168.50.230 .
|
||||
export TALOSCONFIG="${TALOSCONFIG:-$(pwd)/out/talosconfig}"
|
||||
talosctl apply-config -n 192.168.50.20 --file out/noble-neon.yaml
|
||||
```
|
||||
|
||||
## 5) Validate
|
||||
**Do not pass `--insecure` for (B).** With `--insecure`, `talosctl` does not use client certificates from `TALOSCONFIG`, so the node still responds with `tls: certificate required`. The flag means “maintenance API only,” not “skip server verification.”
|
||||
|
||||
**Wrong (what triggers the error):**
|
||||
|
||||
```bash
|
||||
talosctl -n 192.168.50.20 -e 192.168.50.230 health
|
||||
kubectl get nodes -o wide
|
||||
export TALOSCONFIG="$(pwd)/out/talosconfig"
|
||||
talosctl apply-config --insecure -n 192.168.50.20 --file out/noble-neon.yaml # still broken on joined nodes
|
||||
```
|
||||
|
||||
### `kubectl` errors: `lookup https: no such host` or `https://https/...`
|
||||
## 3. Bootstrap and kubeconfig
|
||||
|
||||
That means the **active** kubeconfig has a broken `cluster.server` URL (often a
|
||||
**double** `https://` or **duplicate** `:6443`). Kubernetes then tries to resolve
|
||||
the hostname `https`, which fails.
|
||||
|
||||
Inspect what you are using:
|
||||
Bootstrap **once** on the first control plane **after** configs are applied (example: neon):
|
||||
|
||||
```bash
|
||||
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}{"\n"}'
|
||||
export TALOSCONFIG="${TALOSCONFIG:-$(pwd)/out/talosconfig}"
|
||||
talosctl bootstrap -n 192.168.50.20
|
||||
```
|
||||
|
||||
It must be a **single** valid URL, for example:
|
||||
|
||||
- `https://192.168.50.230:6443` (API VIP from `talconfig.yaml`), or
|
||||
- `https://kube.noble.lab.pcenicni.dev:6443` (if DNS points at that VIP)
|
||||
|
||||
Fix the cluster entry (replace `noble` with your context’s cluster name if
|
||||
different):
|
||||
|
||||
```bash
|
||||
kubectl config set-cluster noble --server=https://192.168.50.230:6443
|
||||
```
|
||||
|
||||
Or point `kubectl` at this repo’s kubeconfig (known-good server line):
|
||||
After the API is up (direct node IP first; use VIP after kube-vip is healthy):
|
||||
|
||||
```bash
|
||||
export TALOSCONFIG="${TALOSCONFIG:-$(pwd)/out/talosconfig}"
|
||||
talosctl kubeconfig ./kubeconfig -n 192.168.50.20 -e 192.168.50.230 --merge=false
|
||||
export KUBECONFIG="$(pwd)/kubeconfig"
|
||||
kubectl cluster-info
|
||||
```
|
||||
|
||||
Avoid pasting `https://` twice when running `kubectl config set-cluster ... --server=...`.
|
||||
|
||||
### `kubectl apply` fails: `localhost:8080` / `openapi` connection refused
|
||||
|
||||
`kubectl` is **not** using a real cluster config; it falls back to the default
|
||||
`http://localhost:8080` (no `KUBECONFIG`, empty file, or wrong file).
|
||||
|
||||
Fix:
|
||||
|
||||
```bash
|
||||
cd talos
|
||||
export KUBECONFIG="$(pwd)/kubeconfig"
|
||||
kubectl config current-context
|
||||
kubectl cluster-info
|
||||
```
|
||||
|
||||
Then run `kubectl apply` from the **repository root** (parent of `talos/`) in
|
||||
the same shell. Do **not** use a literal `cd /path/to/...` — that was only a
|
||||
placeholder. Example (adjust to where you cloned this repo):
|
||||
|
||||
```bash
|
||||
export KUBECONFIG="${HOME}/Developer/home-server/talos/kubeconfig"
|
||||
```
|
||||
|
||||
`kubectl config set-cluster noble ...` only updates the file **`kubectl` is
|
||||
actually reading** (often `~/.kube/config`). It does nothing if `KUBECONFIG`
|
||||
points at another path.
|
||||
|
||||
## 6) GitOps-pinned Cilium values
|
||||
|
||||
The Cilium settings that worked for this Talos cluster are now persisted in:
|
||||
|
||||
- `clusters/noble/apps/cilium/helm-values.yaml`
|
||||
- `clusters/noble/apps/cilium/application.yaml` (Helm chart + `valueFiles` from this repo)
|
||||
|
||||
That Argo CD `Application` pins chart `1.16.6` and uses the same values file
|
||||
for API host/port, cgroup settings, IPAM CIDR, and security capabilities.
|
||||
|
||||
### Cilium before Argo CD (`cni: none`)
|
||||
|
||||
This cluster uses **`cniConfig.name: none`** in `talconfig.yaml` so Talos does
|
||||
not install a CNI. **Argo CD pods cannot schedule** until some CNI makes nodes
|
||||
`Ready` (otherwise the `node.kubernetes.io/not-ready` taint blocks scheduling).
|
||||
|
||||
Install Cilium **once** with Helm from your workstation (same chart and values
|
||||
Argo will manage later), **then** bootstrap Argo CD:
|
||||
|
||||
```bash
|
||||
helm repo add cilium https://helm.cilium.io/
|
||||
helm repo update
|
||||
helm upgrade --install cilium cilium/cilium \
|
||||
--namespace kube-system \
|
||||
--version 1.16.6 \
|
||||
-f clusters/noble/apps/cilium/helm-values.yaml \
|
||||
--wait --timeout 10m
|
||||
kubectl get nodes
|
||||
kubectl wait --for=condition=Ready nodes --all --timeout=300s
|
||||
```
|
||||
|
||||
If **`helm --install` seems stuck** after “Installing it now”, it is usually still
|
||||
pulling images (`quay.io/cilium/...`) or waiting for pods to become Ready. In
|
||||
another terminal run `kubectl get pods -n kube-system -w` and check for
|
||||
`ImagePullBackOff`, `Pending`, or `CrashLoopBackOff`. To avoid blocking on
|
||||
Helm’s wait logic, install without `--wait`, confirm Cilium pods, then continue:
|
||||
Adjust `-n` / `-e` if your bootstrap node or VIP differ.
|
||||
|
||||
**Reachability (same idea for Talos and Kubernetes):**
|
||||
|
||||
| Command | What it connects to |
|
||||
|---------|---------------------|
|
||||
| `talosctl … -e <addr>` | Talos **apid** on `<addr>:50000` (not 6443) |
|
||||
| `kubectl` / Helm | Kubernetes API on `https://<addr>:6443` from kubeconfig |
|
||||
|
||||
If your Mac shows **`network is unreachable`** to `192.168.50.230`, fix **L2/L3** first (same **LAN** as the nodes, **VPN**, or routing). **`talosctl kubeconfig -e 192.168.50.20`** only chooses **which Talos node** fetches the admin cert; the **`server:`** URL inside kubeconfig still comes from **`cluster.controlPlane.endpoint`** in Talos config (here **`https://192.168.50.230:6443`**). So `kubectl` can still dial the **VIP** even when `-e` used a node IP.
|
||||
|
||||
After a successful `talosctl kubeconfig`, **point kubectl at a reachable control-plane IP** (same as bootstrap node until kube-vip works from your network):
|
||||
|
||||
```bash
|
||||
helm upgrade --install cilium cilium/cilium \
|
||||
--namespace kube-system \
|
||||
--version 1.16.6 \
|
||||
-f clusters/noble/apps/cilium/helm-values.yaml
|
||||
kubectl get pods -n kube-system -l app.kubernetes.io/part-of=cilium -w
|
||||
export TALOSCONFIG="${TALOSCONFIG:-$(pwd)/out/talosconfig}"
|
||||
talosctl kubeconfig ./kubeconfig -n 192.168.50.20 -e 192.168.50.20 --merge=false
|
||||
export KUBECONFIG="$(pwd)/kubeconfig"
|
||||
# Kubeconfig still says https://192.168.50.230:6443 — override if VIP is unreachable from this machine:
|
||||
kubectl config set-cluster noble --server=https://192.168.50.20:6443
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
`helm-values.yaml` sets **`operator.replicas: 1`** so the chart default (two
|
||||
operators with hard anti-affinity) cannot deadlock `helm --wait` when only one
|
||||
node can take the operator early in bootstrap.
|
||||
|
||||
If **`helm upgrade` fails** with server-side apply conflicts and
|
||||
**`argocd-controller`**, Argo already synced Cilium and **owns those fields**
|
||||
on live objects. Clearing **`syncPolicy`** on the Application does **not**
|
||||
remove that ownership; Helm still conflicts until you **take over** the fields
|
||||
or only use Argo.
|
||||
|
||||
**One-shot CLI fix** (Helm 3.13+): add **`--force-conflicts`** so SSA wins the
|
||||
disputed fields:
|
||||
One-liner alternative (macOS/BSD `sed -i ''`; on Linux use `sed -i`):
|
||||
|
||||
```bash
|
||||
helm upgrade --install cilium cilium/cilium \
|
||||
--namespace kube-system \
|
||||
--version 1.16.6 \
|
||||
-f clusters/noble/apps/cilium/helm-values.yaml \
|
||||
--force-conflicts
|
||||
sed -i '' 's|https://192.168.50.230:6443|https://192.168.50.20:6443|g' kubeconfig
|
||||
```
|
||||
|
||||
Typical conflicts: Secret **`hubble-server-certs`** (`.data` TLS) and
|
||||
Deployment **`cilium-operator`** (`.spec.replicas`,
|
||||
`.spec/strategy/rollingUpdate/maxUnavailable`). The **`cilium` Application**
|
||||
lists **`ignoreDifferences`** for those paths plus **`RespectIgnoreDifferences`**
|
||||
so later Argo syncs do not keep overwriting them. Apply the manifest after you
|
||||
change it: **`kubectl apply -f clusters/noble/apps/cilium/application.yaml`**.
|
||||
Quick check from your Mac: `nc -vz 192.168.50.20 50000` (Talos) and `nc -vz 192.168.50.20 6443` (Kubernetes).
|
||||
|
||||
After bootstrap, prefer syncing Cilium **only through Argo** (from Git) instead
|
||||
of ad hoc Helm, unless you suspend the **`cilium`** Application first.
|
||||
**`dial tcp 192.168.50.230:6443` on nodes:** Host-network components (including **Cilium**) cannot use the in-cluster `kubernetes` Service; they otherwise follow **`cluster.controlPlane.endpoint`** (the VIP). Talos **KubePrism** on **`127.0.0.1:7445`** (default) load-balances to healthy apiservers. Ensure the CNI Helm values set **`k8sServiceHost: "127.0.0.1"`** and **`k8sServicePort: "7445"`** — see [`clusters/noble/apps/cilium/values.yaml`](../clusters/noble/apps/cilium/values.yaml). Also confirm **kube-vip**’s **`vip_interface`** matches the uplink (`talosctl -n <ip> get links` — e.g. **`ens18`** on these nodes). A bare **`curl -k https://192.168.50.230:6443/healthz`** often returns **`401 Unauthorized`** because no client cert was sent — that still means TLS to the VIP worked.
|
||||
|
||||
Shell tip: a line like **`# comment`** must start with **`#`**; if the shell
|
||||
reports **`command not found: #`**, the character is not a real hash or the
|
||||
line was pasted wrong—run **`kubectl apply ...`** as its own command without a
|
||||
leading comment on the same paste block.
|
||||
|
||||
If nodes were already `Ready`, you can skip straight to section 7.
|
||||
|
||||
## 7) Argo CD app-of-apps bootstrap
|
||||
|
||||
This repo includes an app-of-apps structure for cluster apps:
|
||||
|
||||
- Root app: `clusters/noble/root-application.yaml`
|
||||
- Child apps index: `clusters/noble/apps/kustomization.yaml`
|
||||
- Argo CD app: `clusters/noble/apps/argocd/application.yaml`
|
||||
- Cilium app: `clusters/noble/apps/cilium/application.yaml`
|
||||
|
||||
Bootstrap once from your workstation:
|
||||
**Verify the VIP with `kubectl` (copy as-is):** use a real kubeconfig path (not ` /path/to/…`). From the **repository root**:
|
||||
|
||||
```bash
|
||||
kubectl apply -k clusters/noble/bootstrap/argocd
|
||||
kubectl wait --for=condition=Established crd/appprojects.argoproj.io --timeout=120s
|
||||
kubectl apply -f clusters/noble/bootstrap/argocd/default-appproject.yaml
|
||||
kubectl apply -f clusters/noble/root-application.yaml
|
||||
export KUBECONFIG="${KUBECONFIG:-$(pwd)/talos/kubeconfig}"
|
||||
kubectl config set-cluster noble --server=https://192.168.50.230:6443
|
||||
kubectl get --raw /healthz
|
||||
```
|
||||
|
||||
If the first command errors on `AppProject` (“no matches for kind `AppProject`”), the CRDs were not ready yet; run the `kubectl wait` and `kubectl apply -f .../default-appproject.yaml` lines, then continue.
|
||||
Expect a single line: **`ok`**. If you see **`The connection to the server localhost:8080 was refused`**, `KUBECONFIG` was missing or wrong (e.g. a typo such as **`.export`** instead of **`export`**, or a path that does not exist). Do not put **`#` comments** on the same line as `kubectl config set-cluster` when pasting — the comment text travels with the paste and can end up inside the command, breaking it.
|
||||
|
||||
After this, Argo CD continuously reconciles all applications under
|
||||
`clusters/noble/apps/`.
|
||||
**`kubectl` → `localhost:8080` / connection refused:** `talosctl kubeconfig` did **not** write a valid kubeconfig (often because the step above failed). Fix Talos/API reachability first; do not trust `kubectl` until `talosctl kubeconfig` completes without error.
|
||||
|
||||
## 8) kube-vip API VIP (`192.168.50.230`)
|
||||
## 4. Platform manifests (this repo)
|
||||
|
||||
HAProxy has been removed in favor of `kube-vip` running on control-plane nodes.
|
||||
| Component | Apply |
|
||||
|-----------|--------|
|
||||
| Cilium | **Before** kube-vip/MetalLB scheduling: Helm from [`clusters/noble/apps/cilium/README.md`](../clusters/noble/apps/cilium/README.md) (`values.yaml`) |
|
||||
| kube-vip | `kubectl apply -k ../clusters/noble/apps/kube-vip` |
|
||||
| MetalLB pool | After MetalLB controller install: `kubectl apply -k ../clusters/noble/apps/metallb` |
|
||||
| Longhorn PSA + Helm | `kubectl apply -k ../clusters/noble/apps/longhorn` then Helm from §5 below |
|
||||
|
||||
Manifests are in:
|
||||
Set `vip_interface` in `clusters/noble/apps/kube-vip/vip-daemonset.yaml` if it does not match the control-plane uplink (`talosctl -n <cp-ip> get links`).
|
||||
|
||||
- `clusters/noble/apps/kube-vip/application.yaml`
|
||||
- `clusters/noble/apps/kube-vip/vip-rbac.yaml`
|
||||
- `clusters/noble/apps/kube-vip/vip-daemonset.yaml`
|
||||
## 5. Longhorn (Talos)
|
||||
|
||||
The DaemonSet advertises `192.168.50.230` in ARP mode and fronts the Kubernetes
|
||||
API on port `6443`.
|
||||
|
||||
Apply manually (or let Argo CD sync from root app):
|
||||
1. **Machine image:** `talconfig.yaml` includes `iscsi-tools` and `util-linux-tools` extensions. After `talhelper genconfig`, **upgrade each node** so the running installer image matches (extensions are in the image, not applied live by config alone). If `longhorn-manager` logs **`iscsiadm` / `open-iscsi`**, the node image does not include the extension yet.
|
||||
2. **Pod Security + path:** Apply `kubectl apply -k ../clusters/noble/apps/longhorn` (privileged `longhorn-system`). The Helm chart host-mounts **`/var/lib/longhorn`**; `talconfig` adds a kubelet **bind** from `/var/mnt/longhorn` → `/var/lib/longhorn` so that path matches the dedicated XFS volume.
|
||||
3. **Data path:** From the **repository root** (not `talos/`), run Helm with a real release and chart name — not literal `...`:
|
||||
|
||||
```bash
|
||||
kubectl apply -k clusters/noble/apps/kube-vip
|
||||
helm repo add longhorn https://charts.longhorn.io && helm repo update
|
||||
helm upgrade --install longhorn longhorn/longhorn -n longhorn-system --create-namespace \
|
||||
-f clusters/noble/apps/longhorn/values.yaml
|
||||
```
|
||||
|
||||
Validate:
|
||||
If Longhorn falls back to its default `/var/lib/longhorn` on the Talos root filesystem, volume operations fail with **wrong format** / **no space** errors.
|
||||
4. **Disk device:** Second disk is often `/dev/vdb` under Proxmox virtio; `talconfig` selects `sdb` or `vdb`. Confirm with `talosctl get disks -n <ip>`.
|
||||
5. **`filesystem type mismatch: gpt != xfs` on `volumeType: disk`:** The data disk still has a **GPT** from an older partition attempt. Whole-disk XFS needs a **raw** disk. Talos cannot `wipe disk` while `u-longhorn` claims the device.
|
||||
|
||||
```bash
|
||||
kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip-ds -o wide
|
||||
nc -vz 192.168.50.230 6443
|
||||
```
|
||||
**Repo layout:** `talconfig.yaml` = **wipe-phase** (no Longhorn volume / no kubelet bind). `talconfig.with-longhorn.yaml` = restore after wipes.
|
||||
|
||||
If **`kube-vip-ds` pods are `CrashLoopBackOff`**, logs usually show
|
||||
`could not get link for interface '…'`. kube-vip binds the VIP to
|
||||
**`vip_interface`**; on Talos the uplink is often **`eno1`**, **`enp0s…`**, or
|
||||
**`enx…`**, not **`eth0`**. On a control-plane node IP from `talconfig.yaml`:
|
||||
**Order matters.** `blockdevice "sdb" is in use by volume "u-longhorn"` means you tried to **wipe before** the running nodes received the wipe-phase machine config. You must **`talosctl apply-config`** (wipe YAML) on **every** node first, **reboot** if `u-longhorn` still appears, **then** `talosctl wipe disk`.
|
||||
|
||||
```bash
|
||||
talosctl -n 192.168.50.20 get links
|
||||
```
|
||||
**Automated (recommended):** from `talos/` after `talhelper genconfig -o out`:
|
||||
|
||||
Do **not** paste that command’s **table output** back into the shell: zsh runs
|
||||
each line as a command (e.g. `192.168.50.20` → `command not found`), and a line
|
||||
starting with **`NODE`** can be mistaken for the **`node`** binary and try to
|
||||
load a file like **`NAMESPACE`** in the current directory. Also avoid pasting
|
||||
the **prompt** (`(base) … %`) together with the command (duplicate prompt →
|
||||
parse errors).
|
||||
```bash
|
||||
cd talos && talhelper genconfig -o out && export TALOSCONFIG="$(pwd)/out/talosconfig"
|
||||
./scripts/longhorn-gpt-recovery.sh phase1 # apply wipe config to all 4 nodes; reboot cluster if needed
|
||||
./scripts/longhorn-gpt-recovery.sh phase2 # wipe disk, restore Longhorn talconfig, genconfig, apply all nodes
|
||||
```
|
||||
|
||||
Set **`vip_interface`** in `clusters/noble/apps/kube-vip/vip-daemonset.yaml` to
|
||||
that link’s **`metadata.id`**, commit, sync (or `kubectl apply -k
|
||||
clusters/noble/apps/kube-vip`), and confirm pods go **`Running`**.
|
||||
Use `DISK=vdb ./scripts/longhorn-gpt-recovery.sh phase2` if the second disk is `vdb`.
|
||||
|
||||
## 9) Argo CD via DNS host (no port)
|
||||
**Manual:** same sequence, but do not paste comment lines into zsh as commands (`#` lines can error when copy-paste mangles them).
|
||||
|
||||
Argo CD is exposed through a kube-vip managed LoadBalancer Service:
|
||||
6. **“Error fetching pod status”** in the Longhorn UI is often API connectivity (VIP/DNS), `longhorn-manager` / CSI pods not ready, or RBAC. Check `kubectl get pods -n longhorn-system` and `kubectl logs -n longhorn-system -l app=longhorn-manager --tail=50` from a working kubeconfig.
|
||||
|
||||
- `argo.noble.lab.pcenicni.dev`
|
||||
## Troubleshooting
|
||||
|
||||
Manifests:
|
||||
### `user=apiserver-kubelet-client` / `verb=get` / `resource=nodes` (authorization error)
|
||||
|
||||
- `clusters/noble/bootstrap/argocd/argocd-server-lb.yaml`
|
||||
- `clusters/noble/apps/kube-vip/vip-daemonset.yaml` (`svc_enable: "true"`)
|
||||
That identity is the **client cert the kube-apiserver uses when talking to kubelets** (logs, exec, node metrics, etc.). Audit logs often show it when the apiserver checks **Node** access before proxying. It is **not** your human `kubectl` user.
|
||||
|
||||
After syncing manifests, create a Pi-hole DNS A record:
|
||||
- If **`kubectl get nodes`** and normal workloads work, treat log noise as **informational** unless something user-facing breaks (`kubectl logs`, `kubectl exec`, **metrics-server** node metrics, **HorizontalPodAutoscaler**).
|
||||
- If **logs/exec/metrics** fail cluster-wide, check default RBAC still exists (nothing should delete `system:*` ClusterRoles):
|
||||
|
||||
- `argo.noble.lab.pcenicni.dev` -> `192.168.50.231`
|
||||
```bash
|
||||
kubectl get clusterrole system:kubelet-api-admin system:node-proxier 2>&1
|
||||
```
|
||||
|
||||
## 10) Longhorn storage and extra disks
|
||||
- If you **customized** `authorization-config` / RBAC on the API server, revert or align with [kubelet authentication/authorization](https://kubernetes.io/docs/reference/access-authn-authz/kubelet-authn-authz/) expectations.
|
||||
|
||||
Longhorn is deployed from:
|
||||
|
||||
- `clusters/noble/apps/longhorn/application.yaml`
|
||||
|
||||
Monitoring apps are configured to use `storageClassName: longhorn`, so you can
|
||||
persist Prometheus/Alertmanager/Loki data once Longhorn is healthy.
|
||||
|
||||
### Argo CD: `longhorn` OutOfSync, Health **Missing**, no `longhorn-role`
|
||||
|
||||
**Missing** means nothing has been applied yet, or a sync never completed. The
|
||||
Helm chart creates `ClusterRole/longhorn-role` on a successful install.
|
||||
|
||||
1. See the failure reason:
|
||||
|
||||
```bash
|
||||
kubectl describe application longhorn -n argocd
|
||||
```
|
||||
|
||||
Check **Status → Conditions** and **Status → Operation State** for the error
|
||||
(for example Helm render error, CRD apply failure, or repo-server cannot reach
|
||||
`https://charts.longhorn.io`).
|
||||
|
||||
2. Trigger a sync (Argo CD UI **Sync**, or CLI):
|
||||
|
||||
```bash
|
||||
argocd app sync longhorn
|
||||
```
|
||||
|
||||
3. After a good sync, confirm:
|
||||
|
||||
```bash
|
||||
kubectl get clusterrole longhorn-role
|
||||
kubectl get pods -n longhorn-system
|
||||
```
|
||||
|
||||
### Extra drive layout (this cluster)
|
||||
|
||||
Each node uses:
|
||||
|
||||
- `/dev/sda` — Talos install disk (`installDisk` in `talconfig.yaml`)
|
||||
- `/dev/sdb` — dedicated Longhorn data disk
|
||||
|
||||
`talconfig.yaml` includes a global patch that partitions `/dev/sdb` and mounts it
|
||||
at `/var/mnt/longhorn`, which matches Longhorn `defaultDataPath` in the Argo
|
||||
Helm values.
|
||||
|
||||
After editing `talconfig.yaml`, regenerate and apply configs:
|
||||
|
||||
```bash
|
||||
cd talos
|
||||
talhelper genconfig
|
||||
# apply each node’s YAML from clusterconfig/ with talosctl apply-config
|
||||
```
|
||||
|
||||
Then reboot each node once so the new disk layout is applied.
|
||||
|
||||
### `talosctl` TLS errors (`unknown authority`, `Ed25519 verification failure`)
|
||||
|
||||
`talosctl` **does not** automatically use `talos/clusterconfig/talosconfig`. If you
|
||||
omit it, the client falls back to **`~/.talos/config`**, which is usually a
|
||||
**different** cluster CA — you then get TLS handshake failures against the noble
|
||||
nodes.
|
||||
|
||||
**Always** set this in the shell where you run `talosctl` (use an absolute path
|
||||
if you change directories):
|
||||
|
||||
```bash
|
||||
cd talos
|
||||
export TALOSCONFIG="$(pwd)/clusterconfig/talosconfig"
|
||||
export ENDPOINT=192.168.50.230
|
||||
```
|
||||
|
||||
Sanity check (should print Talos and Kubernetes versions, not TLS errors):
|
||||
|
||||
```bash
|
||||
talosctl -e "${ENDPOINT}" -n 192.168.50.20 version
|
||||
```
|
||||
|
||||
Then use the same shell for `apply-config`, `reboot`, and `health`.
|
||||
|
||||
If it **still** fails after `TALOSCONFIG` is set, the running cluster was likely
|
||||
bootstrapped with **different** secrets than the ones in your current
|
||||
`talsecret.sops.yaml` / regenerated `clusterconfig/`. In that case you need the
|
||||
**original** `talosconfig` that matched the cluster when it was created, or you
|
||||
must align secrets and cluster state (recovery / rebuild is a larger topic).
|
||||
|
||||
Keep **`talosctl`** roughly aligned with the node Talos version (for example
|
||||
`v1.12.x` clients for `v1.12.5` nodes).
|
||||
|
||||
**Paste tip:** run **one** command per line. Pasting `...cp-3.yaml` and
|
||||
`talosctl` on the same line breaks the filename and can confuse the shell.
|
||||
|
||||
### More than one extra disk per node
|
||||
|
||||
If you add a third disk later, extend `machine.disks` in `talconfig.yaml` (for
|
||||
example `/dev/sdc` → `/var/mnt/longhorn-disk2`) and register that path in
|
||||
Longhorn as an additional disk for that node.
|
||||
|
||||
Recommended:
|
||||
|
||||
- use one dedicated filesystem per Longhorn disk path
|
||||
- avoid using the Talos system disk for heavy Longhorn data
|
||||
- spread replicas across nodes for resiliency
|
||||
|
||||
## 11) Upgrade Talos to `v1.12.x`
|
||||
|
||||
This repo now pins:
|
||||
|
||||
- `talosVersion: v1.12.5` in `talconfig.yaml`
|
||||
|
||||
### Regenerate configs
|
||||
|
||||
From `talos/`:
|
||||
|
||||
```bash
|
||||
talhelper genconfig
|
||||
```
|
||||
|
||||
### Rolling upgrade order
|
||||
|
||||
Upgrade one node at a time, waiting for it to return healthy before moving on.
|
||||
|
||||
1. Control plane nodes (`noble-cp-1`, then `noble-cp-2`, then `noble-cp-3`)
|
||||
2. Worker node (`noble-worker-1`)
|
||||
|
||||
Example commands (adjust node IP per step):
|
||||
|
||||
```bash
|
||||
talosctl --talosconfig ./clusterconfig/talosconfig -n 192.168.50.20 upgrade --image ghcr.io/siderolabs/installer:v1.12.5
|
||||
talosctl --talosconfig ./clusterconfig/talosconfig -n 192.168.50.20 reboot
|
||||
talosctl --talosconfig ./clusterconfig/talosconfig -n 192.168.50.20 health
|
||||
```
|
||||
|
||||
After all nodes are upgraded, verify:
|
||||
|
||||
```bash
|
||||
talosctl --talosconfig ./clusterconfig/talosconfig version
|
||||
kubectl get nodes -o wide
|
||||
```
|
||||
|
||||
## 12) Destroy the cluster and rebuild from scratch
|
||||
|
||||
Use this when Kubernetes / etcd / Argo / Longhorn state is corrupted and you want a
|
||||
**clean** cluster. This **wipes cluster state on the nodes** (etcd, workloads,
|
||||
Longhorn data on cluster disks). Plan for **downtime** and **backup** anything
|
||||
you must keep off-cluster first.
|
||||
|
||||
### 12.1 Reset every Talos node (Kubernetes is destroyed)
|
||||
|
||||
From `talos/` with a working **`talosconfig`** that matches the machines (same
|
||||
`TALOSCONFIG` / `ENDPOINT` guidance as elsewhere in this README):
|
||||
|
||||
```bash
|
||||
cd talos
|
||||
export TALOSCONFIG="$(pwd)/clusterconfig/talosconfig"
|
||||
export ENDPOINT=192.168.50.230
|
||||
```
|
||||
|
||||
Reset **one node at a time**, waiting for each to reboot before the next. Order:
|
||||
**worker first**, then **non-bootstrap control planes**, then the **bootstrap**
|
||||
control plane **last** (`noble-cp-1` → `192.168.50.20`).
|
||||
|
||||
```bash
|
||||
talosctl -e "${ENDPOINT}" -n 192.168.50.10 reset --graceful=false
|
||||
talosctl -e "${ENDPOINT}" -n 192.168.50.30 reset --graceful=false
|
||||
talosctl -e "${ENDPOINT}" -n 192.168.50.40 reset --graceful=false
|
||||
talosctl -e "${ENDPOINT}" -n 192.168.50.20 reset --graceful=false
|
||||
```
|
||||
|
||||
If the API VIP is already unreachable, target the **node IP** as endpoint for that
|
||||
node, for example:
|
||||
`talosctl -e 192.168.50.10 -n 192.168.50.10 reset --graceful=false`.
|
||||
|
||||
Your workstation **`kubeconfig`** will not work for the old cluster after this;
|
||||
that is expected until you bootstrap again.
|
||||
|
||||
### 12.2 (Optional) New cluster secrets
|
||||
|
||||
For a fully fresh identity (new cluster CA and `talosconfig`):
|
||||
|
||||
```bash
|
||||
cd talos
|
||||
talhelper gensecret > talsecret.sops.yaml
|
||||
# encrypt / store talsecret as you usually do, then:
|
||||
talhelper genconfig
|
||||
```
|
||||
|
||||
If you **keep** the existing `talsecret.sops.yaml`, still run **`talhelper genconfig`**
|
||||
so `clusterconfig/` matches what you will apply.
|
||||
|
||||
### 12.3 Apply configs, bootstrap, kubeconfig
|
||||
|
||||
Repeat **§3 Apply Talos configs** and **§4 Bootstrap the cluster** (and **§5
|
||||
Validate**) from the top of this README: `apply-config` each node, then
|
||||
`talosctl bootstrap`, then `talosctl kubeconfig` into `talos/kubeconfig`.
|
||||
|
||||
### 12.4 Redeploy GitOps (Argo CD + apps)
|
||||
|
||||
From your workstation (repo root), with `KUBECONFIG` pointing at the new
|
||||
`talos/kubeconfig`:
|
||||
|
||||
```bash
|
||||
# Set REPO to the directory that contains both talos/ and clusters/ (not a literal "path/to")
|
||||
REPO="${HOME}/Developer/home-server"
|
||||
export KUBECONFIG="${REPO}/talos/kubeconfig"
|
||||
cd "${REPO}"
|
||||
kubectl apply -k clusters/noble/bootstrap/argocd
|
||||
kubectl apply -f clusters/noble/root-application.yaml
|
||||
```
|
||||
|
||||
Resolve **Argo CD admin** login (secret / password reset) as needed; then let
|
||||
`noble-root` sync `clusters/noble/apps/`.
|
||||
|
||||
## 13) Mid-rebuild issues: etcd, bootstrap, and `apply-config`
|
||||
|
||||
### `tls: certificate required` when using `apply-config --insecure`
|
||||
|
||||
After a node has **joined** the cluster, the Talos API expects **client
|
||||
certificates** from your `talosconfig`. `--insecure` only applies to **maintenance**
|
||||
(before join / after a reset).
|
||||
|
||||
**Do one of:**
|
||||
|
||||
- Apply config **with** `talosconfig` (no `--insecure`):
|
||||
|
||||
```bash
|
||||
cd talos
|
||||
export TALOSCONFIG="$(pwd)/clusterconfig/talosconfig"
|
||||
export ENDPOINT=192.168.50.230
|
||||
talosctl -e "${ENDPOINT}" apply-config -n 192.168.50.30 -f clusterconfig/noble-noble-cp-2.yaml
|
||||
```
|
||||
|
||||
- Or **`talosctl reset`** that node first (see §12.1), then use
|
||||
`apply-config --insecure` again while it is in maintenance.
|
||||
|
||||
### `bootstrap`: `etcd data directory is not empty`
|
||||
|
||||
The bootstrap node (`192.168.50.20`) already has a **previous etcd** on disk (failed
|
||||
or partial bootstrap). Kubernetes will not bootstrap again until that state is
|
||||
**wiped**.
|
||||
|
||||
**Fix:** run **`talosctl reset --graceful=false`** on the **control plane nodes**
|
||||
(at minimum the bootstrap node; often resetting **all four nodes** is cleaner). See §12.1.
|
||||
Then re-apply machine configs and run **`talosctl bootstrap` exactly once**.
|
||||
|
||||
### etcd unhealthy / “Preparing” on some control planes
|
||||
|
||||
Usually means **split or partial** cluster state. The reliable fix is the same
|
||||
**full reset** (§12.1), then a single ordered bring-up: apply all configs →
|
||||
bootstrap once → `talosctl health`.
|
||||
## Kubeconfig from running nodes
|
||||
|
||||
The repo root `kubeconfig` may be incomplete until you merge credentials; prefer generating `talos/kubeconfig` with the commands in §3 above.
|
||||
|
||||
5
talos/clusterconfig/.gitignore
vendored
5
talos/clusterconfig/.gitignore
vendored
@@ -1,5 +0,0 @@
|
||||
noble-noble-cp-1.yaml
|
||||
noble-noble-cp-2.yaml
|
||||
noble-noble-cp-3.yaml
|
||||
noble-noble-worker-1.yaml
|
||||
talosconfig
|
||||
@@ -1,7 +1,7 @@
|
||||
apiVersion: v1
|
||||
clusters:
|
||||
- cluster:
|
||||
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJpRENDQVMrZ0F3SUJBZ0lRQjJqRHdiclpVQXVqU0NpMjVkcEkxVEFLQmdncWhrak9QUVFEQWpBVk1STXcKRVFZRFZRUUtFd3ByZFdKbGNtNWxkR1Z6TUI0WERUSTJNRE15TnpBMk5UWTBORm9YRFRNMk1ETXlOREEyTlRZMApORm93RlRFVE1CRUdBMVVFQ2hNS2EzVmlaWEp1WlhSbGN6QlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VICkEwSUFCTytlK3dhN0V4SW8yN2w4a01yR0ROOTNMbFVtMytGT201Y3FmRkZ2RXdOYTgrT1loM1NPQzFCTWY0S1QKNnVrNTMwZlA1T0VrbFpOTTBCV3N4VkpOQzhxallUQmZNQTRHQTFVZER3RUIvd1FFQXdJQ2hEQWRCZ05WSFNVRQpGakFVQmdnckJnRUZCUWNEQVFZSUt3WUJCUVVIQXdJd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZEJnTlZIUTRFCkZnUVV2N0hpTE5PSUgwOXBTaTFpNDVUR2FvVXpSZTB3Q2dZSUtvWkl6ajBFQXdJRFJ3QXdSQUlnTmVkdUdsK3AKMzRQdmdGbUJMdmZIWlBzV1hqNmVQa2p0OE8yS0pHUUIvdDRDSUcyNTVIZnYzT09QR0tnYTNMby81L083cjh1bwpyMGhyNDNJR0ltME1FUkZECi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
|
||||
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJpakNDQVRDZ0F3SUJBZ0lSQUthRzU4bCtjeURSQlIrMFlXSWltajR3Q2dZSUtvWkl6ajBFQXdJd0ZURVQKTUJFR0ExVUVDaE1LYTNWaVpYSnVaWFJsY3pBZUZ3MHlOakF6TWpnd01UTTVNelJhRncwek5qQXpNalV3TVRNNQpNelJhTUJVeEV6QVJCZ05WQkFvVENtdDFZbVZ5Ym1WMFpYTXdXVEFUQmdjcWhrak9QUUlCQmdncWhrak9QUU1CCkJ3TkNBQVNQeUpCMExLVFV2Tm0wRzB4ZHNnQ2FoRDN6Ung2UFR0Vkdxdmd4MmphZ3pLcmU1N2NRajNBRzdsRmoKeTdkMGZNSDBiK3Fwd281aG1VbWtpWmVVcHRscm8yRXdYekFPQmdOVkhROEJBZjhFQkFNQ0FvUXdIUVlEVlIwbApCQll3RkFZSUt3WUJCUVVIQXdFR0NDc0dBUVVGQndNQ01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0hRWURWUjBPCkJCWUVGSWdITVgwTTZDN1ZzSEVUVjVndjYwdWJMQ0h0TUFvR0NDcUdTTTQ5QkFNQ0EwZ0FNRVVDSUc2ZmNUT1cKL2FkTmVoTTdISVFBZGsxcGVLTU5RMFFWRjJGMVBRUzluMGZZQWlFQTNTbWRFUWNVS0p2VGZPQUUzQkJobHBIZwpNNFFTVU1rQWFaQmt4c3BTNy9BPQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
|
||||
server: https://192.168.50.230:6443
|
||||
name: noble
|
||||
contexts:
|
||||
@@ -10,20 +10,11 @@ contexts:
|
||||
namespace: default
|
||||
user: admin@noble
|
||||
name: admin@noble
|
||||
- context:
|
||||
cluster: noble
|
||||
namespace: default
|
||||
user: admin@noble-1
|
||||
name: admin@noble-1
|
||||
current-context: admin@noble
|
||||
kind: Config
|
||||
preferences: {}
|
||||
users:
|
||||
- name: admin@noble
|
||||
user:
|
||||
client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJoVENDQVN1Z0F3SUJBZ0lSQUtvRFZDbFc5THVVU2JPWEhWNVJBclF3Q2dZSUtvWkl6ajBFQXdJd0ZURVQKTUJFR0ExVUVDaE1LYTNWaVpYSnVaWFJsY3pBZUZ3MHlOakF6TWpjeU1qSTRNalJhRncweU56QXpNamN5TWpJNApNelJhTUNreEZ6QVZCZ05WQkFvVERuTjVjM1JsYlRwdFlYTjBaWEp6TVE0d0RBWURWUVFERXdWaFpHMXBiakJaCk1CTUdCeXFHU000OUFnRUdDQ3FHU000OUF3RUhBMElBQk5CM1VNWjdBSjl6RzB5SDJ6V3A2Sk1QcW1rU3U4amIKVGZyazVvVUF0NkhuL29UbkhNM0RwM3R5M2lieFpTU3dMdkhPd3Y3azl5L3JuL2FiL0dmZ3NCZWpTREJHTUE0RwpBMVVkRHdFQi93UUVBd0lGb0RBVEJnTlZIU1VFRERBS0JnZ3JCZ0VGQlFjREFqQWZCZ05WSFNNRUdEQVdnQlMvCnNlSXMwNGdmVDJsS0xXTGpsTVpxaFRORjdUQUtCZ2dxaGtqT1BRUURBZ05JQURCRkFpQlBzQmVicjUxa3J6WHoKWjIvaWNaWnpSWXNpRVBXSzF3K0xyMm5acE9ya1Z3SWhBS3F1WmVhYW8xTWxNam5ZK291ZjUrbnl0ZFp3TFVFNwpLZEQ3ak5obmpjY1EKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo=
|
||||
client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSU9uL0tLMXlNM2RiUEhhQ2ZKVUcweWc5NktPNzRiSGRzN3VpSGIzeWFwenRvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFMEhkUXhuc0FuM01iVElmYk5hbm9rdytxYVJLN3lOdE4rdVRtaFFDM29lZitoT2NjemNPbgplM0xlSnZGbEpMQXU4YzdDL3VUM0wrdWY5cHY4WitDd0Z3PT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
|
||||
- name: admin@noble-1
|
||||
user:
|
||||
client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJoRENDQVNxZ0F3SUJBZ0lRUStjVHg3OWxZdlFkYUROTDZLZTlqVEFLQmdncWhrak9QUVFEQWpBVk1STXcKRVFZRFZRUUtFd3ByZFdKbGNtNWxkR1Z6TUI0WERUSTJNRE15TnpBM01UVXpNVm9YRFRJM01ETXlOekEzTVRVMApNVm93S1RFWE1CVUdBMVVFQ2hNT2MzbHpkR1Z0T20xaGMzUmxjbk14RGpBTUJnTlZCQU1UQldGa2JXbHVNRmt3CkV3WUhLb1pJemowQ0FRWUlLb1pJemowREFRY0RRZ0FFVzBtcGlCbHA0OEQ3SFU5eVFIS2MwblhCOTJxYzNoNFoKT2pya0xGRksxRnBsOE5xVFdEV2x3NmpsWUFlRWdzL0E1NzB3QzFrazRoZGdiZGJGZ2hZcmJxTklNRVl3RGdZRApWUjBQQVFIL0JBUURBZ1dnTUJNR0ExVWRKUVFNTUFvR0NDc0dBUVVGQndNQ01COEdBMVVkSXdRWU1CYUFGTCt4CjRpelRpQjlQYVVvdFl1T1V4bXFGTTBYdE1Bb0dDQ3FHU000OUJBTUNBMGdBTUVVQ0lRRGdQaDdJUjV0RjhmL3UKRks1N2RpZVplOHoyeEVPWUxYYnZid0pIQXZIMWp3SWdCaW5qOEJHZXVRejZ6QUFjU2Z6aWRTYWlWMlpvZElDUApWZlcrckE3ZzV6MD0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo=
|
||||
client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUhUQXZZc2htcHF6VG1XdUhEK1NLcFVTVlppdllmckF5RUY4cGVIK1JiS3FvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFVzBtcGlCbHA0OEQ3SFU5eVFIS2MwblhCOTJxYzNoNFpPanJrTEZGSzFGcGw4TnFUV0RXbAp3NmpsWUFlRWdzL0E1NzB3QzFrazRoZGdiZGJGZ2hZcmJnPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
|
||||
client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJoRENDQVN1Z0F3SUJBZ0lSQUw0alhIdStZSUxUQnFiU3ExbFpiT3N3Q2dZSUtvWkl6ajBFQXdJd0ZURVQKTUJFR0ExVUVDaE1LYTNWaVpYSnVaWFJsY3pBZUZ3MHlOakF6TWpnd01qQXpNVEZhRncweU56QXpNamd3TWpBegpNakZhTUNreEZ6QVZCZ05WQkFvVERuTjVjM1JsYlRwdFlYTjBaWEp6TVE0d0RBWURWUVFERXdWaFpHMXBiakJaCk1CTUdCeXFHU000OUFnRUdDQ3FHU000OUF3RUhBMElBQkJ0Y0dib3c4UFk4UnlFdFNUdEFVRkZPVjRXbndidnMKVGdaZFoyQ3NPVjB6dFZnWmxMZENlaHI3YTRxUHFFMTJPa09ObXYxTnI1eXVHN281cEdiZjc5T2pTREJHTUE0RwpBMVVkRHdFQi93UUVBd0lGb0RBVEJnTlZIU1VFRERBS0JnZ3JCZ0VGQlFjREFqQWZCZ05WSFNNRUdEQVdnQlNJCkJ6RjlET2d1MWJCeEUxZVlMK3RMbXl3aDdUQUtCZ2dxaGtqT1BRUURBZ05IQURCRUFpQkVTbE5aQktkc05OQ2sKYnVhejB2TFZrYmNXK1Q0UnYxNGNFS1huYWV5UXNBSWdWRk9qaXBSNjQzc3ZEN1NaSXRMU1FKcEQxcWhCdmd1MApxZXkxSUhKMTdGRT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo=
|
||||
client-key-data: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUh2b0lwTW5ubW5aalgreXRQejM3Y3RKdGFVRzNvamtlRENGamUwaWZkcW9vQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFRzF3WnVqRHc5anhISVMxSk8wQlFVVTVYaGFmQnUreE9CbDFuWUt3NVhUTzFXQm1VdDBKNgpHdnRyaW8rb1RYWTZRNDJhL1Uydm5LNGJ1am1rWnQvdjB3PT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
|
||||
|
||||
63
talos/scripts/longhorn-gpt-recovery.sh
Executable file
63
talos/scripts/longhorn-gpt-recovery.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
# Recover from GPT on Longhorn data disk: apply wipe-phase config → wipe → restore Longhorn talconfig.
# Prereq: talos/talconfig.yaml is the WIPE phase (no userVolumes longhorn); talhelper genconfig -o out already run.
set -euo pipefail

# Absolute path to the talos/ directory (this script lives in talos/scripts/).
TALOS_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Talos client config used by every talosctl call; overridable via the environment.
export TALOSCONFIG="${TALOSCONFIG:-$TALOS_ROOT/out/talosconfig}"
# Data disk to wipe in phase 2 (sdb for SCSI, often vdb for virtio); override with DISK=...
DISK="${DISK:-sdb}"

# "<node-ip>:<generated-machine-config-file>" pairs, one entry per cluster node.
# The file names must match what `talhelper genconfig -o out` writes into out/.
NODES=(
  "192.168.50.10:noble-helium.yaml"
  "192.168.50.20:noble-neon.yaml"
  "192.168.50.30:noble-argon.yaml"
  "192.168.50.40:noble-krypton.yaml"
)

# Print an error message to stderr and abort the script.
die() { echo "error: $*" >&2; exit 1; }

# Fail fast when the talhelper output is missing — every phase needs this config.
[[ -f "$TALOSCONFIG" ]] || die "missing $TALOSCONFIG — run: cd $TALOS_ROOT && talhelper genconfig -o out"
|
||||
|
||||
# Phase 1: push the WIPE-phase machine config to every node so the
# u-longhorn user volume is released and the data disk becomes wipeable.
# Reads globals: NODES, TALOS_ROOT. Side effects: talosctl apply-config per node.
phase_apply_wipe() {
  # Keep loop state function-local so it does not leak into the global scope.
  local entry ip file
  echo "=== Phase 1: apply WIPE-phase machine config to every node (releases u-longhorn) ==="
  for entry in "${NODES[@]}"; do
    ip="${entry%%:*}"    # text before the first ':' — node IP
    file="${entry##*:}"  # text after the last ':' — machine-config file name
    echo "Applying $file to $ip ..."
    talosctl apply-config -n "$ip" --file "$TALOS_ROOT/out/$file"
  done
  echo "Reboot all Talos nodes now (or wait for volume controller), then confirm u-longhorn is gone:"
  echo " talosctl get volumestatus -n 192.168.50.20"
  echo "When wipe would succeed, run: $0 phase2"
}
|
||||
|
||||
# Phase 2 + 3: wipe the Longhorn data disk ($DISK) on every node, then restore
# the full talconfig (with userVolumes), regenerate configs, and re-apply them.
# Reads globals: NODES, TALOS_ROOT, DISK. Side effects: disk wipe, file copy,
# talhelper genconfig, talosctl apply-config per node.
phase_wipe_disks() {
  # Keep loop state function-local so it does not leak into the global scope.
  local entry ip file
  echo "=== Phase 2: wipe data disk $DISK on each node (must NOT be 'in use by volume u-longhorn') ==="
  for entry in "${NODES[@]}"; do
    ip="${entry%%:*}"
    echo "Wiping $DISK on $ip ..."
    talosctl wipe disk "$DISK" -n "$ip"
  done
  echo "=== Phase 3: restore Longhorn talconfig, genconfig, apply to all nodes ==="
  # cp/genconfig failures abort the script via `set -e` before any re-apply runs.
  cp -f "$TALOS_ROOT/talconfig.with-longhorn.yaml" "$TALOS_ROOT/talconfig.yaml"
  # Subshell keeps the caller's working directory untouched.
  (cd "$TALOS_ROOT" && talhelper genconfig -o out)
  for entry in "${NODES[@]}"; do
    ip="${entry%%:*}"
    file="${entry##*:}"
    echo "Applying restored $file to $ip ..."
    talosctl apply-config -n "$ip" --file "$TALOS_ROOT/out/$file"
  done
  echo "Done. Reboot nodes if Longhorn volume does not come up clean."
}
|
||||
|
||||
# Argument dispatch: phase1/apply → apply wipe-phase config; phase2/wipe →
# wipe disks and restore the Longhorn talconfig. No argument prints usage;
# anything else aborts via die().
case "${1:-}" in
  phase1|apply) phase_apply_wipe ;;
  phase2|wipe) phase_wipe_disks ;;
  "")
    echo "Usage: cd talos && talhelper genconfig -o out && export TALOSCONFIG=\"\$(pwd)/out/talosconfig\""
    echo " $0 phase1 # apply WIPE machine config to all nodes — reboot if u-longhorn lingers"
    echo " DISK=vdb $0 phase2 # wipe disk, restore Longhorn talconfig, genconfig, apply all"
    echo "Env DISK defaults to sdb."
    ;;
  *) die "unknown arg: $1" ;;
esac
|
||||
92
talos/talconfig.with-longhorn.yaml
Normal file
92
talos/talconfig.with-longhorn.yaml
Normal file
@@ -0,0 +1,92 @@
|
||||
# yaml-language-server: $schema=../talconfig.json
|
||||
# Restore target after GPT wipe: `cp talconfig.with-longhorn.yaml talconfig.yaml` then `talhelper genconfig -o out` and apply all nodes.
|
||||
# Noble lab — Talos machine configs via talhelper.
|
||||
# 1) talhelper gensecret > talsecret.yaml # or SOPS-encrypt to talsecret.sops.yaml (do not commit)
|
||||
# 2) talhelper genconfig -o out # writes to talos/out/ (gitignored from repo root)
|
||||
# 3) talosctl apply-config --insecure -n <ip> --file out/noble-<host>.yaml
|
||||
#
|
||||
# installDisk: confirm with `talosctl disks -n <ip> --insecure` (Proxmox virtio is often /dev/sda).
|
||||
# Longhorn data disk: second disk (often /dev/sdb SCSI or /dev/vdb virtio) → XFS at /var/mnt/longhorn.
|
||||
# After changing schematic/extensions: regenerate configs, upgrade nodes with new installer image, then reboot if needed.
|
||||
# Helm must set defaultDataPath to /var/mnt/longhorn (see clusters/noble/apps/longhorn/values.yaml).
|
||||
#
|
||||
# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer:
|
||||
# factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6
|
||||
# After edits, run talhelper genconfig — `machine.install.image` in out/*.yaml should match this schematic (path may be metal-installer/ etc. on bare metal).
|
||||
# Upgrade: talosctl upgrade --image <same-as-machine.install.image-in-out> -n <node-ip>
|
||||
clusterName: noble
|
||||
talosVersion: v1.12.6
|
||||
endpoint: https://192.168.50.230:6443
|
||||
allowSchedulingOnControlPlanes: true
|
||||
additionalApiServerCertSans:
|
||||
- 192.168.50.230
|
||||
- noble.lab
|
||||
- kube.noble.lab
|
||||
nodes:
|
||||
- hostname: helium
|
||||
ipAddress: 192.168.50.10
|
||||
controlPlane: false
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: &noble-installer factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f
|
||||
- hostname: neon
|
||||
ipAddress: 192.168.50.20
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: *noble-installer
|
||||
- hostname: argon
|
||||
ipAddress: 192.168.50.30
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: *noble-installer
|
||||
- hostname: krypton
|
||||
ipAddress: 192.168.50.40
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: *noble-installer
|
||||
controlPlane:
|
||||
schematic: &noble-schematic
|
||||
customization:
|
||||
systemExtensions:
|
||||
officialExtensions:
|
||||
- siderolabs/iscsi-tools
|
||||
- siderolabs/util-linux-tools
|
||||
userVolumes:
|
||||
- &longhorn-data
|
||||
name: longhorn
|
||||
# Whole dedicated disk (no partition min/max math). Avoids "not enough space" when
|
||||
# grow+maxSize:100% on a separate data disk incorrectly fails provisioning.
|
||||
volumeType: disk
|
||||
provisioning:
|
||||
diskSelector:
|
||||
# Proxmox virtio SCSI: second disk is often vdb, not sdb. Prefer WWN/serial in prod.
|
||||
match: disk.dev_path == '/dev/sdb' || disk.dev_path == '/dev/vdb'
|
||||
filesystem:
|
||||
type: xfs
|
||||
worker:
|
||||
schematic: *noble-schematic
|
||||
userVolumes:
|
||||
- *longhorn-data
|
||||
patches:
|
||||
- |-
|
||||
cluster:
|
||||
network:
|
||||
cni:
|
||||
name: none
|
||||
machine:
|
||||
kubelet:
|
||||
extraMounts:
|
||||
- destination: /var/mnt/longhorn
|
||||
type: bind
|
||||
source: /var/mnt/longhorn
|
||||
options:
|
||||
- bind
|
||||
- rshared
|
||||
- rw
|
||||
# Chart DaemonSet hostPath is /var/lib/longhorn (not configurable in Helm 1.11.x).
|
||||
- destination: /var/lib/longhorn
|
||||
type: bind
|
||||
source: /var/mnt/longhorn
|
||||
options:
|
||||
- bind
|
||||
- rshared
|
||||
- rw
|
||||
@@ -1,56 +1,92 @@
|
||||
# yaml-language-server: $schema=../talconfig.json
|
||||
# Restore target after GPT wipe: `cp talconfig.with-longhorn.yaml talconfig.yaml` then `talhelper genconfig -o out` and apply all nodes.
|
||||
# Noble lab — Talos machine configs via talhelper.
|
||||
# 1) talhelper gensecret > talsecret.yaml # or SOPS-encrypt to talsecret.sops.yaml (do not commit)
|
||||
# 2) talhelper genconfig -o out # writes to talos/out/ (gitignored from repo root)
|
||||
# 3) talosctl apply-config --insecure -n <ip> --file out/noble-<host>.yaml
|
||||
#
|
||||
# installDisk: confirm with `talosctl disks -n <ip> --insecure` (Proxmox virtio is often /dev/sda).
|
||||
# Longhorn data disk: second disk (often /dev/sdb SCSI or /dev/vdb virtio) → XFS at /var/mnt/longhorn.
|
||||
# After changing schematic/extensions: regenerate configs, upgrade nodes with new installer image, then reboot if needed.
|
||||
# Helm must set defaultDataPath to /var/mnt/longhorn (see clusters/noble/apps/longhorn/values.yaml).
|
||||
#
|
||||
# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer:
|
||||
# factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6
|
||||
# After edits, run talhelper genconfig — `machine.install.image` in out/*.yaml should match this schematic (path may be metal-installer/ etc. on bare metal).
|
||||
# Upgrade: talosctl upgrade --image <same-as-machine.install.image-in-out> -n <node-ip>
|
||||
clusterName: noble
|
||||
talosVersion: v1.12.6
|
||||
endpoint: https://192.168.50.230:6443
|
||||
talosVersion: v1.12.5
|
||||
kubernetesVersion: v1.31.1
|
||||
allowSchedulingOnControlPlanes: true
|
||||
|
||||
# kube-vip fronts the Kubernetes API at this IP (see clusters/noble/apps/kube-vip).
|
||||
# Without these SANs, TLS to https://192.168.50.230:6443 fails (cert does not match).
|
||||
# Talos API (talosctl -e) also uses endpoint; include VIP in machine cert SANs.
|
||||
additionalApiServerCertSans:
|
||||
- 192.168.50.230
|
||||
- kube.noble.lab.pcenicni.dev
|
||||
|
||||
additionalMachineCertSans:
|
||||
- 192.168.50.230
|
||||
|
||||
# Use Cilium installed via GitOps (no bundled Talos CNI).
|
||||
cniConfig:
|
||||
name: none
|
||||
|
||||
clusterPodNets:
|
||||
- 10.244.0.0/16
|
||||
clusterSvcNets:
|
||||
- 10.96.0.0/12
|
||||
|
||||
# Secondary disk on every node (OS stays on installDisk: /dev/sda).
|
||||
# Mount matches Longhorn defaultDataPath in clusters/noble/apps/longhorn/application.yaml.
|
||||
patches:
|
||||
- |-
|
||||
machine:
|
||||
disks:
|
||||
- device: /dev/sdb
|
||||
partitions:
|
||||
- mountpoint: /var/mnt/longhorn
|
||||
|
||||
- noble.lab
|
||||
- kube.noble.lab
|
||||
nodes:
|
||||
- hostname: noble-cp-1
|
||||
ipAddress: 192.168.50.20
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
|
||||
- hostname: noble-cp-2
|
||||
ipAddress: 192.168.50.30
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
|
||||
- hostname: noble-cp-3
|
||||
ipAddress: 192.168.50.40
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
|
||||
- hostname: noble-worker-1
|
||||
- hostname: helium
|
||||
ipAddress: 192.168.50.10
|
||||
controlPlane: false
|
||||
installDisk: /dev/sda
|
||||
|
||||
talosImageURL: &noble-installer factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f
|
||||
- hostname: neon
|
||||
ipAddress: 192.168.50.20
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: *noble-installer
|
||||
- hostname: argon
|
||||
ipAddress: 192.168.50.30
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: *noble-installer
|
||||
- hostname: krypton
|
||||
ipAddress: 192.168.50.40
|
||||
controlPlane: true
|
||||
installDisk: /dev/sda
|
||||
talosImageURL: *noble-installer
|
||||
controlPlane:
|
||||
schematic: &noble-schematic
|
||||
customization:
|
||||
systemExtensions:
|
||||
officialExtensions:
|
||||
- siderolabs/iscsi-tools
|
||||
- siderolabs/util-linux-tools
|
||||
userVolumes:
|
||||
- &longhorn-data
|
||||
name: longhorn
|
||||
# Whole dedicated disk (no partition min/max math). Avoids "not enough space" when
|
||||
# grow+maxSize:100% on a separate data disk incorrectly fails provisioning.
|
||||
volumeType: disk
|
||||
provisioning:
|
||||
diskSelector:
|
||||
# Proxmox virtio SCSI: second disk is often vdb, not sdb. Prefer WWN/serial in prod.
|
||||
match: disk.dev_path == '/dev/sdb' || disk.dev_path == '/dev/vdb'
|
||||
filesystem:
|
||||
type: xfs
|
||||
worker:
|
||||
schematic: *noble-schematic
|
||||
userVolumes:
|
||||
- *longhorn-data
|
||||
patches:
|
||||
- |-
|
||||
cluster:
|
||||
network:
|
||||
cni:
|
||||
name: none
|
||||
machine:
|
||||
kubelet:
|
||||
extraMounts:
|
||||
- destination: /var/mnt/longhorn
|
||||
type: bind
|
||||
source: /var/mnt/longhorn
|
||||
options:
|
||||
- bind
|
||||
- rshared
|
||||
- rw
|
||||
# Chart DaemonSet hostPath is /var/lib/longhorn (not configurable in Helm 1.11.x).
|
||||
- destination: /var/lib/longhorn
|
||||
type: bind
|
||||
source: /var/mnt/longhorn
|
||||
options:
|
||||
- bind
|
||||
- rshared
|
||||
- rw
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
cluster:
|
||||
id: kT-NVPu4QlAStlRSvgxXul7uf9FBBJ825WWQ4ybQP24=
|
||||
secret: b2jPTHcPR1GlOBwwdFBu2plsPczRXs17KcuH9RPtNa4=
|
||||
secrets:
|
||||
bootstraptoken: j2n63x.34f5io55z56drw06
|
||||
secretboxencryptionsecret: zP+KwKUwfXAQoetluPzCLhjbBqHhiUgsM/bKmPcUPP0=
|
||||
trustdinfo:
|
||||
token: wlan0h.3aon1n2fndwbp3z7
|
||||
certs:
|
||||
etcd:
|
||||
crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJmekNDQVNTZ0F3SUJBZ0lSQUpubnFkSVBhVkNKVGZCcGlMdkpMYUF3Q2dZSUtvWkl6ajBFQXdJd0R6RU4KTUFzR0ExVUVDaE1FWlhSalpEQWVGdzB5TmpBek1qY3lNak0xTkRCYUZ3MHpOakF6TWpReU1qTTFOREJhTUE4eApEVEFMQmdOVkJBb1RCR1YwWTJRd1dUQVRCZ2NxaGtqT1BRSUJCZ2dxaGtqT1BRTUJCd05DQUFRbGNRMThnZEhaCndlUTg1cDFXcHBmb1ZMS1BYRXNQRWVlcWczc29IMkFMWWtCSHFGN0I5UlczK3Q3UitlMjFIMGhKWlI2U1Y2WUQKTGlzRUV1d1hNN0tvbzJFd1h6QU9CZ05WSFE4QkFmOEVCQU1DQW9Rd0hRWURWUjBsQkJZd0ZBWUlLd1lCQlFVSApBd0VHQ0NzR0FRVUZCd01DTUE4R0ExVWRFd0VCL3dRRk1BTUJBZjh3SFFZRFZSME9CQllFRklPSGk2Q3pqUWVOCk9lUGxSeXNOTjZ5VFQyVlNNQW9HQ0NxR1NNNDlCQU1DQTBrQU1FWUNJUUNzZ3ZkRUV1SlExSmkxdi94UUQzRXoKS2todTJpQjBoTExMNktPcXpXYUhpZ0loQUtzSGY1YWlhb0FtR2dSM1NoNW5xMVUrN1FlbkRUWFZPcFVFcjRtagpCRFBVCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
|
||||
key: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUpQYlJTTzA5RnhNVFZXeVBhb2xSTXVNT21rQ0duTm1JdDhLRGgvcjV1djNvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFSlhFTmZJSFIyY0hrUE9hZFZxYVg2RlN5ajF4TER4SG5xb043S0I5Z0MySkFSNmhld2ZVVgp0L3JlMGZudHRSOUlTV1Vla2xlbUF5NHJCQkxzRnpPeXFBPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
|
||||
k8s:
|
||||
crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJpVENDQVRDZ0F3SUJBZ0lSQUtRUFR2ditFRXZZa1NaWDJpYWszZWt3Q2dZSUtvWkl6ajBFQXdJd0ZURVQKTUJFR0ExVUVDaE1LYTNWaVpYSnVaWFJsY3pBZUZ3MHlOakF6TWpjeU1qTTFOREJhRncwek5qQXpNalF5TWpNMQpOREJhTUJVeEV6QVJCZ05WQkFvVENtdDFZbVZ5Ym1WMFpYTXdXVEFUQmdjcWhrak9QUUlCQmdncWhrak9QUU1CCkJ3TkNBQVQzL0lkYXdOeUpBekcyYlRmWXFSdW5mNktPTlVPU3FheVF1czhhQnZwbE9BTWxCV1RiNXp0RzVWYm0KTEhheUhWTjZ2OFZ0U0svRnZzUlphVjh3MERXWG8yRXdYekFPQmdOVkhROEJBZjhFQkFNQ0FvUXdIUVlEVlIwbApCQll3RkFZSUt3WUJCUVVIQXdFR0NDc0dBUVVGQndNQ01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0hRWURWUjBPCkJCWUVGSnYrUkY1WnQ3b3A0VmRsUWZvaWp1UmZpVldJTUFvR0NDcUdTTTQ5QkFNQ0EwY0FNRVFDSUY4MlYzamkKelFPd3hic1JITjh3dXloak9XZVJET1FCa3Z6STgxcEhJY2lJQWlCdGN3VXpIYXZCS2pZRzloSDUwL3E2Y1Vjagpmc0w4T3A1bnhiYkZ0bTlXZFE9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
|
||||
key: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUdNTSt6S2VuVFdTeHRhTStySnBxUXk5MEVsWkU4aXU5WFcwUVJ1RFI0RjBvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFOS95SFdzRGNpUU14dG0wMzJLa2JwMytpampWRGtxbXNrTHJQR2diNlpUZ0RKUVZrMitjNwpSdVZXNWl4MnNoMVRlci9GYlVpdnhiN0VXV2xmTU5BMWx3PT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
|
||||
k8saggregator:
|
||||
crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJYekNDQVFXZ0F3SUJBZ0lRTW1jcXNFSXo1TlY4TGlaVGFGZzVhakFLQmdncWhrak9QUVFEQWpBQU1CNFgKRFRJMk1ETXlOekl5TXpVME1Gb1hEVE0yTURNeU5ESXlNelUwTUZvd0FEQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxRwpTTTQ5QXdFSEEwSUFCSjA3MjdwNklmSlVua1VHNVk4dUlDUTVMeWNzZGl0YmU1WWprTFRleXhOTU5uZXVrTUZaCjRwQTQ0azd0WWVBejJGbTQra0p5Nzk0SkM5Vy9YMjZZR0lDallUQmZNQTRHQTFVZER3RUIvd1FFQXdJQ2hEQWQKQmdOVkhTVUVGakFVQmdnckJnRUZCUWNEQVFZSUt3WUJCUVVIQXdJd0R3WURWUjBUQVFIL0JBVXdBd0VCL3pBZApCZ05WSFE0RUZnUVVuYit0Q2h4eENEWFk1VkZqb3NHTVc0dTRqWk13Q2dZSUtvWkl6ajBFQXdJRFNBQXdSUUlnCkJLL0t6WlNyWERHZVM5bFZ1UllBMzJHbW1DZmxvbzk1Tkw3Z0pUVTFlTlVDSVFENlhHODl5WEM1RWcyVFlYM1EKdW0vWUdlUzlOUkV1TnRjNGhSaWx5a1RQMXc9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
|
||||
key: LS0tLS1CRUdJTiBFQyBQUklWQVRFIEtFWS0tLS0tCk1IY0NBUUVFSUhTNHhwbnNhQ3g5REtETUtEV2hxQktEaWpKMUEwbkJ5akVJazRibEhtV1JvQW9HQ0NxR1NNNDkKQXdFSG9VUURRZ0FFblR2YnVub2g4bFNlUlFibGp5NGdKRGt2Snl4MksxdDdsaU9RdE43TEUwdzJkNjZRd1ZuaQprRGppVHUxaDREUFlXYmo2UW5MdjNna0wxYjlmYnBnWWdBPT0KLS0tLS1FTkQgRUMgUFJJVkFURSBLRVktLS0tLQo=
|
||||
k8sserviceaccount:
|
||||
key: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlKS0FJQkFBS0NBZ0VBc2plaHZiZXo2eEgvbGpDemhLM2dKS1FzeFJhZTNSWkZGMU1vdjA3ZTFyWGt5WFVkCkdWWVpxU0Z2Tm9IQ3VjdmRJd3dVOFhucXVmSGozZXlKS1h2eWoySXR6R1N2OVdHQUdpMktiTmNMWVFESFZGeEEKcEc0ZC80b1pKK1NtNkJoYWlJMVRxWEZCR3RHRDF4N2J5UFYrZkdYNnlZOFdQMGdCSERoSncvWEpmUHBEQU93QgphVjJnVzlvbFNVRTVFQmdvNnlHQlpwYnhQSUFtZEVJemxEQ3FPWW5kb0JDZFRjaTFyMFpHWTZ4Z3FheThxK25sCnZuL3pxNVZrcHdPSitpS1Jac2xlRVo4cFR6eFlhNmNmWDZvMjRpTFVrOTFPelE3VEtFUVhlUlprZkQrUHVlR1MKa1AybDJJaCtUZ21JWGJVV09WdUthRmxqREQ1K3I0cFhFcHNCMzBuUUN2QUhob0tjSHFSeUc4WjZxS1RpYkMvawpybTVJUmwxS1V2QU00OFRZQzk2SjIwTzh3a1JYdkMyOEhUWGdENm15K1pXQk9VTmM3QmJuV3p6aTBKU2RXUDBoCnNrcDNzZ0psQzhvQmttZE9EQnRoUzNKckQ1WFUxeVVoTi84bDI4NHBJclhubUlrdUlkK2laajVtcjFsUmdTS2oKUlkybnduc2hCQUxnU1l3cDFOWk9kZlhucnF2QW05cEJINERPNzNtVit5MzlRWEorTkJHYmJvNGp1VXBQRFNjWApWaURjZUxnZFRIbEVjQ0M4UmJwcGRCQy94cVpySVJRU2EyZzcyOEZRTWQ4dXE3S1B0QXYyS1M0VzBlSEcxcnUxCjc4K2I3TEdEc2hTUEZubXNSU1NOUElXdjZzV1l5bllmSElkcUZ2MCtRZjU2dUdXcGJUYlo2UHp3WHVjQ0F3RUEKQVFLQ0FnQXlTNmYrVWp0WkFvWFduWnowTzF2d0MxTkZOZnFVbTRYWkxOTnBsamttY0VRR3BPSVc5ZWtkQmI0TQpySGRIbHlTc0VPdFNNTjJSSjVadTJhUG1ETUJxUGNOK0ZRWmhvbWdVT3pqL09YdFJIM2FodEwxYmltWTE2WVBxCjhjazI1RFNjcUFIdDVuUUF2Uk5Qb1RwV3p3Mm96dUVGaERlN21UY1MvMEcySjRYN0d1ZlErVW4yc2dFaEd3SDkKMkFYaUtHZFg0R2RVREJJOXlFN1I3YUwvMWZJY2RlK1JqazdPbG0vTDdQSE5qR2JsUzhZZFlFL0J3UHVFTjQrbAo0TVpPVFBZckEzWmtVNzVGU2RzTVdxaHNoNjJnaVVMa0RnZUFzSWZnSzhOU0hZTXpXMzdVN3plOWFwQWsrWFJuCjBxSGxERWVlM1ZwRTA4RXp3ZWxmNGhOcndVbzRBS1Rzb0dtOE1leXZRZWQrQUtDVFo5Yi9WZDJLcEdTSHVCRkEKbzVVTUJaK0RiUFEvdzcvMWJWQVNQRkhkMW9BS1pYaDhST05iNWZtSWxjeml3Mno5NHk5WWF0SE9saTFZbDBIMQoxcTZqQUprZGVLNnF1Y0s5dXJEbWQ4eEV5dFFQRy8vR05IYkJRdjZhaXlTZUVLZ2Z2bnBhMFdDdytDZWZRRlhZClRpUXhNRmVwaXNySTJtQUNrL2ZrTXdGeUJTbmlTcktpWUJ3d3I5NDZZclNUN2VXV0NqZkwzTnUvUDRIUnZ4VVYKSjVQZ1ZXQmhqc1locGhVMG9kYXljcFFJMDdFV2w5Rk9mVzhGYmtDSGNhQUMxSm85U1ZsY0pwUHI2SXo1dFFvawp2MnFnWHMvd0I4Zy80WUNnOXBkN3praytJRUIvbXRsTnNRVTl3c1lTZWR5TXFXR0VoUUtDQVFFQXlROWVSZEZ3Cmp6Wk9CWnpRTUptVzkxM2lPUVkvL3o0WWEzUlk2cmFKbFc
0VEdCa0tuVGdmajNKL1g3aVQ0Qy9JOFg0WXhPL0QKaEZJLy9LeFZPdFFueWRqZ3o2SVd0TmxZRW1wb2tKcnB6SXJsK2VvcW0wVld1aFVoWVlIUVc2SE1NcXRWVHNWYwpvZHpKV3grL1VKS0doMHJHWG1RQjJEUkpaVy9OYWdYMDJqZi9MeE1aS3E4eXd6QkdiWk14dFVYN3RxZHAxbGtRCjl3MWNVYTdFRVROVnk1bU5kN1FnTUxoN3c1RFBMNkc3Wk1yci9vUURLOHVaTWFHdG0xMmRIK3EyQnljdmdxYmsKSEFqQ3NBQWM0VW9qMHg1alBKUUU1QnRuVUdwcVVBRGdMcDd6MVFacEpiNmw2VzlwRkFvbTBreVErOWZHbkU4RQppZ2hGTUp0ekZIL3Ywd0tDQVFFQTR1cGNSVCtlNzFuUDVINFZSeURwUVhlN3Z3SHB4OUpnaWpOQnI1djNGalZvCnZMTnpVMzdYTUUrQ2RrejV5RkszMHFOU0hSZ3Q2YWNjbGRiL3huSlRWWUlyTFVSWkVoMDhvbGZRdEZiRUw5TFMKdWZkQUs4WGZOTllEVG9lY1Raekx5cVpuRjZGaUV2MFdQOXNTMzI1b1B3OGJOejhrWEUvOFJiTDU0RHRqUVd5MgpJekJ1emxLdDNKV3NrREdsK254VVlzSGZuR2tWVlBUZEZGbHk5UjEwM0lpcWZtcDBnUEpDay9DRi81ci9aMmxjCmtidTdTYkV1NDEzZHlEQ01KeXNnNldpVnBITDZhb0YzUlUrRXFWck5qVHZhWXRWSk9HN2M3TGxXQjhOaTBqaVIKRDJrTFNHZkZXOU9rZVlIbnRzOGtoK2tROWYwaFEvbEhvM3lVWDVoOEhRS0NBUUJ4MzdCbkhxMy9qcVExN1pERQpWZGo1RlVWUlFzYndTejBOYndJRlBZbERCdXJ0bFJFNzVsT0pyVEdUQnpsSm1nYlhMN0hicUdnMkExZVdSZ3luCm13MUY5djJzMjRLOHZ2Und5YStiWndIUUJVTW5mb2JQRmtCK2VBVkY4bjROeDkrZE93aS82bXdDaU1mS1FucmEKcVlKa0VlZTBBalJCUGF2c05aeEQxa2ZOYURXeGRjR2xPVUVvNTZpYjJ1Z21ZUktsYXNBNDFJMFZQNDN2L1dteQp6RDVsWi95RnRaRWR4djdoenB4cHY5SWd6Z1ZIUzRGNFJvSG5hRWlwWENYbnM4bVExNUxERHI3WFdlYmFROVlYCml5UXJLR1RRSkkxNG5FU3hlUFBwaC9Wd3Nqb3Joc3Y2d3JXNU5vNXUrU2p2cHNuZXVXRVZtbk5ac2tGdHZEMDcKZVJKZEFvSUJBUURLMHZhRXd6ZzU4d282ejJRUGZ1QmZyemsyb3V3bXV1bWx2ZWtCb2FQNnl1U0NmdGdma3FtZgp1Z0gvNGhBR09jR3JXbVprTVIrZzBNbGhPWnJIODVwL1BPbUEvYTJyM2t3N1E4ajkyT3hsWHNrU2htbHFkdVJyCklyd3o3azBNcHBFVjR5VVUzeUI5bnBETHBQSzZtY0krVXk5ZGMyZjV4MWpUcUFWbm8wMjF4Z2tMYlJndC9ZTUEKUHh6T2lrSTBvZnIvaHhGcmloWVNLUWlQVHVETkxYWXVSVTQzenNteUZGamtTVUpNMVd0ak1LOFlhRGdneDJvRQp1dnNwSEJPNlV2ZUpDZjF2ejRIN3Z4c3Y3Y0xEYWJGL2d6ZFJ6aGt6Z3d5ZjM0MkJST2pJeE4wTWJEVTBrK1M1CmpuUmVVM29kVWd2eUc2WVlhaGpZM0RGbmRVeGVJanNWQW9JQkFBZHlwajdUc25mRGZoVHZ2VGdOVTdzdTRkdkQKWVRGdE5zSDliTXMxMTJ0Z3RoQ2Zvb2Nqalk2RXE3OHVjTnZYU0d6MU5BMUJ6UGx2Y3Y3OTdrODE5ZFF4dzhNZwpKUUxuOUtHMHA2aVRCL0lDcHYvanZrMGp4MGQ4eHg2aW9PYS8vZFl
sN1RtbEVNOTNsUm5jNDBjeGZFSTNiTXZCCkdCbGxyTlRqNURsUTBxUUtsY04vVzgrcGt6Qm5uQ05TVUMzYUo2c1pydklEczlaZndHMFNLK0FSVlYrV1ZHZm4KR2Y3czZrNkZuVHI3SEtrNjkvSkZjMmdrSmZIbkYxQVk1SHZ4Q1NzT0hwZHZlYmd5ZTZ1cmZBTDdrVHZQVm8vMApyWC9FeTh2WjIzM1A3VUI3c3RGVE9oY3c2RnhQeUUyQlpFV3lWQ0YwK01Hd1hYejFiOE56ekNZalltVT0KLS0tLS1FTkQgUlNBIFBSSVZBVEUgS0VZLS0tLS0K
|
||||
os:
|
||||
crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJQakNCOGFBREFnRUNBaEF1cG9xdWZ0SnI4MktwMTRQS3Rkc1dNQVVHQXl0bGNEQVFNUTR3REFZRFZRUUsKRXdWMFlXeHZjekFlRncweU5qQXpNamN5TWpNMU5ERmFGdzB6TmpBek1qUXlNak0xTkRGYU1CQXhEakFNQmdOVgpCQW9UQlhSaGJHOXpNQ293QlFZREsyVndBeUVBcnV0QktaRkswamw5V0QwRUQ0OXZWS3pLc3g5OWg3eThhWkIvCmsyamRFbWlqWVRCZk1BNEdBMVVkRHdFQi93UUVBd0lDaERBZEJnTlZIU1VFRmpBVUJnZ3JCZ0VGQlFjREFRWUkKS3dZQkJRVUhBd0l3RHdZRFZSMFRBUUgvQkFVd0F3RUIvekFkQmdOVkhRNEVGZ1FVWE1RVDR4WkE1WHduU3kveApCeUhmbDBQbXZrY3dCUVlESzJWd0EwRUFybVUrMzJpa2QyUkxlNElWMnhScm8zckNiUFkrYVBKSU9ickhTTnBICjJvcVVuOTdXRG14akhGM2o2ekxVSVpyR2wyMVd1Y2pUOXBxUFZYUkJOWHdkRHc9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
|
||||
key: LS0tLS1CRUdJTiBFRDI1NTE5IFBSSVZBVEUgS0VZLS0tLS0KTUM0Q0FRQXdCUVlESzJWd0JDSUVJSFpTWnlGbjdXd093N0l0UmNIT1JnYUphMTBicTJ0TllRdnY3Y2VCb2ZqTwotLS0tLUVORCBFRDI1NTE5IFBSSVZBVEUgS0VZLS0tLS0K
|
||||
@@ -1,125 +0,0 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Safe Talos rolling upgrade script.
# Upgrade hops are defined in UPGRADE_VERSIONS below (currently a single hop
# to v1.12.5; earlier runs stepped 1.8.4 -> 1.9.5 -> 1.10.7 -> 1.11.6).
# Node order: control planes first (cp-1, cp-2, cp-3), then worker-1.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
readonly SCRIPT_DIR REPO_ROOT

# Overridable via environment: talosconfig path and cluster API endpoint (VIP).
TALOSCONFIG_PATH="${TALOSCONFIG_PATH:-${REPO_ROOT}/talos/clusterconfig/talosconfig}"
ENDPOINT="${ENDPOINT:-192.168.50.230}"

# Node inventory; control planes are always upgraded before workers.
CONTROL_PLANES=("192.168.50.20" "192.168.50.30" "192.168.50.40")
WORKERS=("192.168.50.10")

# Sequential upgrade hops applied to every node, in order.
UPGRADE_VERSIONS=("v1.12.5")

if [[ ! -f "${TALOSCONFIG_PATH}" ]]; then
  # Diagnostics belong on stderr so they are not swallowed by output capture.
  echo "Talos config not found: ${TALOSCONFIG_PATH}" >&2
  echo "Set TALOSCONFIG_PATH=/absolute/path/to/talosconfig and retry." >&2
  exit 1
fi
|
||||
|
||||
# Invoke talosctl pinned to this repository's talosconfig, forwarding all args.
run_talosctl() {
  local -a base=(talosctl --talosconfig "${TALOSCONFIG_PATH}")
  "${base[@]}" "$@"
}
|
||||
|
||||
# Print a version string with any single leading "v" stripped (v1.2.3 -> 1.2.3).
normalize_version() {
  printf '%s\n' "${1#v}"
}
|
||||
|
||||
# True (exit 0) when version $1 >= version $2.
# Leading "v" prefixes are stripped, then the pair is compared with
# sort -V: if $1 sorts last (or the two are equal), it is >= $2.
version_ge() {
  local lhs="${1#v}"
  local rhs="${2#v}"
  [[ "$(printf "%s\n%s\n" "${lhs}" "${rhs}" | sort -V | tail -n1)" == "${lhs}" ]]
}
|
||||
|
||||
# Print the Talos server version tag reported by a single node.
# Arguments: $1 - node IP.
# Returns:   0 with the tag on stdout, 1 when no tag could be parsed
#            (e.g. node unreachable).
get_node_talos_version() {
  local node="$1"
  local raw tag

  # Best effort: an unreachable node yields empty output instead of aborting.
  raw="$(run_talosctl -n "${node}" version 2>/dev/null || true)"

  # Extract the Tag that follows this node's NODE: header in the output.
  tag="$(
    printf "%s\n" "${raw}" | awk -v node="${node}" '
      $1=="NODE:" && $2==node { seen=1; next }
      seen && $1=="Tag:" { print $2; exit }
    '
  )"

  if [[ -z "${tag}" ]]; then
    return 1
  fi

  echo "${tag}"
  return 0
}
|
||||
|
||||
# Probe Talos health through the configured endpoint, then show k8s node state.
check_cluster_ready() {
  local probe_node="${CONTROL_PLANES[0]}"
  echo "Checking cluster health via endpoint ${ENDPOINT}..."
  run_talosctl -e "${ENDPOINT}" -n "${probe_node}" health
  kubectl get nodes -o wide
}
|
||||
|
||||
# Upgrade one node to a target Talos version, then wait for cluster health.
# Nodes already at or above the target are skipped (no upgrade, no reboot).
# Arguments: $1 - node IP, $2 - target version tag (e.g. v1.12.5).
upgrade_node_to_version() {
  local node="$1"
  local target="$2"
  local installer="ghcr.io/siderolabs/installer:${target}"
  local current=""

  echo
  echo "=== Upgrading node ${node} to ${target} ==="
  if ! current="$(get_node_talos_version "${node}")"; then
    # Version probe failed; proceed anyway rather than block the rollout.
    echo "Could not determine current server version for ${node}; continuing with upgrade."
  else
    echo "Current Talos version on ${node}: ${current}"
    if version_ge "${current}" "${target}"; then
      echo "Node ${node} already at or above ${target}; skipping upgrade/reboot."
      return 0
    fi
  fi

  run_talosctl -n "${node}" upgrade --image "${installer}"
  run_talosctl -n "${node}" reboot

  echo "Waiting for cluster and node health after ${node} reboot..."
  run_talosctl -e "${ENDPOINT}" -n "${CONTROL_PLANES[0]}" health
  run_talosctl -n "${node}" version
  kubectl get nodes -o wide
}
|
||||
|
||||
# --- main driver: report plan, then roll each hop across all nodes ---
echo "Using TALOSCONFIG: ${TALOSCONFIG_PATH}"
echo "Control planes: ${CONTROL_PLANES[*]}"
echo "Workers: ${WORKERS[*]}"
echo "Upgrade hops: ${UPGRADE_VERSIONS[*]}"
echo

check_cluster_ready

for hop in "${UPGRADE_VERSIONS[@]}"; do
  echo
  echo "##### Starting upgrade hop ${hop} #####"

  # Control planes first, then workers, one node at a time.
  for member in "${CONTROL_PLANES[@]}" "${WORKERS[@]}"; do
    upgrade_node_to_version "${member}" "${hop}"
  done

  echo "Completed hop ${hop}. Verifying cluster state..."
  check_cluster_ready
done

echo
echo "All upgrade hops complete."
run_talosctl version
kubectl get nodes -o wide
|
||||
Reference in New Issue
Block a user