From 4263da65d8ad8cafc1e34cf5052d89291976a611 Mon Sep 17 00:00:00 2001 From: Nikholas Pcenicni <82239765+nikpcenicni@users.noreply.github.com> Date: Fri, 27 Mar 2026 19:29:54 -0400 Subject: [PATCH] Update Cilium application.yaml to enhance ignoreDifferences for cilium-operator Deployment and improve Helm sync handling. Modify kube-vip daemonset.yaml to adjust VIP interface and add new environment variables for better configuration. Update README.md with troubleshooting tips for kube-vip and Helm upgrade conflicts. --- clusters/noble/apps/cilium/application.yaml | 11 +++- .../noble/apps/kube-vip/vip-daemonset.yaml | 21 ++++++- talos/README.md | 62 +++++++++++++++---- 3 files changed, 79 insertions(+), 15 deletions(-) diff --git a/clusters/noble/apps/cilium/application.yaml b/clusters/noble/apps/cilium/application.yaml index 41b7d84..a520f6e 100644 --- a/clusters/noble/apps/cilium/application.yaml +++ b/clusters/noble/apps/cilium/application.yaml @@ -7,8 +7,8 @@ metadata: argocd.argoproj.io/sync-wave: "0" spec: project: default - # Helm TLS material for Hubble is rotated/generated; Argo SSA and CLI helm - # upgrades both touch Secret data and cause apply conflicts unless ignored. + # Argo SSA vs CLI helm: ignore generated TLS and fields Argo commonly owns so + # RespectIgnoreDifferences can skip fighting Helm on sync. 
ignoreDifferences: - group: "" kind: Secret @@ -16,6 +16,13 @@ spec: namespace: kube-system jqPathExpressions: - .data + - group: apps + kind: Deployment + name: cilium-operator + namespace: kube-system + jsonPointers: + - /spec/replicas + - /spec/strategy/rollingUpdate/maxUnavailable destination: server: https://kubernetes.default.svc namespace: kube-system diff --git a/clusters/noble/apps/kube-vip/vip-daemonset.yaml b/clusters/noble/apps/kube-vip/vip-daemonset.yaml index 468cd6c..1057de6 100644 --- a/clusters/noble/apps/kube-vip/vip-daemonset.yaml +++ b/clusters/noble/apps/kube-vip/vip-daemonset.yaml @@ -23,6 +23,8 @@ spec: - key: node-role.kubernetes.io/master operator: Exists effect: NoSchedule + - operator: Exists + effect: NoExecute containers: - name: kube-vip image: ghcr.io/kube-vip/kube-vip:v0.8.3 @@ -36,17 +38,32 @@ spec: value: "192.168.50.230" - name: port value: "6443" + # Physical uplink from `talosctl -n get links` (this cluster: ens18). - name: vip_interface - value: "eth0" + value: "ens18" + - name: vip_subnet + value: "32" + - name: vip_leaderelection + value: "true" - name: cp_enable value: "true" + - name: cp_namespace + value: "kube-system" - name: svc_enable value: "true" - - name: servicesElection + # Env is svc_election (not servicesElection); see pkg/kubevip/config_envvar.go + - name: svc_election value: "true" + - name: vip_leaseduration + value: "5" + - name: vip_renewdeadline + value: "3" + - name: vip_retryperiod + value: "1" securityContext: capabilities: add: - NET_ADMIN - NET_RAW + - SYS_TIME diff --git a/talos/README.md b/talos/README.md index 2b1f055..9cae906 100644 --- a/talos/README.md +++ b/talos/README.md @@ -171,17 +171,37 @@ kubectl get pods -n kube-system -l app.kubernetes.io/part-of=cilium -w operators with hard anti-affinity) cannot deadlock `helm --wait` when only one node can take the operator early in bootstrap. 
-If **`helm upgrade` fails** with a server-side apply conflict on -`kube-system/hubble-server-certs` and **`argocd-controller`**, Argo already -synced Cilium and owns that Secret’s TLS fields. The **`cilium` Application** -uses **`ignoreDifferences`** on that Secret plus **`RespectIgnoreDifferences`** -so GitOps and occasional CLI Helm runs do not fight over `.data`. Until that -manifest is applied in the cluster, either **suspend** the `cilium` Application -in Argo, or delete the Secret once (`kubectl delete secret -hubble-server-certs -n kube-system`) and re-run **`helm upgrade --install`** -before Argo reconciles again. After bootstrap, prefer **`kubectl -n argocd get -application cilium -o yaml`** / Argo UI to sync Cilium instead of ad hoc -Helm, unless you suspend the app first. +If **`helm upgrade` fails** with server-side apply conflicts that name +**`argocd-controller`** as the field manager, Argo already synced Cilium and **owns those fields** +on live objects. Clearing **`syncPolicy`** on the Application does **not** +remove that ownership; Helm still conflicts until you **take over** the fields +or use only Argo. + +**One-shot CLI fix** (Helm 4+ with server-side apply): add **`--force-conflicts`** so Helm takes ownership of the +disputed fields: + +```bash +helm upgrade --install cilium cilium/cilium \ + --namespace kube-system \ + --version 1.16.6 \ + -f clusters/noble/apps/cilium/helm-values.yaml \ + --force-conflicts +``` + +Typical conflicts: Secret **`hubble-server-certs`** (`.data` TLS) and +Deployment **`cilium-operator`** (`.spec.replicas`, +`.spec.strategy.rollingUpdate.maxUnavailable`). The **`cilium` Application** +lists **`ignoreDifferences`** for those paths plus **`RespectIgnoreDifferences`** +so later Argo syncs do not keep overwriting them. Apply the manifest after you +change it: **`kubectl apply -f clusters/noble/apps/cilium/application.yaml`**. + +After bootstrap, prefer syncing Cilium **only through Argo** (from Git) instead +of ad hoc Helm, unless you suspend the **`cilium`** Application first. 
+ +Shell tip: a line like **`# comment`** must start with **`#`**; if the shell +reports **`command not found: #`**, the character is not a real hash or the +line was pasted wrong—run **`kubectl apply ...`** as its own command without a +leading comment on the same paste block. If nodes were already `Ready`, you can skip straight to section 7. @@ -234,6 +254,26 @@ kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip-ds -o wide nc -vz 192.168.50.230 6443 ``` +If **`kube-vip-ds` pods are `CrashLoopBackOff`**, logs usually show +`could not get link for interface '…'`. kube-vip binds the VIP to +**`vip_interface`**; on Talos the uplink is often **`eno1`**, **`enp0s…`**, or +**`enx…`**, not **`eth0`**. On a control-plane node IP from `talconfig.yaml`: + +```bash +talosctl -n 192.168.50.20 get links +``` + +Do **not** paste that command’s **table output** back into the shell: zsh runs +each line as a command (e.g. `192.168.50.20` → `command not found`), and a line +starting with **`NODE`** can be mistaken for the **`node`** binary and try to +load a file like **`NAMESPACE`** in the current directory. Also avoid pasting +the **prompt** (`(base) … %`) together with the command (duplicate prompt → +parse errors). + +Set **`vip_interface`** in `clusters/noble/apps/kube-vip/vip-daemonset.yaml` to +that link’s **`metadata.id`**, commit, sync (or `kubectl apply -k +clusters/noble/apps/kube-vip`), and confirm pods go **`Running`**. + ## 9) Argo CD via DNS host (no port) Argo CD is exposed through a kube-vip managed LoadBalancer Service: