From 2bf72779171adba3e1d2322962d35a2af96565d6 Mon Sep 17 00:00:00 2001
From: Nikholas Pcenicni <82239765+nikpcenicni@users.noreply.github.com>
Date: Wed, 13 May 2026 23:59:30 -0400
Subject: [PATCH] Enhance csi-snapshot-controller README with troubleshooting guidance for CrashLoopBackOff issues and update kustomization.yaml to include deployment patch. This improves user experience and deployment reliability.

---
 .../csi-snapshot-controller/README.md | 14 ++++++++++++++
 .../controller/deployment-patch.yaml  | 19 +++++++++++++++++++
 .../controller/kustomization.yaml     |  3 +++
 3 files changed, 36 insertions(+)
 create mode 100644 clusters/noble/bootstrap/csi-snapshot-controller/controller/deployment-patch.yaml

diff --git a/clusters/noble/bootstrap/csi-snapshot-controller/README.md b/clusters/noble/bootstrap/csi-snapshot-controller/README.md
index 81a1b1c..60fd6cb 100644
--- a/clusters/noble/bootstrap/csi-snapshot-controller/README.md
+++ b/clusters/noble/bootstrap/csi-snapshot-controller/README.md
@@ -14,3 +14,17 @@ kubectl -n kube-system rollout status deploy/snapshot-controller --timeout=120s
 ```
 
 After this, create or label a **VolumeSnapshotClass** for Longhorn (`velero.io/csi-volumesnapshot-class: "true"`) per `clusters/noble/bootstrap/velero/README.md`.
+
+## Troubleshooting
+
+If **`snapshot-controller`** is in **CrashLoopBackOff** with exit code **255** or **1**, check the logs of the previous container:
+
+```bash
+kubectl -n kube-system logs deploy/snapshot-controller --previous --tail=80
+```
+
+Typical causes:
+
+1. **Volume Snapshot CRDs missing or wrong version** — re-apply: `kubectl apply -k clusters/noble/bootstrap/csi-snapshot-controller/crd`
+2. **Volume group snapshot APIs** — this repo installs **GA** `VolumeSnapshot*` CRDs only. The controller overlay sets **`--feature-gates=CSIVolumeGroupSnapshot=false`** so the binary does not expect `VolumeGroupSnapshot*` CRDs.
+3. **RBAC** — re-apply the controller kustomize (includes `rbac-snapshot-controller.yaml`).
diff --git a/clusters/noble/bootstrap/csi-snapshot-controller/controller/deployment-patch.yaml b/clusters/noble/bootstrap/csi-snapshot-controller/controller/deployment-patch.yaml
new file mode 100644
index 0000000..570c2e0
--- /dev/null
+++ b/clusters/noble/bootstrap/csi-snapshot-controller/controller/deployment-patch.yaml
@@ -0,0 +1,19 @@
+# Patch applied on top of upstream setup-snapshot-controller.yaml (external-snapshotter v8.5.0).
+# - Only GA VolumeSnapshot CRDs are installed (no VolumeGroupSnapshot*), so the group-snapshot
+#   feature gate is turned off explicitly: the controller must not wait on / watch missing APIs.
+# - A longer CRD wait tolerates a loaded apiserver/etcd during bootstrap (avoids startup timeout).
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: snapshot-controller
+  namespace: kube-system
+spec:
+  template:
+    spec:
+      containers:
+        - name: snapshot-controller
+          args:
+            - '--v=5'
+            - '--leader-election=true'
+            - '--feature-gates=CSIVolumeGroupSnapshot=false'
+            - '--retry-crd-interval-max=5m'
diff --git a/clusters/noble/bootstrap/csi-snapshot-controller/controller/kustomization.yaml b/clusters/noble/bootstrap/csi-snapshot-controller/controller/kustomization.yaml
index 230d1b8..85dc98f 100644
--- a/clusters/noble/bootstrap/csi-snapshot-controller/controller/kustomization.yaml
+++ b/clusters/noble/bootstrap/csi-snapshot-controller/controller/kustomization.yaml
@@ -6,3 +6,6 @@ namespace: kube-system
 resources:
   - https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/v8.5.0/deploy/kubernetes/snapshot-controller/rbac-snapshot-controller.yaml
   - https://raw.githubusercontent.com/kubernetes-csi/external-snapshotter/v8.5.0/deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml
+
+patches:
+  - path: deployment-patch.yaml