Add Trivy integration to noble cluster setup, including namespace and application configurations. Update README and playbook tags to reflect new security scanning capabilities. Enhance Longhorn and kube-prometheus-stack deployment reliability with increased timeout settings and retry mechanisms.

2026-05-13 23:48:12 -04:00
parent 774b97894c
commit 663ebc5001
16 changed files with 223 additions and 13 deletions
--- a/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml
+++ b/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml
@@ -7,10 +7,10 @@
 #   kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml
 #   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
 #   helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
-#     --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 30m
+#     --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m
 #
 # Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
-# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
+# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress,
 # open a second terminal: kubectl -n monitoring get pods,sts,ds -w
 # To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
 #
@@ -36,6 +36,11 @@ crds:
  upgradeJob:
    enabled: false

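+# Subchart default toleration only matches NoSchedule taints; a node carrying any other taint (e.g. NoExecute) gets no node-exporter pod, so the DaemonSet stalls short of ready (e.g. 3/4) and Helm --wait times out. A key-less "operator: Exists" toleration matches every taint.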
+prometheus-node-exporter:
+  tolerations:
+    - operator: Exists
+
 # --- Longhorn-backed persistence (default chart storage is emptyDir) ---
 alertmanager:
  alertmanagerSpec: