Add Trivy integration to noble cluster setup, including namespace and application configurations. Update README and playbook tags to reflect new security scanning capabilities. Enhance Longhorn and kube-prometheus-stack deployment reliability with increased timeout settings and retry mechanisms.

This commit is contained in:
Nikholas Pcenicni
2026-05-13 23:48:12 -04:00
parent 774b97894c
commit 663ebc5001
16 changed files with 223 additions and 13 deletions

View File

@@ -7,10 +7,10 @@
# kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
# --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 30m
# --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m
#
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress,
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
# To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
#
@@ -36,6 +36,11 @@ crds:
upgradeJob:
enabled: false
# Subchart defaults only tolerate NoSchedule; a node with other taints leaves node-exporter at 3/4 and Helm --wait times out.
prometheus-node-exporter:
tolerations:
- operator: Exists
# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
alertmanager:
alertmanagerSpec: