Add Trivy integration to noble cluster setup, including namespace and application configurations. Update README and playbook tags to reflect new security scanning capabilities. Enhance Longhorn and kube-prometheus-stack deployment reliability with increased timeout settings and retry mechanisms.

2026-05-13 23:48:12 -04:00
parent 774b97894c
commit 663ebc5001
16 changed files with 223 additions and 13 deletions
--- a/clusters/noble/bootstrap/argocd/app-of-apps/kustomization.yaml
+++ b/clusters/noble/bootstrap/argocd/app-of-apps/kustomization.yaml
@@ -18,3 +18,4 @@ resources:
  - loki-application.yaml
  - fluent-bit-application.yaml
  - headlamp-application.yaml
+  - trivy-operator-application.yaml
--- a/clusters/noble/bootstrap/argocd/app-of-apps/trivy-operator-application.yaml
+++ b/clusters/noble/bootstrap/argocd/app-of-apps/trivy-operator-application.yaml
@@ -0,0 +1,29 @@
+# Bootstrap app-of-apps leaf: Trivy Operator (vulnerability + config audit reports). Multi-source: Aqua Helm chart plus a values file from this Git repo.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: noble-trivy-operator
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io/background
+spec:
+  project: default
+  sources:
+    - repoURL: https://aquasecurity.github.io/helm-charts/
+      chart: trivy-operator
+      targetRevision: 0.32.1  # same chart version as the manual helm command in clusters/noble/bootstrap/trivy/values.yaml
+      helm:
+        releaseName: trivy-operator
+        valueFiles:
+          - $values/clusters/noble/bootstrap/trivy/values.yaml  # $values = the Git source below (ref: values)
+    - repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git
+      targetRevision: HEAD
+      ref: values  # referenced as $values in valueFiles above; this source only supplies the values file
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: trivy-system
+  # Manual sync (no `automated:` block below): the Ansible Helm install runs first; enable automation after cutover (see ../README.md §5).
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
--- a/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml
+++ b/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml
@@ -7,10 +7,10 @@
 #   kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml
 #   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
 #   helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
-#     --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 30m
+#     --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m
 #
 # Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
-# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
+# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress,
 # open a second terminal: kubectl -n monitoring get pods,sts,ds -w
 # To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
 #
@@ -36,6 +36,11 @@ crds:
  upgradeJob:
    enabled: false

+# Subchart defaults only tolerate NoSchedule; a node with any other taint leaves the node-exporter DaemonSet short (e.g. 3/4 pods ready) and Helm --wait times out. A bare "operator: Exists" tolerates every taint.
+prometheus-node-exporter:
+  tolerations:
+    - operator: Exists
+
 # --- Longhorn-backed persistence (default chart storage is emptyDir) ---
 alertmanager:
  alertmanagerSpec:
--- a/clusters/noble/bootstrap/kustomization.yaml
+++ b/clusters/noble/bootstrap/kustomization.yaml
@@ -17,4 +17,5 @@ resources:
  - velero/namespace.yaml
  - velero/longhorn-volumesnapshotclass.yaml
  - headlamp/namespace.yaml
+  - trivy/namespace.yaml
  - grafana-loki-datasource/loki-datasource.yaml
--- a/clusters/noble/bootstrap/kyverno/policies-values.yaml
+++ b/clusters/noble/bootstrap/kyverno/policies-values.yaml
@@ -9,6 +9,12 @@
 # outside baseline (see namespace PSA labels under clusters/noble/bootstrap/*/namespace.yaml)
 # plus core Kubernetes namespaces and every Ansible-managed app namespace on noble.
 #
+# failurePolicy **Ignore** (chart default is Fail): when the apiserver cannot reach Kyverno
+# within the webhook timeout (e.g. admission overloaded during Helm hooks / Longhorn
+# post-upgrade Job), Fail denies the request and breaks installs. Ignore allows the request
+# through on transport failure only — policy violations are still handled per
+# validationFailureAction when Kyverno responds.
+#
 # After widening excludes, Kyverno does not always prune old PolicyReport rows; refresh:
 #   kubectl delete clusterpolicyreport --all
 #   kubectl delete policyreport -A --all
@@ -22,10 +28,10 @@ policyType: ClusterPolicy
 podSecurityStandard: baseline
 podSecuritySeverity: medium
 validationFailureAction: Audit
-failurePolicy: Fail
+failurePolicy: Ignore
 validationAllowExistingViolations: true

-# All platform namespaces on noble (ansible/playbooks/noble.yml + clusters/noble/bootstrap).
+# All platform namespaces on noble (ansible/playbooks/noble.yml + clusters/noble/bootstrap). Includes **trivy-system**.
 x-kyverno-exclude-infra: &kyverno_exclude_infra
  any:
    - resources:
@@ -44,6 +50,7 @@ x-kyverno-exclude-infra: &kyverno_exclude_infra
          - monitoring
          - newt
          - traefik
+          - trivy-system

 policyExclude:
  disallow-capabilities: *kyverno_exclude_infra
--- a/clusters/noble/bootstrap/trivy/namespace.yaml
+++ b/clusters/noble/bootstrap/trivy/namespace.yaml
@@ -0,0 +1,10 @@
+# Trivy Operator namespace — apply before the Helm install (Ansible **noble_trivy**).
+# Pod Security labels (enforce/audit/warn) are all "privileged": scan jobs may use elevated capabilities; align with other operator namespaces.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: trivy-system
+  labels:
+    pod-security.kubernetes.io/enforce: privileged
+    pod-security.kubernetes.io/audit: privileged
+    pod-security.kubernetes.io/warn: privileged
--- a/clusters/noble/bootstrap/trivy/values.yaml
+++ b/clusters/noble/bootstrap/trivy/values.yaml
@@ -0,0 +1,28 @@
+# Trivy Operator Helm values — in-cluster image vulnerability + config reports (Aqua trivy-operator Helm chart).
+#
+#   helm repo add aqua https://aquasecurity.github.io/helm-charts/ && helm repo update
+#   kubectl apply -f clusters/noble/bootstrap/trivy/namespace.yaml
+#   helm upgrade --install trivy-operator aqua/trivy-operator -n trivy-system \
+#     --version 0.32.1 -f clusters/noble/bootstrap/trivy/values.yaml --wait --timeout 15m
+#
+# Inspect: kubectl get vulnerabilityreports,configauditreports -A
+# Docs: https://aquasecurity.github.io/trivy-operator/
+
+# Skip platform/system namespaces (mirrors Kyverno excludes; reduces scan load). The value is one comma-separated string, not a YAML list.
+excludeNamespaces: "argocd,cert-manager,headlamp,kyverno,local-path-storage,logging,longhorn-system,loki,metallb-system,monitoring,newt,traefik,trivy-system,velero,kube-node-lease,kube-public,kube-system"
+
+operator:
+  scanJobsConcurrentLimit: 5
+  # SBOM generation, the cluster SBOM cache and cluster compliance are switched off (they add CPU and CR volume); vulnerability + config audit are not touched here.
+  sbomGenerationEnabled: false
+  clusterSbomCacheEnabled: false
+  clusterComplianceEnabled: false
+
+trivyOperator:
+  # Tolerate every taint (bare `operator: Exists`) so scan Jobs can run on every node (Talos / mixed taints).
+  scanJobTolerations:
+    - operator: Exists
+
+serviceMonitor:
+  enabled: true
+  namespace: monitoring  # NOTE(review): presumably the namespace where kube-prometheus-stack discovers ServiceMonitors — confirm