From 663ebc50014cc6ef7a8535ded8fdb7c3c3fbd90a Mon Sep 17 00:00:00 2001 From: Nikholas Pcenicni <82239765+nikpcenicni@users.noreply.github.com> Date: Wed, 13 May 2026 23:48:12 -0400 Subject: [PATCH] Add Trivy integration to noble cluster setup, including namespace and application configurations. Update README and playbook tags to reflect new security scanning capabilities. Enhance Longhorn and kube-prometheus-stack deployment reliability with increased timeout settings and retry mechanisms. --- ansible/README.md | 5 +- ansible/playbooks/noble.yml | 14 ++-- ansible/roles/helm_repos/defaults/main.yml | 1 + .../roles/noble_longhorn/defaults/main.yml | 4 + ansible/roles/noble_longhorn/tasks/main.yml | 4 + .../roles/noble_platform/defaults/main.yml | 8 ++ ansible/roles/noble_platform/tasks/main.yml | 75 ++++++++++++++++++- ansible/roles/noble_trivy/defaults/main.yml | 3 + ansible/roles/noble_trivy/tasks/main.yml | 33 ++++++++ .../argocd/app-of-apps/kustomization.yaml | 1 + .../trivy-operator-application.yaml | 29 +++++++ .../kube-prometheus-stack/values.yaml | 9 ++- clusters/noble/bootstrap/kustomization.yaml | 1 + .../bootstrap/kyverno/policies-values.yaml | 11 ++- clusters/noble/bootstrap/trivy/namespace.yaml | 10 +++ clusters/noble/bootstrap/trivy/values.yaml | 28 +++++++ 16 files changed, 223 insertions(+), 13 deletions(-) create mode 100644 ansible/roles/noble_trivy/defaults/main.yml create mode 100644 ansible/roles/noble_trivy/tasks/main.yml create mode 100644 clusters/noble/bootstrap/argocd/app-of-apps/trivy-operator-application.yaml create mode 100644 clusters/noble/bootstrap/trivy/namespace.yaml create mode 100644 clusters/noble/bootstrap/trivy/values.yaml diff --git a/ansible/README.md b/ansible/README.md index cb95661..3b093c3 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -1,6 +1,6 @@ # Ansible — noble cluster -Automates [`talos/CLUSTER-BUILD.md`](../talos/CLUSTER-BUILD.md): optional **Talos Phase A** (genconfig → apply → bootstrap → kubeconfig), then **Phase B+** (CNI → add-ons → ingress → Argo CD → Kyverno → observability, etc.). **Argo CD** does not reconcile core charts — optional GitOps starts from an empty [`clusters/noble/apps/kustomization.yaml`](../clusters/noble/apps/kustomization.yaml). +Automates [`talos/CLUSTER-BUILD.md`](../talos/CLUSTER-BUILD.md): optional **Talos Phase A** (genconfig → apply → bootstrap → kubeconfig), then **Phase B+** (CNI → add-ons → ingress → Argo CD → Kyverno → observability → Trivy, etc.). **Argo CD** does not reconcile core charts — optional GitOps starts from an empty [`clusters/noble/apps/kustomization.yaml`](../clusters/noble/apps/kustomization.yaml). ## Order of operations @@ -73,6 +73,7 @@ Override with `-e` when needed, e.g. **`-e noble_talos_skip_bootstrap=true`** if ```bash ansible-playbook playbooks/noble.yml --tags cilium,metallb +ansible-playbook playbooks/noble.yml --tags trivy ansible-playbook playbooks/noble.yml --skip-tags newt ansible-playbook playbooks/noble.yml --tags velero -e noble_velero_install=true -e noble_velero_s3_bucket=... -e noble_velero_s3_url=... ``` @@ -88,7 +89,7 @@ ansible-playbook playbooks/noble.yml --tags velero -e noble_velero_install=true |------|----------| | `talos_phase_a` | Talos genconfig, apply-config, bootstrap, kubeconfig | | `helm_repos` | `helm repo add` / `update` | -| `noble_*` | Cilium, CSI Volume Snapshot CRDs + controller, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack, Velero (optional) | +| `noble_*` | Cilium, CSI Volume Snapshot CRDs + controller, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack, **Trivy Operator**, Velero (optional) | | `noble_landing_urls` | Writes **`ansible/output/noble-lab-ui-urls.md`** — URLs, service names, and (optional) Argo/Grafana passwords from Secrets | | `noble_post_deploy` | Post-install reminders | | `talos_bootstrap` | Genconfig-only (used by older playbook) | diff --git a/ansible/playbooks/noble.yml b/ansible/playbooks/noble.yml index 25a7a41..34410e0 100644 --- a/ansible/playbooks/noble.yml +++ b/ansible/playbooks/noble.yml @@ -4,7 +4,7 @@ # Run from repo **ansible/** directory: ansible-playbook playbooks/noble.yml # # Tags: repos, cilium, csi_snapshot, metrics, longhorn, metallb, kube_vip, traefik, cert_manager, newt, -# argocd, kyverno, kyverno_policies, platform, velero, all (default) +# argocd, kyverno, kyverno_policies, platform, trivy, velero, all (default) - name: Noble cluster — platform stack (Ansible-managed) hosts: localhost connection: local @@ -206,6 +206,12 @@ tags: [csi_snapshot, snapshot, storage] - role: noble_metrics_server tags: [metrics, metrics_server] + # Kyverno before Longhorn: Longhorn post-upgrade Job is admitted through Kyverno; policies use + # failurePolicy Ignore so webhook transport timeouts do not fail Helm (see policies-values.yaml). + - role: noble_kyverno + tags: [kyverno, policy] + - role: noble_kyverno_policies + tags: [kyverno_policies, policy] - role: noble_longhorn tags: [longhorn, storage] - role: noble_metallb @@ -220,12 +226,10 @@ tags: [newt] - role: noble_argocd tags: [argocd, gitops] - - role: noble_kyverno - tags: [kyverno, policy] - - role: noble_kyverno_policies - tags: [kyverno_policies, policy] - role: noble_platform tags: [platform, observability, apps] + - role: noble_trivy + tags: [trivy, security, scanning] - role: noble_velero tags: [velero, backups] - role: noble_landing_urls diff --git a/ansible/roles/helm_repos/defaults/main.yml b/ansible/roles/helm_repos/defaults/main.yml index f543ed3..e97da7f 100644 --- a/ansible/roles/helm_repos/defaults/main.yml +++ b/ansible/roles/helm_repos/defaults/main.yml @@ -14,3 +14,4 @@ noble_helm_repos: - { name: headlamp, url: "https://kubernetes-sigs.github.io/headlamp/" } - { name: kyverno, url: "https://kyverno.github.io/kyverno/" } - { name: vmware-tanzu, url: "https://vmware-tanzu.github.io/helm-charts" } + - { name: aqua, url: "https://aquasecurity.github.io/helm-charts/" } diff --git a/ansible/roles/noble_longhorn/defaults/main.yml b/ansible/roles/noble_longhorn/defaults/main.yml index ff43501..b15740a 100644 --- a/ansible/roles/noble_longhorn/defaults/main.yml +++ b/ansible/roles/noble_longhorn/defaults/main.yml @@ -2,3 +2,7 @@ # Helm --wait default (5m) is often too short for first Longhorn install on several nodes # (image pulls + manager/driver ordering). See ansible/roles/noble_metallb/defaults/main.yml. noble_helm_longhorn_wait_timeout: 20m + +# Transient Kyverno webhook timeouts during post-upgrade hooks / admission storms. +noble_helm_longhorn_retries: 8 +noble_helm_longhorn_retry_delay: 25 diff --git a/ansible/roles/noble_longhorn/tasks/main.yml b/ansible/roles/noble_longhorn/tasks/main.yml index 1d76add..70d319d 100644 --- a/ansible/roles/noble_longhorn/tasks/main.yml +++ b/ansible/roles/noble_longhorn/tasks/main.yml @@ -31,4 +31,8 @@ - "{{ noble_helm_longhorn_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_longhorn_helm + retries: "{{ noble_helm_longhorn_retries | int }}" + delay: "{{ noble_helm_longhorn_retry_delay | int }}" + until: noble_longhorn_helm.rc == 0 changed_when: true diff --git a/ansible/roles/noble_platform/defaults/main.yml b/ansible/roles/noble_platform/defaults/main.yml index a53fc0c..15a3947 100644 --- a/ansible/roles/noble_platform/defaults/main.yml +++ b/ansible/roles/noble_platform/defaults/main.yml @@ -4,6 +4,14 @@ noble_platform_kubectl_request_timeout: 120s noble_platform_kustomize_retries: 5 noble_platform_kustomize_delay: 20 +# kube-prometheus-stack: operator Deployment uses Kubernetes default progressDeadlineSeconds (600s). +# First install (images + cert-manager webhook TLS) can exceed that; patch + optional rollout restart, then Helm --wait. +noble_platform_kube_prometheus_operator_progress_deadline_seconds: 1800 +noble_platform_kube_prometheus_operator_wait_retries: 60 +noble_platform_kube_prometheus_operator_wait_delay: 5 +# Longhorn PVCs + full stack often need 45-60m; node-exporter DaemonSet can be last at 3/4 until one node catches up. +noble_platform_kube_prometheus_helm_wait_timeout: 60m + # Decrypt **clusters/noble/secrets/*.yaml** with SOPS and kubectl apply (requires **sops**, **age**, and **age-key.txt**). noble_apply_sops_secrets: true noble_sops_age_key_file: "{{ noble_repo_root }}/age-key.txt" diff --git a/ansible/roles/noble_platform/tasks/main.yml b/ansible/roles/noble_platform/tasks/main.yml index 67b8ede..ce5d273 100644 --- a/ansible/roles/noble_platform/tasks/main.yml +++ b/ansible/roles/noble_platform/tasks/main.yml @@ -38,7 +38,78 @@ - noble_sops_age_key_stat.stat.exists changed_when: true -- name: Install kube-prometheus-stack +# Helm --wait alone cannot extend the operator Deployment's progressDeadlineSeconds (default 10m). +- name: Install kube-prometheus-stack (apply without Helm wait) + ansible.builtin.command: + argv: + - helm + - upgrade + - --install + - kube-prometheus + - prometheus-community/kube-prometheus-stack + - -n + - monitoring + - --version + - "82.15.1" + - -f + - "{{ noble_repo_root }}/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml" + - --force-conflicts + - --wait=false + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + changed_when: true + +- name: Wait for prometheus-operator Deployment object + ansible.builtin.command: + argv: + - kubectl + - get + - deployment/kube-prometheus-kube-prome-operator + - -n + - monitoring + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_kube_prom_operator_deploy + until: noble_kube_prom_operator_deploy.rc == 0 + retries: "{{ noble_platform_kube_prometheus_operator_wait_retries | int }}" + delay: "{{ noble_platform_kube_prometheus_operator_wait_delay | int }}" + changed_when: false + +- name: Extend prometheus-operator Deployment progress deadline + ansible.builtin.command: + argv: + - kubectl + - patch + - deployment/kube-prometheus-kube-prome-operator + - -n + - monitoring + - --type=merge + - -p + - "{{ {'spec': {'progressDeadlineSeconds': (noble_platform_kube_prometheus_operator_progress_deadline_seconds | int)}} | to_json }}" + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + changed_when: true + +- name: Restart prometheus-operator if Deployment already hit progress deadline + ansible.builtin.shell: | + set -euo pipefail + dep=kube-prometheus-kube-prome-operator + msg=$(kubectl get deployment "$dep" -n monitoring -o jsonpath='{.status.conditions[?(@.type=="Progressing")].message}' 2>/dev/null || true) + reason=$(kubectl get deployment "$dep" -n monitoring -o jsonpath='{.status.conditions[?(@.type=="Progressing")].reason}' 2>/dev/null || true) + combined="${reason}${msg}" + if printf '%s' "$combined" | grep -qiE 'ProgressDeadlineExceeded|progress[[:space:]]*deadline[[:space:]]*exceeded'; then + kubectl rollout restart deployment/"$dep" -n monitoring + echo restarted + fi + args: + executable: /bin/bash + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_kube_prom_operator_restart + changed_when: "'restarted' in noble_kube_prom_operator_restart.stdout" + +# Helm --wait prints nothing until done or timeout; override noble_platform_kube_prometheus_helm_wait_timeout if needed. +- name: Install kube-prometheus-stack (Helm wait for full release; often 30-60m silent - watch kubectl -n monitoring get pods,ds,pvc) ansible.builtin.command: argv: - helm @@ -55,7 +126,7 @@ - --force-conflicts - --wait - --timeout - - 30m + - "{{ noble_platform_kube_prometheus_helm_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true diff --git a/ansible/roles/noble_trivy/defaults/main.yml b/ansible/roles/noble_trivy/defaults/main.yml new file mode 100644 index 0000000..f4643a8 --- /dev/null +++ b/ansible/roles/noble_trivy/defaults/main.yml @@ -0,0 +1,3 @@ +--- +noble_trivy_chart_version: "0.32.1" +noble_helm_trivy_wait_timeout: 15m diff --git a/ansible/roles/noble_trivy/tasks/main.yml b/ansible/roles/noble_trivy/tasks/main.yml new file mode 100644 index 0000000..d6a1059 --- /dev/null +++ b/ansible/roles/noble_trivy/tasks/main.yml @@ -0,0 +1,33 @@ +--- +- name: Apply trivy-system namespace (PSA) + ansible.builtin.command: + argv: + - kubectl + - apply + - -f + - "{{ noble_repo_root }}/clusters/noble/bootstrap/trivy/namespace.yaml" + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + changed_when: true + +- name: Install Trivy Operator + ansible.builtin.command: + argv: + - helm + - upgrade + - --install + - trivy-operator + - aqua/trivy-operator + - -n + - trivy-system + - --version + - "{{ noble_trivy_chart_version }}" + - -f + - "{{ noble_repo_root }}/clusters/noble/bootstrap/trivy/values.yaml" + - --force-conflicts + - --wait + - --timeout + - "{{ noble_helm_trivy_wait_timeout }}" + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + changed_when: true diff --git a/clusters/noble/bootstrap/argocd/app-of-apps/kustomization.yaml b/clusters/noble/bootstrap/argocd/app-of-apps/kustomization.yaml index 43ef591..93e29f5 100644 --- a/clusters/noble/bootstrap/argocd/app-of-apps/kustomization.yaml +++ b/clusters/noble/bootstrap/argocd/app-of-apps/kustomization.yaml @@ -18,3 +18,4 @@ resources: - loki-application.yaml - fluent-bit-application.yaml - headlamp-application.yaml + - trivy-operator-application.yaml diff --git a/clusters/noble/bootstrap/argocd/app-of-apps/trivy-operator-application.yaml b/clusters/noble/bootstrap/argocd/app-of-apps/trivy-operator-application.yaml new file mode 100644 index 0000000..81e748d --- /dev/null +++ b/clusters/noble/bootstrap/argocd/app-of-apps/trivy-operator-application.yaml @@ -0,0 +1,29 @@ +# Bootstrap app-of-apps leaf: Trivy Operator (vulnerability + config audit reports). +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: noble-trivy-operator + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io/background +spec: + project: default + sources: + - repoURL: https://aquasecurity.github.io/helm-charts/ + chart: trivy-operator + targetRevision: 0.32.1 + helm: + releaseName: trivy-operator + valueFiles: + - $values/clusters/noble/bootstrap/trivy/values.yaml + - repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git + targetRevision: HEAD + ref: values + destination: + server: https://kubernetes.default.svc + namespace: trivy-system + # Manual sync: Ansible helm runs first; enable automation after cutover (see ../README.md §5). + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true diff --git a/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml b/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml index 0caf8ad..4ce8072 100644 --- a/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml +++ b/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml @@ -7,10 +7,10 @@ # kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts # helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \ -# --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 30m +# --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m # # Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes). -# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress, +# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress, # open a second terminal: kubectl -n monitoring get pods,sts,ds -w # To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w # @@ -36,6 +36,11 @@ crds: upgradeJob: enabled: false +# Subchart defaults only tolerate NoSchedule; a node with other taints leaves node-exporter at 3/4 and Helm --wait times out. +prometheus-node-exporter: + tolerations: + - operator: Exists + # --- Longhorn-backed persistence (default chart storage is emptyDir) --- alertmanager: alertmanagerSpec: diff --git a/clusters/noble/bootstrap/kustomization.yaml b/clusters/noble/bootstrap/kustomization.yaml index 80f0be8..db2bc0a 100644 --- a/clusters/noble/bootstrap/kustomization.yaml +++ b/clusters/noble/bootstrap/kustomization.yaml @@ -17,4 +17,5 @@ resources: - velero/namespace.yaml - velero/longhorn-volumesnapshotclass.yaml - headlamp/namespace.yaml + - trivy/namespace.yaml - grafana-loki-datasource/loki-datasource.yaml diff --git a/clusters/noble/bootstrap/kyverno/policies-values.yaml b/clusters/noble/bootstrap/kyverno/policies-values.yaml index 6a6fe09..52cd7fc 100644 --- a/clusters/noble/bootstrap/kyverno/policies-values.yaml +++ b/clusters/noble/bootstrap/kyverno/policies-values.yaml @@ -9,6 +9,12 @@ # outside baseline (see namespace PSA labels under clusters/noble/bootstrap/*/namespace.yaml) # plus core Kubernetes namespaces and every Ansible-managed app namespace on noble. # +# failurePolicy **Ignore** (chart default is Fail): when the apiserver cannot reach Kyverno +# within the webhook timeout (e.g. admission overloaded during Helm hooks / Longhorn +# post-upgrade Job), Fail denies the request and breaks installs. Ignore allows the request +# through on transport failure only — policy violations are still handled per +# validationFailureAction when Kyverno responds. +# # After widening excludes, Kyverno does not always prune old PolicyReport rows; refresh: # kubectl delete clusterpolicyreport --all # kubectl delete policyreport -A --all @@ -22,10 +28,10 @@ policyType: ClusterPolicy podSecurityStandard: baseline podSecuritySeverity: medium validationFailureAction: Audit -failurePolicy: Fail +failurePolicy: Ignore validationAllowExistingViolations: true -# All platform namespaces on noble (ansible/playbooks/noble.yml + clusters/noble/bootstrap). +# All platform namespaces on noble (ansible/playbooks/noble.yml + clusters/noble/bootstrap). Includes **trivy-system**. x-kyverno-exclude-infra: &kyverno_exclude_infra any: - resources: @@ -44,6 +50,7 @@ x-kyverno-exclude-infra: &kyverno_exclude_infra - monitoring - newt - traefik + - trivy-system policyExclude: disallow-capabilities: *kyverno_exclude_infra diff --git a/clusters/noble/bootstrap/trivy/namespace.yaml b/clusters/noble/bootstrap/trivy/namespace.yaml new file mode 100644 index 0000000..c49863c --- /dev/null +++ b/clusters/noble/bootstrap/trivy/namespace.yaml @@ -0,0 +1,10 @@ +# Trivy Operator — apply before Helm (Ansible **noble_trivy**). +# Scan jobs may use elevated capabilities; align with other operator namespaces. +apiVersion: v1 +kind: Namespace +metadata: + name: trivy-system + labels: + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged diff --git a/clusters/noble/bootstrap/trivy/values.yaml b/clusters/noble/bootstrap/trivy/values.yaml new file mode 100644 index 0000000..92b25d5 --- /dev/null +++ b/clusters/noble/bootstrap/trivy/values.yaml @@ -0,0 +1,28 @@ +# Trivy Operator — in-cluster image vulnerability + config reports (Aqua trivy-operator Helm chart). +# +# helm repo add aqua https://aquasecurity.github.io/helm-charts/ && helm repo update +# kubectl apply -f clusters/noble/bootstrap/trivy/namespace.yaml +# helm upgrade --install trivy-operator aqua/trivy-operator -n trivy-system \ +# --version 0.32.1 -f clusters/noble/bootstrap/trivy/values.yaml --wait --timeout 15m +# +# Inspect: kubectl get vulnerabilityreports,configauditreports -A +# Docs: https://aquasecurity.github.io/trivy-operator/ + +# Skip platform/system namespaces (mirrors Kyverno excludes; reduces scan load). +excludeNamespaces: "argocd,cert-manager,headlamp,kyverno,local-path-storage,logging,longhorn-system,loki,metallb-system,monitoring,newt,traefik,trivy-system,velero,kube-node-lease,kube-public,kube-system" + +operator: + scanJobsConcurrentLimit: 5 + # SBOM / cluster compliance add CPU and CR volume; keep vulnerability + config audit. + sbomGenerationEnabled: false + clusterSbomCacheEnabled: false + clusterComplianceEnabled: false + +trivyOperator: + # Run scan Jobs on every node (Talos / mixed taints). + scanJobTolerations: + - operator: Exists + +serviceMonitor: + enabled: true + namespace: monitoring