Add Trivy integration to noble cluster setup, including namespace and application configurations. Update README and playbook tags to reflect new security scanning capabilities. Enhance Longhorn and kube-prometheus-stack deployment reliability with increased timeout settings and retry mechanisms.

This commit is contained in:
Nikholas Pcenicni
2026-05-13 23:48:12 -04:00
parent 774b97894c
commit 663ebc5001
16 changed files with 223 additions and 13 deletions

View File

@@ -18,3 +18,4 @@ resources:
- loki-application.yaml
- fluent-bit-application.yaml
- headlamp-application.yaml
- trivy-operator-application.yaml

View File

@@ -0,0 +1,29 @@
# Bootstrap app-of-apps leaf: Trivy Operator (vulnerability + config audit reports).
#
# Multi-source Application: the first source is the upstream Helm chart, the
# second is this Git repository exposed under the ref name "values" so the
# chart can read its values file from it via the $values/ prefix.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: noble-trivy-operator
  namespace: argocd
  finalizers:
    - resources-finalizer.argocd.argoproj.io/background
spec:
  project: default
  sources:
    # Upstream chart, pinned to an exact chart version.
    - repoURL: https://aquasecurity.github.io/helm-charts/
      chart: trivy-operator
      targetRevision: 0.32.1
      helm:
        releaseName: trivy-operator
        valueFiles:
          # $values refers to the source below that declares `ref: values`.
          - $values/clusters/noble/bootstrap/trivy/values.yaml
    # Values-only source: contributes files, not manifests.
    - repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git
      targetRevision: HEAD
      ref: values
  destination:
    server: https://kubernetes.default.svc
    namespace: trivy-system
  # Manual sync: Ansible helm runs first; enable automation after cutover (see ../README.md §5).
  syncPolicy:
    syncOptions:
      - CreateNamespace=true
      - ServerSideApply=true

View File

@@ -7,10 +7,10 @@
# kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
# --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 30m
# --version 82.15.1 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m
#
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress,
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
# To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
#
@@ -36,6 +36,11 @@ crds:
upgradeJob:
enabled: false
# Subchart defaults only tolerate NoSchedule; a node with other taints leaves node-exporter at 3/4 and Helm --wait times out.
# A toleration with only `operator: Exists` (no key, no effect) matches every taint.
prometheus-node-exporter:
  tolerations:
    - operator: Exists
# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
alertmanager:
alertmanagerSpec:

View File

@@ -17,4 +17,5 @@ resources:
- velero/namespace.yaml
- velero/longhorn-volumesnapshotclass.yaml
- headlamp/namespace.yaml
- trivy/namespace.yaml
- grafana-loki-datasource/loki-datasource.yaml

View File

@@ -9,6 +9,12 @@
# outside baseline (see namespace PSA labels under clusters/noble/bootstrap/*/namespace.yaml)
# plus core Kubernetes namespaces and every Ansible-managed app namespace on noble.
#
# failurePolicy **Ignore** (chart default is Fail): when the apiserver cannot reach Kyverno
# within the webhook timeout (e.g. admission overloaded during Helm hooks / Longhorn
# post-upgrade Job), Fail denies the request and breaks installs. Ignore allows the request
# through on transport failure only — policy violations are still handled per
# validationFailureAction when Kyverno responds.
#
# After widening excludes, Kyverno does not always prune old PolicyReport rows; refresh:
# kubectl delete clusterpolicyreport --all
# kubectl delete policyreport -A --all
@@ -22,10 +28,10 @@ policyType: ClusterPolicy
podSecurityStandard: baseline
podSecuritySeverity: medium
validationFailureAction: Audit
failurePolicy: Fail
failurePolicy: Ignore
validationAllowExistingViolations: true
# All platform namespaces on noble (ansible/playbooks/noble.yml + clusters/noble/bootstrap).
# All platform namespaces on noble (ansible/playbooks/noble.yml + clusters/noble/bootstrap). Includes **trivy-system**.
x-kyverno-exclude-infra: &kyverno_exclude_infra
any:
- resources:
@@ -44,6 +50,7 @@ x-kyverno-exclude-infra: &kyverno_exclude_infra
- monitoring
- newt
- traefik
- trivy-system
policyExclude:
disallow-capabilities: *kyverno_exclude_infra

View File

@@ -0,0 +1,10 @@
# Trivy Operator — apply before Helm (Ansible **noble_trivy**).
# Scan jobs may use elevated capabilities; align with other operator namespaces.
apiVersion: v1
kind: Namespace
metadata:
  name: trivy-system
  labels:
    # Pod Security Admission: all three modes set to the least restrictive level.
    pod-security.kubernetes.io/enforce: privileged
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/warn: privileged

View File

@@ -0,0 +1,28 @@
# Trivy Operator — in-cluster image vulnerability + config reports (Aqua trivy-operator Helm chart).
#
#   helm repo add aqua https://aquasecurity.github.io/helm-charts/ && helm repo update
#   kubectl apply -f clusters/noble/bootstrap/trivy/namespace.yaml
#   helm upgrade --install trivy-operator aqua/trivy-operator -n trivy-system \
#     --version 0.32.1 -f clusters/noble/bootstrap/trivy/values.yaml --wait --timeout 15m
#
# Inspect: kubectl get vulnerabilityreports,configauditreports -A
# Docs: https://aquasecurity.github.io/trivy-operator/

# Skip platform/system namespaces (mirrors Kyverno excludes; reduces scan load).
# Single comma-separated string, not a YAML list.
excludeNamespaces: "argocd,cert-manager,headlamp,kyverno,local-path-storage,logging,longhorn-system,loki,metallb-system,monitoring,newt,traefik,trivy-system,velero,kube-node-lease,kube-public,kube-system"

operator:
  scanJobsConcurrentLimit: 5
  # SBOM / cluster compliance add CPU and CR volume; keep vulnerability + config audit.
  sbomGenerationEnabled: false
  clusterSbomCacheEnabled: false
  clusterComplianceEnabled: false

trivyOperator:
  # Run scan Jobs on every node (Talos / mixed taints).
  # `operator: Exists` with no key/effect tolerates every taint.
  scanJobTolerations:
    - operator: Exists

# ServiceMonitor is placed in the monitoring namespace rather than the release namespace.
serviceMonitor:
  enabled: true
  namespace: monitoring