Add Trivy integration to noble cluster setup, including namespace and application configurations. Update README and playbook tags to reflect new security scanning capabilities. Enhance Longhorn and kube-prometheus-stack deployment reliability with increased timeout settings and retry mechanisms.

This commit is contained in:
Nikholas Pcenicni
2026-05-13 23:48:12 -04:00
parent 774b97894c
commit 663ebc5001
16 changed files with 223 additions and 13 deletions

View File

@@ -1,6 +1,6 @@
# Ansible — noble cluster
Automates [`talos/CLUSTER-BUILD.md`](../talos/CLUSTER-BUILD.md): optional **Talos Phase A** (genconfig → apply → bootstrap → kubeconfig), then **Phase B+** (CNI → add-ons → ingress → Argo CD → Kyverno → observability, etc.). **Argo CD** does not reconcile core charts — optional GitOps starts from an empty [`clusters/noble/apps/kustomization.yaml`](../clusters/noble/apps/kustomization.yaml).
Automates [`talos/CLUSTER-BUILD.md`](../talos/CLUSTER-BUILD.md): optional **Talos Phase A** (genconfig → apply → bootstrap → kubeconfig), then **Phase B+** (CNI → add-ons → ingress → Argo CD → Kyverno → observability → Trivy, etc.). **Argo CD** does not reconcile core charts — optional GitOps starts from an empty [`clusters/noble/apps/kustomization.yaml`](../clusters/noble/apps/kustomization.yaml).
## Order of operations
@@ -73,6 +73,7 @@ Override with `-e` when needed, e.g. **`-e noble_talos_skip_bootstrap=true`** if
```bash
ansible-playbook playbooks/noble.yml --tags cilium,metallb
ansible-playbook playbooks/noble.yml --tags trivy
ansible-playbook playbooks/noble.yml --skip-tags newt
ansible-playbook playbooks/noble.yml --tags velero -e noble_velero_install=true -e noble_velero_s3_bucket=... -e noble_velero_s3_url=...
```
@@ -88,7 +89,7 @@ ansible-playbook playbooks/noble.yml --tags velero -e noble_velero_install=true
|------|----------|
| `talos_phase_a` | Talos genconfig, apply-config, bootstrap, kubeconfig |
| `helm_repos` | `helm repo add` / `update` |
| `noble_*` | Cilium, CSI Volume Snapshot CRDs + controller, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack, Velero (optional) |
| `noble_*` | Cilium, CSI Volume Snapshot CRDs + controller, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack, **Trivy Operator**, Velero (optional) |
| `noble_landing_urls` | Writes **`ansible/output/noble-lab-ui-urls.md`** — URLs, service names, and (optional) Argo/Grafana passwords from Secrets |
| `noble_post_deploy` | Post-install reminders |
| `talos_bootstrap` | Genconfig-only (used by older playbook) |

View File

@@ -4,7 +4,7 @@
# Run from repo **ansible/** directory: ansible-playbook playbooks/noble.yml
#
# Tags: repos, cilium, csi_snapshot, metrics, longhorn, metallb, kube_vip, traefik, cert_manager, newt,
# argocd, kyverno, kyverno_policies, platform, velero, all (default)
# argocd, kyverno, kyverno_policies, platform, trivy, velero, all (default)
- name: Noble cluster — platform stack (Ansible-managed)
hosts: localhost
connection: local
@@ -206,6 +206,12 @@
tags: [csi_snapshot, snapshot, storage]
- role: noble_metrics_server
tags: [metrics, metrics_server]
# Kyverno before Longhorn: Longhorn post-upgrade Job is admitted through Kyverno; policies use
# failurePolicy Ignore so webhook transport timeouts do not fail Helm (see policies-values.yaml).
- role: noble_kyverno
tags: [kyverno, policy]
- role: noble_kyverno_policies
tags: [kyverno_policies, policy]
- role: noble_longhorn
tags: [longhorn, storage]
- role: noble_metallb
@@ -220,12 +226,10 @@
tags: [newt]
- role: noble_argocd
tags: [argocd, gitops]
- role: noble_kyverno
tags: [kyverno, policy]
- role: noble_kyverno_policies
tags: [kyverno_policies, policy]
- role: noble_platform
tags: [platform, observability, apps]
- role: noble_trivy
tags: [trivy, security, scanning]
- role: noble_velero
tags: [velero, backups]
- role: noble_landing_urls

View File

@@ -14,3 +14,4 @@ noble_helm_repos:
- { name: headlamp, url: "https://kubernetes-sigs.github.io/headlamp/" }
- { name: kyverno, url: "https://kyverno.github.io/kyverno/" }
- { name: vmware-tanzu, url: "https://vmware-tanzu.github.io/helm-charts" }
- { name: aqua, url: "https://aquasecurity.github.io/helm-charts/" }

View File

@@ -2,3 +2,7 @@
# Helm --wait default (5m) is often too short for first Longhorn install on several nodes
# (image pulls + manager/driver ordering). See ansible/roles/noble_metallb/defaults/main.yml.
noble_helm_longhorn_wait_timeout: 20m
# Transient Kyverno webhook timeouts during post-upgrade hooks / admission storms.
# The Longhorn Helm task is repeated until it exits with rc 0:
# *_retries feeds the task's `retries`, *_retry_delay feeds its `delay` (both cast to int).
noble_helm_longhorn_retries: 8
noble_helm_longhorn_retry_delay: 25

View File

@@ -31,4 +31,8 @@
- "{{ noble_helm_longhorn_wait_timeout }}"
environment:
KUBECONFIG: "{{ noble_kubeconfig }}"
register: noble_longhorn_helm
retries: "{{ noble_helm_longhorn_retries | int }}"
delay: "{{ noble_helm_longhorn_retry_delay | int }}"
until: noble_longhorn_helm.rc == 0
changed_when: true

View File

@@ -4,6 +4,14 @@ noble_platform_kubectl_request_timeout: 120s
noble_platform_kustomize_retries: 5
noble_platform_kustomize_delay: 20
# kube-prometheus-stack: operator Deployment uses Kubernetes default progressDeadlineSeconds (600s).
# First install (images + cert-manager webhook TLS) can exceed that; patch + optional rollout restart, then Helm --wait.
# Value merged into the operator Deployment's spec.progressDeadlineSeconds by `kubectl patch`.
noble_platform_kube_prometheus_operator_progress_deadline_seconds: 1800
# Polling budget for the task that waits for the operator Deployment object to exist:
# `kubectl get` is retried up to *_wait_retries times with *_wait_delay between attempts.
noble_platform_kube_prometheus_operator_wait_retries: 60
noble_platform_kube_prometheus_operator_wait_delay: 5
# Longhorn PVCs + full stack often need 45-60m; node-exporter DaemonSet can be last at 3/4 until one node catches up.
# Passed as the Helm --timeout value of the final kube-prometheus-stack install that runs with --wait.
noble_platform_kube_prometheus_helm_wait_timeout: 60m
# Decrypt **clusters/noble/secrets/*.yaml** with SOPS and kubectl apply (requires **sops**, **age**, and **age-key.txt**).
noble_apply_sops_secrets: true
noble_sops_age_key_file: "{{ noble_repo_root }}/age-key.txt"

View File

@@ -38,7 +38,78 @@
- noble_sops_age_key_stat.stat.exists
changed_when: true
- name: Install kube-prometheus-stack
# Helm --wait alone cannot extend the operator Deployment's progressDeadlineSeconds (default 10m).
- name: Install kube-prometheus-stack (apply without Helm wait)
  # The helm invocation is always reported as a change.
  changed_when: true
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      # Release name, then chart reference.
      - kube-prometheus
      - prometheus-community/kube-prometheus-stack
      - -n
      - monitoring
      - --version
      - "82.15.1"
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml"
      - --force-conflicts
      # Return as soon as the manifests are submitted; readiness is handled by later tasks.
      - --wait=false
- name: Wait for prometheus-operator Deployment object
  # Read-only poll: repeat `kubectl get` until it exits 0, i.e. the Deployment object exists.
  register: noble_kube_prom_operator_deploy
  until: noble_kube_prom_operator_deploy.rc == 0
  retries: "{{ noble_platform_kube_prometheus_operator_wait_retries | int }}"
  delay: "{{ noble_platform_kube_prometheus_operator_wait_delay | int }}"
  changed_when: false
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - deployment/kube-prometheus-kube-prome-operator
      - -n
      - monitoring
# Merge-patches spec.progressDeadlineSeconds on the prometheus-operator Deployment with
# noble_platform_kube_prometheus_operator_progress_deadline_seconds (cast to int, rendered as JSON).
- name: Extend prometheus-operator Deployment progress deadline
  ansible.builtin.command:
    argv:
      - kubectl
      - patch
      - deployment/kube-prometheus-kube-prome-operator
      - -n
      - monitoring
      - --type=merge
      - -p
      - "{{ {'spec': {'progressDeadlineSeconds': (noble_platform_kube_prometheus_operator_progress_deadline_seconds | int)}} | to_json }}"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  register: noble_kube_prom_operator_patch
  # kubectl prints "... patched (no change)" when the Deployment already carries this value;
  # only report a change when the object was actually modified, so re-runs stay idempotent.
  changed_when: "'(no change)' not in noble_kube_prom_operator_patch.stdout"
- name: Restart prometheus-operator if Deployment already hit progress deadline
  register: noble_kube_prom_operator_restart
  # The script echoes "restarted" only when it triggered a rollout restart.
  changed_when: "'restarted' in noble_kube_prom_operator_restart.stdout"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  args:
    executable: /bin/bash
  ansible.builtin.shell: |
    set -euo pipefail
    operator=kube-prometheus-kube-prome-operator
    # Reason and message of the Progressing condition; a failed lookup yields an empty string.
    cond_reason=$(kubectl get deployment "$operator" -n monitoring -o jsonpath='{.status.conditions[?(@.type=="Progressing")].reason}' 2>/dev/null || true)
    cond_message=$(kubectl get deployment "$operator" -n monitoring -o jsonpath='{.status.conditions[?(@.type=="Progressing")].message}' 2>/dev/null || true)
    # Leave the Deployment alone unless the rollout is flagged as past its progress deadline.
    if ! printf '%s' "${cond_reason}${cond_message}" | grep -qiE 'ProgressDeadlineExceeded|progress[[:space:]]*deadline[[:space:]]*exceeded'; then
      exit 0
    fi
    kubectl rollout restart deployment/"$operator" -n monitoring
    echo restarted
# Helm --wait prints nothing until done or timeout; override noble_platform_kube_prometheus_helm_wait_timeout if needed.
- name: Install kube-prometheus-stack (Helm wait for full release; often 30-60m silent - watch kubectl -n monitoring get pods,ds,pvc)
ansible.builtin.command:
argv:
- helm
@@ -55,7 +126,7 @@
- --force-conflicts
- --wait
- --timeout
- 30m
- "{{ noble_platform_kube_prometheus_helm_wait_timeout }}"
environment:
KUBECONFIG: "{{ noble_kubeconfig }}"
changed_when: true

View File

@@ -0,0 +1,3 @@
---
# Chart version handed to `helm upgrade --install ... aqua/trivy-operator --version`.
noble_trivy_chart_version: "0.32.1"
# Value of Helm's --timeout for the Trivy Operator install, which runs with --wait.
noble_helm_trivy_wait_timeout: 15m

View File

@@ -0,0 +1,33 @@
---
- name: Apply trivy-system namespace (PSA)
  # `kubectl apply` of the namespace manifest from the repo; always reported as changed.
  changed_when: true
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - kubectl
      - apply
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/trivy/namespace.yaml"
- name: Install Trivy Operator
  # Always reported as changed, whatever Helm did.
  changed_when: true
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      # Release name, then chart reference.
      - trivy-operator
      - aqua/trivy-operator
      - -n
      - trivy-system
      # Chart version comes from noble_trivy_chart_version.
      - --version
      - "{{ noble_trivy_chart_version }}"
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/trivy/values.yaml"
      - --force-conflicts
      # Block until the release is ready or noble_helm_trivy_wait_timeout elapses.
      - --wait
      - --timeout
      - "{{ noble_helm_trivy_wait_timeout }}"