# kube-prometheus-stack — noble lab (Prometheus Operator + Grafana + Alertmanager + exporters)
#
# Chart: prometheus-community/kube-prometheus-stack — pin the version on install (e.g. 82.15.1).
#
# Install (use one terminal; chain with && so `helm upgrade` always runs after `helm repo update`):
#
#   kubectl apply -f clusters/noble/apps/kube-prometheus-stack/namespace.yaml
#   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
#   helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
#     --version 82.15.1 -f clusters/noble/apps/kube-prometheus-stack/values.yaml --wait --timeout 30m
#
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
# Do not use --timeout 5m for the first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
# To apply manifest changes without blocking: omit --wait, then watch with: kubectl -n monitoring get pods -w
#
# Grafana admin password: Secret `kube-prometheus-grafana`, keys `admin-user` / `admin-password`, unless you set
# grafana.adminPassword.
#
# Use cert-manager for the admission-webhook TLS instead of the chart's Helm pre-hook Jobs (which patch/create a
# Secret): those Jobs are validated by Kyverno before `kyverno-svc` exists during a single Argo sync, which fails.
# Requires the cert-manager CRDs (bootstrap them before this chart).
# Webhook TLS from cert-manager (see header note on Kyverno/Argo ordering).
prometheusOperator:
  admissionWebhooks:
    certManager:
      enabled: true

# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 5Gi
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - alertmanager.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: alertmanager-apps-noble-tls
        hosts:
          - alertmanager.apps.noble.lab.pcenicni.dev

prometheus:
  prometheusSpec:
    # Time- and size-based retention; size cap keeps the PVC from filling.
    retention: 15d
    retentionSize: 25GB
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 30Gi
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - prometheus.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: prometheus-apps-noble-tls
        hosts:
          - prometheus.apps.noble.lab.pcenicni.dev

grafana:
  persistence:
    enabled: true
    type: sts
    storageClassName: longhorn
    accessModes:
      - ReadWriteOnce
    size: 10Gi
  # HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod; same pattern as the other
  # *.apps.noble.lab.pcenicni.dev hosts).
  # DNS: grafana.apps.noble.lab.pcenicni.dev → Traefik LoadBalancer (192.168.50.211) —
  # see clusters/noble/apps/traefik/values.yaml
  ingress:
    enabled: true
    ingressClassName: traefik
    path: /
    pathType: Prefix
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - grafana.apps.noble.lab.pcenicni.dev
    tls:
      - secretName: grafana-apps-noble-tls
        hosts:
          - grafana.apps.noble.lab.pcenicni.dev
  grafana.ini:
    server:
      domain: grafana.apps.noble.lab.pcenicni.dev
      root_url: https://grafana.apps.noble.lab.pcenicni.dev/
      # Traefik sets X-Forwarded-*; required for correct redirects and cookies behind the ingress.
      use_proxy_headers: true

# Loki datasource: apply `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml`
# (sidecar ConfigMap) instead of additionalDataSources here.