Update CLUSTER-BUILD.md to include kube-prometheus-stack Helm chart details, enhance observability phase with Grafana ingress configuration, and clarify deployment instructions for monitoring components. Mark tasks as completed for kube-prometheus-stack installation and PVC binding on Longhorn.

2026-03-28 00:28:54 -04:00
parent fd4afef992
commit 7caba0d90c
3 changed files with 91 additions and 3 deletions
--- a/clusters/noble/apps/kube-prometheus-stack/values.yaml
+++ b/clusters/noble/apps/kube-prometheus-stack/values.yaml
@@ -0,0 +1,72 @@
+# kube-prometheus-stack — noble lab (Prometheus Operator + Grafana + Alertmanager + exporters)
+#
+# Chart: prometheus-community/kube-prometheus-stack — pin version on install (e.g. 82.15.1).
+#
+# Install (use one terminal; chain with && so `helm upgrade` always runs after `helm repo update`):
+#
+#   kubectl apply -f clusters/noble/apps/kube-prometheus-stack/namespace.yaml
+#   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+#   helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
+#     --version 82.15.1 -f clusters/noble/apps/kube-prometheus-stack/values.yaml --wait --timeout 30m
+#
+# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
+# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
+# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
+# To apply values changes without blocking: rerun the helm upgrade above without --wait, then watch with kubectl -n monitoring get pods -w
+#
+# Grafana admin login: keys `admin-user` / `admin-password` in Secret `kube-prometheus-grafana` (namespace monitoring); set grafana.adminPassword to choose the password yourself.
+
+# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
+alertmanager:  # Alertmanager: persist state on a Longhorn volume instead of the chart's emptyDir default
+  alertmanagerSpec:
+    storage:
+      volumeClaimTemplate:  # PVC template for the Alertmanager data volume
+        spec:
+          storageClassName: longhorn
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 5Gi  # requested volume size
+
+prometheus:  # Prometheus: retention limits plus a Longhorn-backed PVC for the TSDB
+  prometheusSpec:
+    retention: 15d  # time-based retention limit
+    retentionSize: 25GB  # size-based retention limit; below the 30Gi PVC request further down (presumably headroom — confirm)
+    storageSpec:
+      volumeClaimTemplate:  # PVC template for the Prometheus data volume
+        spec:
+          storageClassName: longhorn
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 30Gi  # requested volume size; keep above retentionSize
+
+grafana:  # Grafana: Longhorn persistence, Traefik ingress with TLS, and matching server URL settings
+  persistence:
+    enabled: true
+    type: sts  # NOTE(review): presumably selects StatefulSet-based persistence in the Grafana chart — confirm
+    storageClassName: longhorn
+    accessModes:
+      - ReadWriteOnce
+    size: 10Gi
+
+  # HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod; same pattern as other *.apps.noble.lab.pcenicni.dev hosts).
+  # DNS: grafana.apps.noble.lab.pcenicni.dev → Traefik LoadBalancer (192.168.50.211) — see clusters/noble/apps/traefik/values.yaml
+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    path: /
+    pathType: Prefix
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt-prod  # ClusterIssuer named in the comment above
+    hosts:
+      - grafana.apps.noble.lab.pcenicni.dev
+    tls:
+      - secretName: grafana-apps-noble-tls  # Secret for the TLS certificate; presumably populated by cert-manager — confirm
+        hosts:
+          - grafana.apps.noble.lab.pcenicni.dev  # same host as the ingress rule above
+
+  grafana.ini:
+    server:
+      domain: grafana.apps.noble.lab.pcenicni.dev  # same hostname as the ingress host
+      root_url: https://grafana.apps.noble.lab.pcenicni.dev/  # external HTTPS URL on the ingress host