Update CLUSTER-BUILD.md to include kube-prometheus-stack Helm chart details, enhance observability phase with Grafana ingress configuration, and clarify deployment instructions for monitoring components. Mark tasks as completed for kube-prometheus-stack installation and PVC binding on Longhorn.

This commit is contained in:
Nikholas Pcenicni
2026-03-28 00:28:54 -04:00
parent fd4afef992
commit 7caba0d90c
3 changed files with 91 additions and 3 deletions

View File

@@ -0,0 +1,72 @@
# kube-prometheus-stack — noble lab (Prometheus Operator + Grafana + Alertmanager + exporters)
#
# Chart: prometheus-community/kube-prometheus-stack — pin version on install (e.g. 82.15.1).
#
# Install (use one terminal; chain with && so `helm upgrade` always runs after `helm repo update`):
#
# kubectl apply -f clusters/noble/apps/kube-prometheus-stack/namespace.yaml
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
# --version 82.15.1 -f clusters/noble/apps/kube-prometheus-stack/values.yaml --wait --timeout 30m
#
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 1530m. To watch progress,
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
# To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
#
# Grafana admin password: Secret `kube-prometheus-grafana` keys `admin-user` / `admin-password` unless you set grafana.adminPassword.
# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
alertmanager:
alertmanagerSpec:
storage:
volumeClaimTemplate:
spec:
storageClassName: longhorn
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 5Gi
prometheus:
prometheusSpec:
retention: 15d
retentionSize: 25GB
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: longhorn
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 30Gi
grafana:
persistence:
enabled: true
type: sts
storageClassName: longhorn
accessModes:
- ReadWriteOnce
size: 10Gi
# HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod; same pattern as other *.apps.noble.lab.pcenicni.dev hosts).
# DNS: grafana.apps.noble.lab.pcenicni.dev → Traefik LoadBalancer (192.168.50.211) — see clusters/noble/apps/traefik/values.yaml
ingress:
enabled: true
ingressClassName: traefik
path: /
pathType: Prefix
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
hosts:
- grafana.apps.noble.lab.pcenicni.dev
tls:
- secretName: grafana-apps-noble-tls
hosts:
- grafana.apps.noble.lab.pcenicni.dev
grafana.ini:
server:
domain: grafana.apps.noble.lab.pcenicni.dev
root_url: https://grafana.apps.noble.lab.pcenicni.dev/