# home-server/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml

# kube-prometheus-stack — noble lab (Prometheus Operator + Grafana + Alertmanager + exporters)
#
# Chart: prometheus-community/kube-prometheus-stack — pin version on install (e.g. 85.0.3).
#
# Install (use one terminal; chain with && so `helm upgrade` always runs after `helm repo update`):
#
#   kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml
#   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
#   helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
#     --version 85.0.3 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m
#
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress,
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
# To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
#
# Grafana admin password: Secret `kube-prometheus-grafana` keys `admin-user` / `admin-password` unless you set grafana.adminPassword.

# Use cert-manager for admission webhook TLS instead of Helm pre-hook Jobs (patch/create Secret).
# Within a single Argo sync those Jobs are submitted before `kyverno-svc` exists, so Kyverno's
# validation of them fails and the sync breaks.
# Requires cert-manager CRDs (bootstrap before this chart).
prometheusOperator:
  admissionWebhooks:
    certManager:
      # Let cert-manager issue the webhook certificate (replaces the hook Jobs described above).
      enabled: true

# CRDs + Argo CD: `helm.skipCrds: true` on the Argo Application stops Argo from rendering the chart CRDs
# with `--include-crds` (client-side apply would overflow the last-applied-configuration annotation on
# huge CRDs).
# Ref: https://github.com/argoproj/argo-cd/issues/11269
#
# Do NOT enable `crds.upgradeJob` while this release is Argo-managed: the hook creates ConfigMap
# `kube-prometheus-crds-upgrade`, whose binaryData is enormous, and Argo's client-side apply hits the
# same annotation size limit on that object. Keep the job off; upgrade the Prometheus Operator CRDs when
# you bump the chart via Ansible `helm upgrade` (or the chart's manual CRD steps), not via Argo sync.
crds:
  upgradeJob:
    # Must stay false for as long as Argo CD manages this release (see above).
    enabled: false

# Subchart defaults only tolerate NoSchedule; a node with other taints leaves node-exporter at 3/4 and Helm --wait times out.
prometheus-node-exporter:
  tolerations:
    # `operator: Exists` with no key or effect tolerates every taint, so the DaemonSet can run on all nodes.
    - operator: Exists

# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
alertmanager:
  alertmanagerSpec:
    # Advertise the HTTPS ingress URL explicitly. When externalUrl is unset and the ingress is enabled,
    # the chart derives it from the first ingress host with an http:// scheme, so links in notifications
    # would not match the TLS-only endpoint. Keep in sync with ingress.hosts below.
    externalUrl: https://alertmanager.apps.noble.lab.pcenicni.dev/
    # Alertmanager data lives on a 5Gi Longhorn volume instead of the chart's default emptyDir.
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 5Gi
  # HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod).
  # NOTE(review): no authentication is configured for this ingress in this file — confirm the host is
  # only reachable from trusted networks or is protected elsewhere (e.g. a Traefik middleware).
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - alertmanager.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: alertmanager-apps-noble-tls
        hosts:
          - alertmanager.apps.noble.lab.pcenicni.dev

prometheus:
  prometheusSpec:
    # Advertise the HTTPS ingress URL explicitly. When externalUrl is unset and the ingress is enabled,
    # the chart derives it from the first ingress host with an http:// scheme, so generator links in
    # alerts would not match the TLS-only endpoint. Keep in sync with ingress.hosts below.
    externalUrl: https://prometheus.apps.noble.lab.pcenicni.dev/
    # Data is dropped once it is older than 15 days or the TSDB exceeds 25GB, whichever comes first.
    # The size cap is deliberately below the 30Gi volume request to leave headroom on the disk.
    retention: 15d
    retentionSize: 25GB
    # TSDB lives on a 30Gi Longhorn volume instead of the chart's default emptyDir.
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 30Gi
  # HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod).
  # NOTE(review): no authentication is configured for this ingress in this file — confirm the host is
  # only reachable from trusted networks or is protected elsewhere (e.g. a Traefik middleware).
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - prometheus.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: prometheus-apps-noble-tls
        hosts:
          - prometheus.apps.noble.lab.pcenicni.dev

grafana:
  # Grafana runs as a StatefulSet (type: sts) with a 10Gi Longhorn-backed volume.
  persistence:
    type: sts
    enabled: true
    size: 10Gi
    storageClassName: longhorn
    accessModes:
      - ReadWriteOnce

  # Exposed over HTTPS by Traefik, with the certificate issued by cert-manager (ClusterIssuer
  # letsencrypt-prod) — the same pattern as the other *.apps.noble.lab.pcenicni.dev hosts.
  # DNS: grafana.apps.noble.lab.pcenicni.dev → Traefik LoadBalancer (192.168.50.211) — see clusters/noble/bootstrap/traefik/values.yaml
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - grafana.apps.noble.lab.pcenicni.dev
    path: /
    pathType: Prefix
    tls:
      - hosts:
          - grafana.apps.noble.lab.pcenicni.dev
        secretName: grafana-apps-noble-tls

  # Public URL settings; the host must match the ingress host above.
  grafana.ini:
    server:
      root_url: https://grafana.apps.noble.lab.pcenicni.dev/
      domain: grafana.apps.noble.lab.pcenicni.dev
      # Intended to make Grafana honour the X-Forwarded-* headers set by Traefik (redirects, cookies).
      # NOTE(review): confirm `use_proxy_headers` is a recognised [server] option for the deployed
      # Grafana version.
      use_proxy_headers: true

  # Loki datasource: not declared via additionalDataSources here; apply the sidecar ConfigMap in
  # `clusters/noble/bootstrap/grafana-loki-datasource/loki-datasource.yaml` instead.