# kube-prometheus-stack — noble lab (Prometheus Operator + Grafana + Alertmanager + exporters)
#
# Chart: prometheus-community/kube-prometheus-stack — pin the version on install (e.g. 82.15.1).
#
# Install (use one terminal; chain with && so `helm upgrade` always runs after `helm repo update`):
#
#   kubectl apply -f clusters/noble/apps/kube-prometheus-stack/namespace.yaml
#   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
#   helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
#     --version 82.15.1 -f clusters/noble/apps/kube-prometheus-stack/values.yaml --wait --timeout 30m
#
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
# Do not use --timeout 5m for the first install — Longhorn PVCs + StatefulSets often need 15–30m. To watch progress,
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
# To apply manifest changes without blocking: omit --wait, then watch with: kubectl -n monitoring get pods -w
#
# Grafana admin password: Secret `kube-prometheus-grafana`, keys `admin-user` / `admin-password`, unless you set
# grafana.adminPassword.
#
# Use cert-manager for the admission-webhook TLS instead of the chart's Helm pre-hook Jobs (which patch/create a
# Secret): those Jobs are validated by Kyverno before `kyverno-svc` exists during a single Argo sync, which fails.
# Requires the cert-manager CRDs (bootstrap them before this chart).
# Webhook TLS from cert-manager (see header note on Kyverno/Argo ordering).
prometheusOperator:
  admissionWebhooks:
    certManager:
      enabled: true

# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 5Gi
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - alertmanager.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: alertmanager-apps-noble-tls
        hosts:
          - alertmanager.apps.noble.lab.pcenicni.dev

prometheus:
  prometheusSpec:
    # Time- and size-based retention; size cap keeps the PVC from filling.
    retention: 15d
    retentionSize: 25GB
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 30Gi
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - prometheus.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: prometheus-apps-noble-tls
        hosts:
          - prometheus.apps.noble.lab.pcenicni.dev

grafana:
  persistence:
    enabled: true
    type: sts
    storageClassName: longhorn
    accessModes:
      - ReadWriteOnce
    size: 10Gi
  # HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod; same pattern as the other
  # *.apps.noble.lab.pcenicni.dev hosts).
  # DNS: grafana.apps.noble.lab.pcenicni.dev → Traefik LoadBalancer (192.168.50.211) —
  # see clusters/noble/apps/traefik/values.yaml
  ingress:
    enabled: true
    ingressClassName: traefik
    path: /
    pathType: Prefix
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - grafana.apps.noble.lab.pcenicni.dev
    tls:
      - secretName: grafana-apps-noble-tls
        hosts:
          - grafana.apps.noble.lab.pcenicni.dev
  grafana.ini:
    server:
      domain: grafana.apps.noble.lab.pcenicni.dev
      root_url: https://grafana.apps.noble.lab.pcenicni.dev/
      # Traefik sets X-Forwarded-*; required for correct redirects and cookies behind the ingress.
      use_proxy_headers: true

# Loki datasource: apply `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml`
# (sidecar ConfigMap) instead of additionalDataSources here.