---
# kube-prometheus-stack — noble lab (Prometheus Operator + Grafana + Alertmanager + exporters)
|
||
#
|
||
# Chart: prometheus-community/kube-prometheus-stack — pin version on install (e.g. 85.0.3).
|
||
#
|
||
# Install (use one terminal; chain with && so `helm upgrade` always runs after `helm repo update`):
|
||
#
|
||
# kubectl apply -f clusters/noble/bootstrap/kube-prometheus-stack/namespace.yaml
|
||
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
||
# helm repo update && helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack -n monitoring \
|
||
# --version 85.0.3 -f clusters/noble/bootstrap/kube-prometheus-stack/values.yaml --wait --timeout 60m
|
||
#
|
||
# Why it looks "stalled": with --wait, Helm prints almost nothing until the release finishes (can be many minutes).
|
||
# Do not use --timeout 5m for first install — Longhorn PVCs + StatefulSets often need 30–60m. To watch progress,
|
||
# open a second terminal: kubectl -n monitoring get pods,sts,ds -w
|
||
# To apply manifest changes without blocking: omit --wait, then kubectl -n monitoring get pods -w
|
||
#
|
||
# Grafana admin password: Secret `kube-prometheus-grafana` keys `admin-user` / `admin-password` unless you set grafana.adminPassword.
|
||
|
||
# Use cert-manager for admission webhook TLS instead of Helm pre-hook Jobs (patch/create Secret).
|
||
# Those Jobs are validated by Kyverno before `kyverno-svc` exists during a single Argo sync, which fails.
|
||
# Requires cert-manager CRDs (bootstrap before this chart).
|
||
# Prometheus Operator admission webhook: TLS material comes from cert-manager
# (certManager.enabled) rather than from the chart's Helm hook Jobs.
prometheusOperator:
  admissionWebhooks:
    certManager:
      enabled: true
|
||
|
||
# CRDs + Argo CD: **`helm.skipCrds: true`** on the Argo Application avoids Argo rendering chart CRDs with
|
||
# **`--include-crds`** (client-side apply would overflow **last-applied-configuration** on huge CRDs).
|
||
# Ref: https://github.com/argoproj/argo-cd/issues/11269
|
||
#
|
||
# Do **not** enable **`crds.upgradeJob`** while this release is Argo-managed: the hook creates ConfigMap
|
||
# **`kube-prometheus-crds-upgrade`** whose **binaryData** is enormous; Argo client-side apply hits the same
|
||
# annotation size limit on that object. Keep the job **off**; upgrade Prometheus Operator CRDs when you bump
|
||
# the chart via **Ansible `helm upgrade`** (or the chart’s manual CRD steps), not via Argo sync.
|
||
# The chart's CRD upgrade hook Job stays disabled.
crds:
  upgradeJob:
    enabled: false
|
||
|
||
# Subchart defaults only tolerate NoSchedule; a node with other taints leaves node-exporter at 3/4 and Helm --wait times out.
|
||
# node-exporter subchart overrides.
prometheus-node-exporter:
  tolerations:
    # `operator: Exists` with no key or effect tolerates every taint, so the
    # DaemonSet pod can be scheduled on tainted nodes as well.
    - operator: Exists
|
||
|
||
# --- Longhorn-backed persistence (default chart storage is emptyDir) ---
|
||
# Alertmanager: Longhorn-backed PVC plus a Traefik ingress with a
# cert-manager-issued certificate.
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 5Gi
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      # cert-manager issues the certificate stored in the tls secret below.
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - alertmanager.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: alertmanager-apps-noble-tls
        hosts:
          - alertmanager.apps.noble.lab.pcenicni.dev
|
||
|
||
# Prometheus: retention limits, Longhorn-backed PVC, and a Traefik ingress with
# a cert-manager-issued certificate.
prometheus:
  prometheusSpec:
    # Time-based and size-based retention; the 25GB size cap is below the 30Gi
    # volume requested in storageSpec.
    retention: 15d
    retentionSize: 25GB
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: longhorn
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 30Gi
  ingress:
    enabled: true
    ingressClassName: traefik
    annotations:
      # cert-manager issues the certificate stored in the tls secret below.
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - prometheus.apps.noble.lab.pcenicni.dev
    paths:
      - /
    pathType: Prefix
    tls:
      - secretName: prometheus-apps-noble-tls
        hosts:
          - prometheus.apps.noble.lab.pcenicni.dev
|
||
|
||
# Grafana subchart: Longhorn-backed persistence, Traefik ingress, and server
# settings matching the public hostname.
grafana:
  persistence:
    enabled: true
    # `sts` persistence type with a 10Gi ReadWriteOnce Longhorn volume.
    type: sts
    storageClassName: longhorn
    accessModes:
      - ReadWriteOnce
    size: 10Gi

  # HTTPS via Traefik + cert-manager (ClusterIssuer letsencrypt-prod; same pattern as other *.apps.noble.lab.pcenicni.dev hosts).
  # DNS: grafana.apps.noble.lab.pcenicni.dev → Traefik LoadBalancer (192.168.50.211) — see clusters/noble/bootstrap/traefik/values.yaml
  ingress:
    enabled: true
    ingressClassName: traefik
    path: /
    pathType: Prefix
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
    hosts:
      - grafana.apps.noble.lab.pcenicni.dev
    tls:
      - secretName: grafana-apps-noble-tls
        hosts:
          - grafana.apps.noble.lab.pcenicni.dev

  grafana.ini:
    server:
      # domain and root_url use the same hostname as the ingress above.
      domain: grafana.apps.noble.lab.pcenicni.dev
      root_url: https://grafana.apps.noble.lab.pcenicni.dev/
      # Traefik sets X-Forwarded-*; required for correct redirects and cookies behind the ingress.
      # NOTE(review): confirm `use_proxy_headers` is a recognized [server] option
      # in the deployed Grafana version.
      use_proxy_headers: true
|
||
|
||
# Loki datasource: apply `clusters/noble/bootstrap/grafana-loki-datasource/loki-datasource.yaml` (sidecar ConfigMap) instead of additionalDataSources here.
|