Update kube-prometheus-stack values.yaml to clarify Loki datasource configuration and enhance observability documentation in CLUSTER-BUILD.md. Include deployment instructions for Loki and Fluent Bit, and mark tasks related to Grafana integration as completed.

2026-03-28 00:56:49 -04:00
parent 7caba0d90c
commit 2b4f568632
7 changed files with 176 additions and 3 deletions
--- a/clusters/noble/apps/fluent-bit/namespace.yaml
+++ b/clusters/noble/apps/fluent-bit/namespace.yaml
@@ -0,0 +1,10 @@
+# Namespace for Fluent Bit (tails container logs → Loki); create it before the Helm install.
+# Pod Security is privileged because hostPath mounts under /var/log require it (same idea as monitoring/node-exporter).
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: logging
+  labels:
+    pod-security.kubernetes.io/warn: privileged
+    pod-security.kubernetes.io/audit: privileged
+    pod-security.kubernetes.io/enforce: privileged
--- a/clusters/noble/apps/fluent-bit/values.yaml
+++ b/clusters/noble/apps/fluent-bit/values.yaml
@@ -0,0 +1,45 @@
+# Fluent Bit — noble lab (DaemonSet; ship Kubernetes container logs to Loki gateway).
+#
+# Chart: fluent/fluent-bit — pin version on install (e.g. 0.56.0).
+# Install **after** Loki so `loki-gateway.loki.svc` exists.
+#
+# Talos: only **tail** `/var/log/containers` (no host **systemd** input — journal layout differs from typical Linux).
+#
+# kubectl apply -f clusters/noble/apps/fluent-bit/namespace.yaml
+# helm repo add fluent https://fluent.github.io/helm-charts
+# helm repo update
+# helm upgrade --install fluent-bit fluent/fluent-bit -n logging \
+#   --version 0.56.0 -f clusters/noble/apps/fluent-bit/values.yaml --wait --timeout 15m
+
+config:
+  # Tail container log files; the kube.* tag routes records to the filter and output below.
+  inputs: |
+    [INPUT]
+        Name tail
+        Path /var/log/containers/*.log
+        multiline.parser docker, cri
+        Tag kube.*
+        Mem_Buf_Limit 5MB
+        Skip_Long_Lines On
+
+  # Enrich records with pod metadata under the `kubernetes` key (used for the Loki labels below).
+  filters: |
+    [FILTER]
+        Name kubernetes
+        Match kube.*
+        Merge_Log On
+        Keep_Log Off
+        K8S-Logging.Parser On
+        K8S-Logging.Exclude On
+
+  # Push to the Loki gateway over plain HTTP inside the cluster.
+  # Labels: keep the static job label and add namespace/pod/container from the kubernetes
+  # filter, so logs can be selected per workload instead of landing in one cluster-wide stream.
+  outputs: |
+    [OUTPUT]
+        Name loki
+        Match kube.*
+        Host loki-gateway.loki.svc.cluster.local
+        Port 80
+        tls Off
+        labels job=fluent-bit, namespace=$kubernetes['namespace_name'], pod=$kubernetes['pod_name'], container=$kubernetes['container_name']
--- a/clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml
+++ b/clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml
@@ -0,0 +1,27 @@
+# Loki datasource for Grafana, delivered as a sidecar-loaded ConfigMap in **monitoring**
+# (the same namespace as the kube-prometheus Grafana). The Grafana sidecar watches ConfigMaps
+# labeled **grafana_datasource: '1'** and loads their YAML keys as files, so the
+# kube-prometheus-stack Helm release does not need to be edited.
+#
+# Apply:  kubectl apply -f clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml
+# Remove: kubectl delete -f clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: monitoring
+  name: grafana-datasource-loki
+  labels:
+    grafana_datasource: '1'
+data:
+  loki.yaml: |
+    apiVersion: 1
+    datasources:
+      - name: Loki
+        type: loki
+        uid: loki
+        access: proxy
+        url: http://loki-gateway.loki.svc.cluster.local:80
+        isDefault: false
+        editable: false
+        jsonData:
+          maxLines: 1000
--- a/clusters/noble/apps/kube-prometheus-stack/values.yaml
+++ b/clusters/noble/apps/kube-prometheus-stack/values.yaml
@@ -70,3 +70,5 @@ grafana:
    server:
      domain: grafana.apps.noble.lab.pcenicni.dev
      root_url: https://grafana.apps.noble.lab.pcenicni.dev/
+
+  # Loki datasource: apply `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml` (sidecar ConfigMap) instead of additionalDataSources here.
--- a/clusters/noble/apps/loki/namespace.yaml
+++ b/clusters/noble/apps/loki/namespace.yaml
@@ -0,0 +1,9 @@
+# Namespace for Loki (SingleBinary + filesystem on Longhorn); create it before the Helm install.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: loki
+  labels:
+    pod-security.kubernetes.io/warn: baseline
+    pod-security.kubernetes.io/audit: baseline
+    pod-security.kubernetes.io/enforce: baseline
--- a/clusters/noble/apps/loki/values.yaml
+++ b/clusters/noble/apps/loki/values.yaml
@@ -0,0 +1,82 @@
+# Grafana Loki for the noble lab: SingleBinary mode, filesystem storage on Longhorn (no MinIO/S3).
+#
+# Chart: grafana/loki — pin the version on install (e.g. 6.55.0).
+#
+# kubectl apply -f clusters/noble/apps/loki/namespace.yaml
+# helm repo add grafana https://grafana.github.io/helm-charts
+# helm repo update
+# helm upgrade --install loki grafana/loki -n loki \
+#   --version 6.55.0 -f clusters/noble/apps/loki/values.yaml --wait --timeout 30m
+#
+# Grafana queries and Fluent Bit pushes both use: http://loki-gateway.loki.svc.cluster.local:80
+
+deploymentMode: SingleBinary
+
+loki:
+  # The chart default (auth_enabled: true) requires X-Scope-OrgID on every query/push, which
+  # breaks Grafana and Fluent Bit in this single-tenant lab — so it is switched off.
+  auth_enabled: false
+  storage:
+    type: filesystem
+  schemaConfig:
+    configs:
+      - from: '2024-04-01'
+        store: tsdb
+        object_store: filesystem
+        schema: v13
+        index:
+          prefix: loki_index_
+          period: 24h
+  commonConfig:
+    replication_factor: 1
+  limits_config:
+    allow_structured_metadata: true
+    volume_enabled: true
+  pattern_ingester:
+    enabled: false
+
+# Single Loki replica with its data on a Longhorn-backed volume.
+# NOTE(review): no retention settings are visible in this file — confirm how the 30Gi volume is kept from filling up.
+singleBinary:
+  replicas: 1
+  persistence:
+    enabled: true
+    storageClass: longhorn
+    size: 30Gi
+
+# Scale every other component workload to zero (SingleBinary mode).
+backend:
+  replicas: 0
+read:
+  replicas: 0
+write:
+  replicas: 0
+bloomCompactor:
+  replicas: 0
+bloomGateway:
+  replicas: 0
+compactor:
+  replicas: 0
+distributor:
+  replicas: 0
+indexGateway:
+  replicas: 0
+ingester:
+  replicas: 0
+querier:
+  replicas: 0
+queryFrontend:
+  replicas: 0
+queryScheduler:
+  replicas: 0
+
+gateway:
+  enabled: true
+
+minio:
+  enabled: false
+
+# Memcached chunk cache is disabled: the chart default is ~8Gi RAM requests, and even 512Mi can stay Pending on small clusters (affinity).
+# For a homelab Loki works without it; queries may be slightly slower under load.
+chunksCache:
+  enabled: false