Update kube-prometheus-stack values.yaml to clarify Loki datasource configuration and enhance observability documentation in CLUSTER-BUILD.md. Include deployment instructions for Loki and Fluent Bit, and mark tasks related to Grafana integration as completed.
This commit is contained in:
10
clusters/noble/apps/fluent-bit/namespace.yaml
Normal file
10
clusters/noble/apps/fluent-bit/namespace.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Fluent Bit (tail container logs → Loki) — apply before Helm.
|
||||||
|
# HostPath mounts under /var/log require PSA privileged (same idea as monitoring/node-exporter).
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: logging
|
||||||
|
labels:
|
||||||
|
pod-security.kubernetes.io/enforce: privileged
|
||||||
|
pod-security.kubernetes.io/audit: privileged
|
||||||
|
pod-security.kubernetes.io/warn: privileged
|
||||||
40
clusters/noble/apps/fluent-bit/values.yaml
Normal file
40
clusters/noble/apps/fluent-bit/values.yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Fluent Bit — noble lab (DaemonSet; ship Kubernetes container logs to Loki gateway).
|
||||||
|
#
|
||||||
|
# Chart: fluent/fluent-bit — pin version on install (e.g. 0.56.0).
|
||||||
|
# Install **after** Loki so `loki-gateway.loki.svc` exists.
|
||||||
|
#
|
||||||
|
# Talos: only **tail** `/var/log/containers` (no host **systemd** input — journal layout differs from typical Linux).
|
||||||
|
#
|
||||||
|
# kubectl apply -f clusters/noble/apps/fluent-bit/namespace.yaml
|
||||||
|
# helm repo add fluent https://fluent.github.io/helm-charts
|
||||||
|
# helm repo update
|
||||||
|
# helm upgrade --install fluent-bit fluent/fluent-bit -n logging \
|
||||||
|
# --version 0.56.0 -f clusters/noble/apps/fluent-bit/values.yaml --wait --timeout 15m
|
||||||
|
|
||||||
|
config:
|
||||||
|
inputs: |
|
||||||
|
[INPUT]
|
||||||
|
Name tail
|
||||||
|
Path /var/log/containers/*.log
|
||||||
|
multiline.parser docker, cri
|
||||||
|
Tag kube.*
|
||||||
|
Mem_Buf_Limit 5MB
|
||||||
|
Skip_Long_Lines On
|
||||||
|
|
||||||
|
filters: |
|
||||||
|
[FILTER]
|
||||||
|
Name kubernetes
|
||||||
|
Match kube.*
|
||||||
|
Merge_Log On
|
||||||
|
Keep_Log Off
|
||||||
|
K8S-Logging.Parser On
|
||||||
|
K8S-Logging.Exclude On
|
||||||
|
|
||||||
|
outputs: |
|
||||||
|
[OUTPUT]
|
||||||
|
Name loki
|
||||||
|
Match kube.*
|
||||||
|
Host loki-gateway.loki.svc.cluster.local
|
||||||
|
Port 80
|
||||||
|
tls Off
|
||||||
|
labels job=fluent-bit
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
# Extra Grafana datasource — apply to **monitoring** (same namespace as kube-prometheus Grafana).
|
||||||
|
# The Grafana sidecar watches ConfigMaps labeled **grafana_datasource: "1"** and loads YAML keys as files.
|
||||||
|
# Does not require editing the kube-prometheus-stack Helm release.
|
||||||
|
#
|
||||||
|
# kubectl apply -f clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml
|
||||||
|
#
|
||||||
|
# Remove with: kubectl delete -f clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: grafana-datasource-loki
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
grafana_datasource: "1"
|
||||||
|
data:
|
||||||
|
loki.yaml: |
|
||||||
|
apiVersion: 1
|
||||||
|
datasources:
|
||||||
|
- name: Loki
|
||||||
|
type: loki
|
||||||
|
uid: loki
|
||||||
|
access: proxy
|
||||||
|
url: http://loki-gateway.loki.svc.cluster.local:80
|
||||||
|
isDefault: false
|
||||||
|
editable: false
|
||||||
|
jsonData:
|
||||||
|
maxLines: 1000
|
||||||
@@ -70,3 +70,5 @@ grafana:
|
|||||||
server:
|
server:
|
||||||
domain: grafana.apps.noble.lab.pcenicni.dev
|
domain: grafana.apps.noble.lab.pcenicni.dev
|
||||||
root_url: https://grafana.apps.noble.lab.pcenicni.dev/
|
root_url: https://grafana.apps.noble.lab.pcenicni.dev/
|
||||||
|
|
||||||
|
# Loki datasource: apply `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml` (sidecar ConfigMap) instead of additionalDataSources here.
|
||||||
|
|||||||
9
clusters/noble/apps/loki/namespace.yaml
Normal file
9
clusters/noble/apps/loki/namespace.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Loki (SingleBinary + filesystem on Longhorn) — apply before Helm.
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: loki
|
||||||
|
labels:
|
||||||
|
pod-security.kubernetes.io/enforce: baseline
|
||||||
|
pod-security.kubernetes.io/audit: baseline
|
||||||
|
pod-security.kubernetes.io/warn: baseline
|
||||||
78
clusters/noble/apps/loki/values.yaml
Normal file
78
clusters/noble/apps/loki/values.yaml
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# Grafana Loki — noble lab (SingleBinary, filesystem on Longhorn; no MinIO/S3).
|
||||||
|
#
|
||||||
|
# Chart: grafana/loki — pin version on install (e.g. 6.55.0).
|
||||||
|
#
|
||||||
|
# kubectl apply -f clusters/noble/apps/loki/namespace.yaml
|
||||||
|
# helm repo add grafana https://grafana.github.io/helm-charts
|
||||||
|
# helm repo update
|
||||||
|
# helm upgrade --install loki grafana/loki -n loki \
|
||||||
|
# --version 6.55.0 -f clusters/noble/apps/loki/values.yaml --wait --timeout 30m
|
||||||
|
#
|
||||||
|
# Query/push URL for Grafana + Fluent Bit: http://loki-gateway.loki.svc.cluster.local:80
|
||||||
|
|
||||||
|
deploymentMode: SingleBinary
|
||||||
|
|
||||||
|
loki:
|
||||||
|
# Single-tenant lab: chart default auth_enabled: true requires X-Scope-OrgID on every query/push (Grafana + Fluent Bit break).
|
||||||
|
auth_enabled: false
|
||||||
|
commonConfig:
|
||||||
|
replication_factor: 1
|
||||||
|
storage:
|
||||||
|
type: filesystem
|
||||||
|
schemaConfig:
|
||||||
|
configs:
|
||||||
|
- from: "2024-04-01"
|
||||||
|
store: tsdb
|
||||||
|
object_store: filesystem
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: loki_index_
|
||||||
|
period: 24h
|
||||||
|
pattern_ingester:
|
||||||
|
enabled: false
|
||||||
|
limits_config:
|
||||||
|
allow_structured_metadata: true
|
||||||
|
volume_enabled: true
|
||||||
|
|
||||||
|
singleBinary:
|
||||||
|
replicas: 1
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
storageClass: longhorn
|
||||||
|
size: 30Gi
|
||||||
|
|
||||||
|
backend:
|
||||||
|
replicas: 0
|
||||||
|
read:
|
||||||
|
replicas: 0
|
||||||
|
write:
|
||||||
|
replicas: 0
|
||||||
|
ingester:
|
||||||
|
replicas: 0
|
||||||
|
querier:
|
||||||
|
replicas: 0
|
||||||
|
queryFrontend:
|
||||||
|
replicas: 0
|
||||||
|
queryScheduler:
|
||||||
|
replicas: 0
|
||||||
|
distributor:
|
||||||
|
replicas: 0
|
||||||
|
compactor:
|
||||||
|
replicas: 0
|
||||||
|
indexGateway:
|
||||||
|
replicas: 0
|
||||||
|
bloomCompactor:
|
||||||
|
replicas: 0
|
||||||
|
bloomGateway:
|
||||||
|
replicas: 0
|
||||||
|
|
||||||
|
minio:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
gateway:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Memcached chunk cache: chart default is ~8Gi RAM requests; even 512Mi can stay Pending on small clusters (affinity).
|
||||||
|
# Homelab: disable — Loki works without it; queries may be slightly slower under load.
|
||||||
|
chunksCache:
|
||||||
|
enabled: false
|
||||||
@@ -14,8 +14,9 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes)
|
|||||||
- **cert-manager** Helm **v1.20.0** / app **v1.20.0** — `clusters/noble/apps/cert-manager/`; **`ClusterIssuer`** **`letsencrypt-staging`** and **`letsencrypt-prod`** (HTTP-01, ingress class **`traefik`**); ACME email **`certificates@noble.lab.pcenicni.dev`** (edit in manifests if you want a different mailbox).
|
- **cert-manager** Helm **v1.20.0** / app **v1.20.0** — `clusters/noble/apps/cert-manager/`; **`ClusterIssuer`** **`letsencrypt-staging`** and **`letsencrypt-prod`** (HTTP-01, ingress class **`traefik`**); ACME email **`certificates@noble.lab.pcenicni.dev`** (edit in manifests if you want a different mailbox).
|
||||||
- **Newt** Helm **1.2.0** / app **1.10.1** — `clusters/noble/apps/newt/` (**fossorial/newt**); Pangolin site tunnel — **`newt-pangolin-auth`** Secret (**`PANGOLIN_ENDPOINT`**, **`NEWT_ID`**, **`NEWT_SECRET`**). **Public DNS** is **not** automated with ExternalDNS: **CNAME** records at your DNS host per Pangolin’s domain instructions, plus **Integration API** for HTTP resources/targets — see **`clusters/noble/apps/newt/README.md`**. LAN access to Traefik can still use **`*.apps.noble.lab.pcenicni.dev`** → **`192.168.50.211`** (split horizon / local resolver).
|
- **Newt** Helm **1.2.0** / app **1.10.1** — `clusters/noble/apps/newt/` (**fossorial/newt**); Pangolin site tunnel — **`newt-pangolin-auth`** Secret (**`PANGOLIN_ENDPOINT`**, **`NEWT_ID`**, **`NEWT_SECRET`**). **Public DNS** is **not** automated with ExternalDNS: **CNAME** records at your DNS host per Pangolin’s domain instructions, plus **Integration API** for HTTP resources/targets — see **`clusters/noble/apps/newt/README.md`**. LAN access to Traefik can still use **`*.apps.noble.lab.pcenicni.dev`** → **`192.168.50.211`** (split horizon / local resolver).
|
||||||
- **Argo CD** Helm **9.4.17** / app **v3.3.6** — `clusters/noble/bootstrap/argocd/`; **`argocd-server`** **`LoadBalancer`** **`192.168.50.210`**; app-of-apps scaffold under **`bootstrap/argocd/apps/`** (edit **`root-application.yaml`** `repoURL` before applying).
|
- **Argo CD** Helm **9.4.17** / app **v3.3.6** — `clusters/noble/bootstrap/argocd/`; **`argocd-server`** **`LoadBalancer`** **`192.168.50.210`**; app-of-apps scaffold under **`bootstrap/argocd/apps/`** (edit **`root-application.yaml`** `repoURL` before applying).
|
||||||
- **kube-prometheus-stack** — Helm chart **82.15.1** — `clusters/noble/apps/kube-prometheus-stack/` (**namespace** `monitoring`, PSA **privileged** — **node-exporter** needs host mounts); **Longhorn** PVCs for Prometheus, Grafana, Alertmanager. **Grafana Ingress:** **`https://grafana.apps.noble.lab.pcenicni.dev`** (Traefik **`ingressClassName: traefik`**, **`cert-manager.io/cluster-issuer: letsencrypt-prod`**). **`helm upgrade --install` with `--wait` is silent until done** — use **`--timeout 30m`** (not `5m`) and watch **`kubectl -n monitoring get pods -w`** in another terminal. Grafana admin password: Secret **`kube-prometheus-grafana`**, keys **`admin-user`** / **`admin-password`**.
|
- **kube-prometheus-stack** — Helm chart **82.15.1** — `clusters/noble/apps/kube-prometheus-stack/` (**namespace** `monitoring`, PSA **privileged** — **node-exporter** needs host mounts); **Longhorn** PVCs for Prometheus, Grafana, Alertmanager. **Grafana Ingress:** **`https://grafana.apps.noble.lab.pcenicni.dev`** (Traefik **`ingressClassName: traefik`**, **`cert-manager.io/cluster-issuer: letsencrypt-prod`**). **Loki** in Grafana: ConfigMap **`clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml`** (sidecar label **`grafana_datasource`**) — apply after **Loki** is running; does not use **`grafana.additionalDataSources`** on the chart. **`helm upgrade --install` with `--wait` is silent until done** — use **`--timeout 30m`** (not `5m`) and watch **`kubectl -n monitoring get pods -w`** in another terminal. Grafana admin password: Secret **`kube-prometheus-grafana`**, keys **`admin-user`** / **`admin-password`**.
|
||||||
- **Still open:** **Loki** + **Fluent Bit** + Grafana datasource (Phase D).
|
- **Loki** + **Fluent Bit** (manifests in repo) — **`grafana/loki` 6.55.0** SingleBinary + **filesystem** on **Longhorn** (`clusters/noble/apps/loki/`); **`loki.auth_enabled: false`** (single-tenant lab — avoids **`X-Scope-OrgID`** on Grafana/Fluent Bit); **`chunksCache.enabled: false`** (default memcached cache is heavy / often Pending on small nodes). **`fluent/fluent-bit` 0.56.0** tails **`/var/log/containers`** only → **`loki-gateway.loki.svc:80`** (`clusters/noble/apps/fluent-bit/`). **`logging`** namespace PSA **privileged** (hostPath).
|
||||||
|
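- **Still open:** deploy **Loki** → **Fluent Bit** → **`kubectl apply -f clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml`** (Phase D checklist; the Grafana sidecar loads the datasource ConfigMap, so no **kube-prometheus-stack** `helm upgrade` is required).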
|
||||||
|
|
||||||
## Inventory
|
## Inventory
|
||||||
|
|
||||||
@@ -53,6 +54,8 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes)
|
|||||||
- Newt (Fossorial): **1.2.0** (Helm chart; app **1.10.1**)
|
- Newt (Fossorial): **1.2.0** (Helm chart; app **1.10.1**)
|
||||||
- Argo CD: **9.4.17** (Helm chart `argo/argo-cd`; app **v3.3.6**)
|
- Argo CD: **9.4.17** (Helm chart `argo/argo-cd`; app **v3.3.6**)
|
||||||
- kube-prometheus-stack: **82.15.1** (Helm chart `prometheus-community/kube-prometheus-stack`; app **v0.89.x** bundle)
|
- kube-prometheus-stack: **82.15.1** (Helm chart `prometheus-community/kube-prometheus-stack`; app **v0.89.x** bundle)
|
||||||
|
- Loki: **6.55.0** (Helm chart `grafana/loki`; app **3.6.7**)
|
||||||
|
- Fluent Bit: **0.56.0** (Helm chart `fluent/fluent-bit`; app **4.2.3**)
|
||||||
|
|
||||||
## Repo paths (this workspace)
|
## Repo paths (this workspace)
|
||||||
|
|
||||||
@@ -73,6 +76,9 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes)
|
|||||||
| Newt / Pangolin tunnel (Helm) | `clusters/noble/apps/newt/` — `values.yaml`, `namespace.yaml`, `README.md` |
|
| Newt / Pangolin tunnel (Helm) | `clusters/noble/apps/newt/` — `values.yaml`, `namespace.yaml`, `README.md` |
|
||||||
| Argo CD (bootstrap + app-of-apps) | `clusters/noble/bootstrap/argocd/` — `values.yaml`, `root-application.yaml`, `apps/`, `README.md` |
|
| Argo CD (bootstrap + app-of-apps) | `clusters/noble/bootstrap/argocd/` — `values.yaml`, `root-application.yaml`, `apps/`, `README.md` |
|
||||||
| kube-prometheus-stack (Helm values) | `clusters/noble/apps/kube-prometheus-stack/` — `values.yaml`, `namespace.yaml` |
|
| kube-prometheus-stack (Helm values) | `clusters/noble/apps/kube-prometheus-stack/` — `values.yaml`, `namespace.yaml` |
|
||||||
|
| Grafana Loki datasource (ConfigMap; no chart change) | `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml` |
|
||||||
|
| Loki (Helm values) | `clusters/noble/apps/loki/` — `values.yaml`, `namespace.yaml` |
|
||||||
|
| Fluent Bit → Loki (Helm values) | `clusters/noble/apps/fluent-bit/` — `values.yaml`, `namespace.yaml` |
|
||||||
|
|
||||||
**Git vs cluster:** manifests and `talconfig` live in git; **`talhelper genconfig -o out`**, bootstrap, Helm, and `kubectl` run on your LAN. See **`talos/README.md`** for workstation reachability (lab LAN/VPN), **`talosctl kubeconfig`** vs Kubernetes `server:` (VIP vs node IP), and **`--insecure`** only in maintenance.
|
**Git vs cluster:** manifests and `talconfig` live in git; **`talhelper genconfig -o out`**, bootstrap, Helm, and `kubectl` run on your LAN. See **`talos/README.md`** for workstation reachability (lab LAN/VPN), **`talosctl kubeconfig`** vs Kubernetes `server:` (VIP vs node IP), and **`--insecure`** only in maintenance.
|
||||||
|
|
||||||
@@ -82,6 +88,7 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes)
|
|||||||
2. **MetalLB Helm chart** (CRDs + controller) **before** `kubectl apply -k` on the pool manifests.
|
2. **MetalLB Helm chart** (CRDs + controller) **before** `kubectl apply -k` on the pool manifests.
|
||||||
3. **`clusters/noble/apps/metallb/namespace.yaml`** before or merged onto `metallb-system` so Pod Security does not block speaker (see `apps/metallb/README.md`).
|
3. **`clusters/noble/apps/metallb/namespace.yaml`** before or merged onto `metallb-system` so Pod Security does not block speaker (see `apps/metallb/README.md`).
|
||||||
4. **Longhorn:** Talos user volume + extensions in `talconfig.with-longhorn.yaml` (when restored); Helm **`defaultDataPath`** in `clusters/noble/apps/longhorn/values.yaml`.
|
4. **Longhorn:** Talos user volume + extensions in `talconfig.with-longhorn.yaml` (when restored); Helm **`defaultDataPath`** in `clusters/noble/apps/longhorn/values.yaml`.
|
||||||
|
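5. **Loki → Fluent Bit → Grafana:** deploy **Loki** (`loki-gateway` Service) before **Fluent Bit**; apply the **Grafana Loki datasource** ConfigMap (`clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml`) after **Loki** is running — the Grafana sidecar picks it up via the **`grafana_datasource`** label, so no **`helm upgrade`** of **kube-prometheus-stack** is needed.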
|
||||||
|
|
||||||
## Prerequisites (before phases)
|
## Prerequisites (before phases)
|
||||||
|
|
||||||
@@ -127,7 +134,7 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes)
|
|||||||
## Phase D — Observability
|
## Phase D — Observability
|
||||||
|
|
||||||
- [x] **kube-prometheus-stack** — `kubectl apply -f clusters/noble/apps/kube-prometheus-stack/namespace.yaml` then **`helm upgrade --install`** as in `clusters/noble/apps/kube-prometheus-stack/values.yaml` (chart **82.15.1**); PVCs **`longhorn`**; **`--wait --timeout 30m`** recommended; verify **`kubectl -n monitoring get pods,pvc`**
|
- [x] **kube-prometheus-stack** — `kubectl apply -f clusters/noble/apps/kube-prometheus-stack/namespace.yaml` then **`helm upgrade --install`** as in `clusters/noble/apps/kube-prometheus-stack/values.yaml` (chart **82.15.1**); PVCs **`longhorn`**; **`--wait --timeout 30m`** recommended; verify **`kubectl -n monitoring get pods,pvc`**
|
||||||
- [ ] **Loki** + **Fluent Bit**; Grafana datasource
|
- [ ] **Loki** + **Fluent Bit** + **Grafana Loki datasource** — **order:** **`kubectl apply -f clusters/noble/apps/loki/namespace.yaml`** → **`helm upgrade --install loki`** `grafana/loki` **6.55.0** `-f clusters/noble/apps/loki/values.yaml` → **`kubectl apply -f clusters/noble/apps/fluent-bit/namespace.yaml`** → **`helm upgrade --install fluent-bit`** `fluent/fluent-bit` **0.56.0** `-f clusters/noble/apps/fluent-bit/values.yaml` → **`kubectl apply -f clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml`**. Verify **Explore → Loki** in Grafana; **`kubectl -n loki get pods,pvc`**, **`kubectl -n logging get pods`**
|
||||||
|
|
||||||
## Phase E — Secrets
|
## Phase E — Secrets
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user