From 2a64f40f936610397e7f15eb63f3b4703f255c25 Mon Sep 17 00:00:00 2001 From: Nikholas Pcenicni <82239765+nikpcenicni@users.noreply.github.com> Date: Fri, 27 Mar 2026 23:45:00 -0400 Subject: [PATCH] Enable pre-upgrade job for Longhorn in values.yaml, update MetalLB README for clarity on LoadBalancer IP assignment, and enhance Talos configuration with node IP validation for VIPs. Update cluster build documentation to reflect new application versions and configurations. --- clusters/noble/apps/cert-manager/README.md | 37 +++++++++ .../clusterissuer-letsencrypt-prod.yaml | 16 ++++ .../clusterissuer-letsencrypt-staging.yaml | 16 ++++ .../apps/cert-manager/kustomization.yaml | 5 ++ .../noble/apps/cert-manager/namespace.yaml | 9 +++ clusters/noble/apps/cert-manager/values.yaml | 14 ++++ clusters/noble/apps/longhorn/values.yaml | 4 +- clusters/noble/apps/metallb/README.md | 6 +- clusters/noble/apps/newt/README.md | 79 +++++++++++++++++++ clusters/noble/apps/newt/namespace.yaml | 9 +++ clusters/noble/apps/newt/values.yaml | 26 ++++++ clusters/noble/apps/traefik/README.md | 33 ++++++++ clusters/noble/apps/traefik/namespace.yaml | 10 +++ clusters/noble/apps/traefik/values.yaml | 29 +++++++ clusters/noble/bootstrap/argocd/README.md | 52 ++++++++++++ .../noble/bootstrap/argocd/apps/README.md | 10 +++ .../bootstrap/argocd/root-application.yaml | 30 +++++++ clusters/noble/bootstrap/argocd/values.yaml | 25 ++++++ talos/CLUSTER-BUILD.md | 45 +++++++---- talos/talconfig.with-longhorn.yaml | 12 ++- talos/talconfig.yaml | 12 ++- 21 files changed, 452 insertions(+), 27 deletions(-) create mode 100644 clusters/noble/apps/cert-manager/README.md create mode 100644 clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml create mode 100644 clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml create mode 100644 clusters/noble/apps/cert-manager/kustomization.yaml create mode 100644 clusters/noble/apps/cert-manager/namespace.yaml create mode 100644 
clusters/noble/apps/cert-manager/values.yaml create mode 100644 clusters/noble/apps/newt/README.md create mode 100644 clusters/noble/apps/newt/namespace.yaml create mode 100644 clusters/noble/apps/newt/values.yaml create mode 100644 clusters/noble/apps/traefik/README.md create mode 100644 clusters/noble/apps/traefik/namespace.yaml create mode 100644 clusters/noble/apps/traefik/values.yaml create mode 100644 clusters/noble/bootstrap/argocd/README.md create mode 100644 clusters/noble/bootstrap/argocd/apps/README.md create mode 100644 clusters/noble/bootstrap/argocd/root-application.yaml create mode 100644 clusters/noble/bootstrap/argocd/values.yaml diff --git a/clusters/noble/apps/cert-manager/README.md b/clusters/noble/apps/cert-manager/README.md new file mode 100644 index 0000000..7a31ae5 --- /dev/null +++ b/clusters/noble/apps/cert-manager/README.md @@ -0,0 +1,37 @@ +# cert-manager — noble + +**Prerequisites:** **Traefik** (ingress class **`traefik`**), DNS for **`*.apps.noble.lab.pcenicni.dev`** → Traefik LB. + +1. Create the namespace: + + ```bash + kubectl apply -f clusters/noble/apps/cert-manager/namespace.yaml + ``` + +2. Install the chart (CRDs included via `values.yaml`): + + ```bash + helm repo add jetstack https://charts.jetstack.io + helm repo update + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --version v1.20.0 \ + -f clusters/noble/apps/cert-manager/values.yaml \ + --wait + ``` + +3. Optionally edit **`spec.acme.email`** in both ClusterIssuer manifests (default **`certificates@noble.lab.pcenicni.dev`**) — Let’s Encrypt uses this for expiry and account notices. Do **not** use **`example.com`** (ACME rejects it). + +4. Apply ClusterIssuers (staging then prod, or both): + + ```bash + kubectl apply -k clusters/noble/apps/cert-manager + ``` + +5. 
Confirm: + + ```bash + kubectl get clusterissuer + ``` + +Use **`cert-manager.io/cluster-issuer: letsencrypt-staging`** on Ingresses while testing; switch to **`letsencrypt-prod`** when ready. diff --git a/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml new file mode 100644 index 0000000..677928b --- /dev/null +++ b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml @@ -0,0 +1,16 @@ +# Let's Encrypt production — trusted certificates; respect rate limits. +# Prefer a real mailbox for expiry notices; this domain is accepted by LE (edit if needed). +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + email: certificates@noble.lab.pcenicni.dev + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - http01: + ingress: + class: traefik diff --git a/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml new file mode 100644 index 0000000..560d839 --- /dev/null +++ b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml @@ -0,0 +1,16 @@ +# Let's Encrypt staging — use for tests (untrusted issuer in browsers). +# Prefer a real mailbox for expiry notices; this domain is accepted by LE (edit if needed). 
+apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + email: certificates@noble.lab.pcenicni.dev + server: https://acme-staging-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: traefik diff --git a/clusters/noble/apps/cert-manager/kustomization.yaml b/clusters/noble/apps/cert-manager/kustomization.yaml new file mode 100644 index 0000000..3443eb3 --- /dev/null +++ b/clusters/noble/apps/cert-manager/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - clusterissuer-letsencrypt-staging.yaml + - clusterissuer-letsencrypt-prod.yaml diff --git a/clusters/noble/apps/cert-manager/namespace.yaml b/clusters/noble/apps/cert-manager/namespace.yaml new file mode 100644 index 0000000..3929e91 --- /dev/null +++ b/clusters/noble/apps/cert-manager/namespace.yaml @@ -0,0 +1,9 @@ +# cert-manager controller + webhook — noble lab +apiVersion: v1 +kind: Namespace +metadata: + name: cert-manager + labels: + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: baseline + pod-security.kubernetes.io/warn: baseline diff --git a/clusters/noble/apps/cert-manager/values.yaml b/clusters/noble/apps/cert-manager/values.yaml new file mode 100644 index 0000000..ea2a2a7 --- /dev/null +++ b/clusters/noble/apps/cert-manager/values.yaml @@ -0,0 +1,14 @@ +# cert-manager — noble lab +# +# Chart: jetstack/cert-manager — pin version on the helm command (e.g. v1.20.0). 
+# +# kubectl apply -f clusters/noble/apps/cert-manager/namespace.yaml +# helm repo add jetstack https://charts.jetstack.io +# helm repo update +# helm upgrade --install cert-manager jetstack/cert-manager -n cert-manager \ +# --version v1.20.0 -f clusters/noble/apps/cert-manager/values.yaml --wait +# +# kubectl apply -k clusters/noble/apps/cert-manager + +crds: + enabled: true diff --git a/clusters/noble/apps/longhorn/values.yaml b/clusters/noble/apps/longhorn/values.yaml index b7bce02..69a34a0 100644 --- a/clusters/noble/apps/longhorn/values.yaml +++ b/clusters/noble/apps/longhorn/values.yaml @@ -16,6 +16,6 @@ defaultSettings: # Default 30% reserved often makes small data disks look "full" to the scheduler. storageReservedPercentageForDefaultDisk: "10" -# Pre-upgrade Job waits for healthy managers; disable while fixing Talos image (iscsi-tools) / kubelet binds, then re-enable. +# Pre-upgrade Job: keep enabled for normal Helm upgrades (disable only if GitOps sync fights the Job). preUpgradeChecker: - jobEnabled: false + jobEnabled: true diff --git a/clusters/noble/apps/metallb/README.md b/clusters/noble/apps/metallb/README.md index cb1fc87..93b6a34 100644 --- a/clusters/noble/apps/metallb/README.md +++ b/clusters/noble/apps/metallb/README.md @@ -41,11 +41,11 @@ Then restart MetalLB pods if they were failing (`kubectl get pods -n metallb-sys kubectl apply -k clusters/noble/apps/metallb ``` -3. Confirm a test `Service` `type: LoadBalancer` receives an address in `192.168.50.210`–`192.168.50.229`. +3. Confirm a `Service` `type: LoadBalancer` receives an address in `192.168.50.210`–`192.168.50.229` (e.g. **`kubectl get svc -n traefik traefik`** after installing **Traefik** in `clusters/noble/apps/traefik/`). -Reserve **one** IP in that range for Argo CD (e.g. `192.168.50.210`) via `spec.loadBalancerIP` or chart values when you expose the server. +Reserve **one** IP in that range for Argo CD (e.g. 
`192.168.50.210`) via `spec.loadBalancerIP` or chart values when you expose the server. Traefik pins **`192.168.50.211`** in **`clusters/noble/apps/traefik/values.yaml`**. -### `Pending` MetalLB pods +## `Pending` MetalLB pods 1. `kubectl get nodes` — every node **`Ready`**? If **`NotReady`** or **`NetworkUnavailable`**, finish **CNI** install first. 2. `kubectl describe pod -n metallb-system ` — read **Events** at the bottom (`0/N nodes are available: …`). diff --git a/clusters/noble/apps/newt/README.md b/clusters/noble/apps/newt/README.md new file mode 100644 index 0000000..1bb62d8 --- /dev/null +++ b/clusters/noble/apps/newt/README.md @@ -0,0 +1,79 @@ +# Newt (Pangolin) — noble + +This is the **primary** automation path for **public** hostnames to workloads in this cluster (it **replaces** in-cluster ExternalDNS). [Newt](https://github.com/fosrl/newt) is the on-prem agent that connects your cluster to a **Pangolin** site (WireGuard tunnel). The [Fossorial Helm chart](https://github.com/fosrl/helm-charts) deploys one or more instances. + +**Secrets:** Never commit endpoint, Newt ID, or Newt secret. If credentials were pasted into chat or CI logs, **rotate them** in Pangolin and recreate the Kubernetes Secret. + +## 1. Create the Secret + +Keys must match `values.yaml` (`PANGOLIN_ENDPOINT`, `NEWT_ID`, `NEWT_SECRET`): + +```bash +kubectl apply -f clusters/noble/apps/newt/namespace.yaml + +kubectl -n newt create secret generic newt-pangolin-auth \ + --from-literal=PANGOLIN_ENDPOINT='https://pangolin.pcenicni.dev' \ + --from-literal=NEWT_ID='YOUR_NEWT_ID' \ + --from-literal=NEWT_SECRET='YOUR_NEWT_SECRET' +``` + +Use the Pangolin UI or [Integration API](https://docs.pangolin.net/manage/common-api-routes) (`pick-site-defaults` + `create site`) to obtain a Newt ID and secret for a new site if you are not reusing an existing pair. + +## 2. 
Install the chart + +```bash +helm repo add fossorial https://charts.fossorial.io +helm repo update +helm upgrade --install newt fossorial/newt \ + --namespace newt \ + --version 1.2.0 \ + -f clusters/noble/apps/newt/values.yaml \ + --wait +``` + +## 3. DNS: CNAME at your DNS host + Pangolin API for routes + +Pangolin does not replace your public DNS provider. Typical flow: + +1. **Link a domain** in Pangolin (organization **Domains**). For **CNAME**-style domains, Pangolin shows the hostname you must **CNAME** to at Cloudflare / your registrar (see [Domains](https://docs.pangolin.net/manage/common-api-routes#list-domains)). +2. **Create public HTTP resources** (and **targets** to your Newt **site**) via the [Integration API](https://docs.pangolin.net/manage/integration-api) — same flows as the UI. Swagger: `https://<pangolin-domain>/v1/docs` (self-hosted: enable `enable_integration_api` and route `api.example.com` → integration port per [docs](https://docs.pangolin.net/self-host/advanced/integration-api)). 
+ +Minimal patterns (Bearer token = org or root API key): + +```bash +export API_BASE='https://api.example.com/v1' # your Pangolin Integration API base +export ORG_ID='your-org-id' +export TOKEN='your-integration-api-key' + +# Domains already linked to the org (use domainId when creating a resource) +curl -sS -H "Authorization: Bearer ${TOKEN}" \ + "${API_BASE}/org/${ORG_ID}/domains" + +# Create an HTTP resource on a domain (FQDN = subdomain + base domain for NS/wildcard domains) +curl -sS -X PUT -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' \ + "${API_BASE}/org/${ORG_ID}/resource" \ + -d '{ + "name": "Example app", + "http": true, + "domainId": "YOUR_DOMAIN_ID", + "protocol": "tcp", + "subdomain": "my-app" + }' + +# Point the resource at your Newt site backend (siteId from list sites / create site; ip:port inside the tunnel) +curl -sS -X PUT -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' \ + "${API_BASE}/resource/RESOURCE_ID/target" \ + -d '{ + "siteId": YOUR_SITE_ID, + "ip": "10.x.x.x", + "port": 443, + "method": "http" + }' +``` + +Exact JSON fields and IDs differ by domain type (**ns** vs **cname** vs **wildcard**); see [Common API routes](https://docs.pangolin.net/manage/common-api-routes) and Swagger. + +## LAN vs internet + +- **LAN / VPN:** point **`*.apps.noble.lab.pcenicni.dev`** at the Traefik **LoadBalancer** (**`192.168.50.211`**) with local or split-horizon DNS if you want direct in-lab access. +- **Internet-facing:** use Pangolin **resources** + **targets** to the Newt **site**; public names rely on **CNAME** records at your DNS provider per Pangolin’s domain setup, not on ExternalDNS in the cluster. 
diff --git a/clusters/noble/apps/newt/namespace.yaml b/clusters/noble/apps/newt/namespace.yaml new file mode 100644 index 0000000..38c4ec3 --- /dev/null +++ b/clusters/noble/apps/newt/namespace.yaml @@ -0,0 +1,9 @@ +# Newt (Pangolin site tunnel client) — noble lab +apiVersion: v1 +kind: Namespace +metadata: + name: newt + labels: + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: baseline + pod-security.kubernetes.io/warn: baseline diff --git a/clusters/noble/apps/newt/values.yaml b/clusters/noble/apps/newt/values.yaml new file mode 100644 index 0000000..e238912 --- /dev/null +++ b/clusters/noble/apps/newt/values.yaml @@ -0,0 +1,26 @@ +# Newt — noble lab (Fossorial Helm chart) +# +# Credentials MUST come from a Secret — do not put endpoint/id/secret in git. +# +# kubectl apply -f clusters/noble/apps/newt/namespace.yaml +# kubectl -n newt create secret generic newt-pangolin-auth \ +# --from-literal=PANGOLIN_ENDPOINT='https://pangolin.example.com' \ +# --from-literal=NEWT_ID='...' \ +# --from-literal=NEWT_SECRET='...' +# +# helm repo add fossorial https://charts.fossorial.io +# helm upgrade --install newt fossorial/newt -n newt \ +# --version 1.2.0 -f clusters/noble/apps/newt/values.yaml --wait +# +# See README.md for Pangolin Integration API (domains + HTTP resources + CNAME). + +newtInstances: + - name: main-tunnel + enabled: true + replicas: 1 + auth: + existingSecretName: newt-pangolin-auth + keys: + endpointKey: PANGOLIN_ENDPOINT + idKey: NEWT_ID + secretKey: NEWT_SECRET diff --git a/clusters/noble/apps/traefik/README.md b/clusters/noble/apps/traefik/README.md new file mode 100644 index 0000000..51598b0 --- /dev/null +++ b/clusters/noble/apps/traefik/README.md @@ -0,0 +1,33 @@ +# Traefik — noble + +**Prerequisites:** **Cilium**, **MetalLB** (pool + L2), nodes **Ready**. + +1. 
Create the namespace (Pod Security **baseline** — Traefik needs more than **restricted**): + + ```bash + kubectl apply -f clusters/noble/apps/traefik/namespace.yaml + ``` + +2. Install the chart (**do not** use `--create-namespace` if the namespace already exists): + + ```bash + helm repo add traefik https://traefik.github.io/charts + helm repo update + helm upgrade --install traefik traefik/traefik \ + --namespace traefik \ + --version 39.0.6 \ + -f clusters/noble/apps/traefik/values.yaml \ + --wait + ``` + +3. Confirm the Service has a pool address. On the **LAN**, **`*.apps.noble.lab.pcenicni.dev`** can resolve to this IP (split horizon / local DNS). **Public** names go through **Pangolin + Newt** (CNAME + API), not ExternalDNS — see **`clusters/noble/apps/newt/README.md`**. + + ```bash + kubectl get svc -n traefik traefik + ``` + + Values pin **`192.168.50.211`** via **`metallb.io/loadBalancerIPs`**. **`192.168.50.210`** stays free for Argo CD. + +4. Create **Ingress** resources with **`ingressClassName: traefik`** (or rely on the default class). **TLS:** add **`cert-manager.io/cluster-issuer: letsencrypt-staging`** (or **`letsencrypt-prod`**) and **`tls`** hosts — see **`clusters/noble/apps/cert-manager/README.md`**. + +5. **Public DNS:** use **Newt** + Pangolin (**CNAME** at your DNS host + **Integration API** for resources/targets) — **`clusters/noble/apps/newt/README.md`**. diff --git a/clusters/noble/apps/traefik/namespace.yaml b/clusters/noble/apps/traefik/namespace.yaml new file mode 100644 index 0000000..b758f9a --- /dev/null +++ b/clusters/noble/apps/traefik/namespace.yaml @@ -0,0 +1,10 @@ +# Traefik controller — apply before Helm (omit --create-namespace on install). +# Ingress controller needs capabilities beyond "restricted"; use baseline. 
+apiVersion: v1 +kind: Namespace +metadata: + name: traefik + labels: + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: baseline + pod-security.kubernetes.io/warn: baseline diff --git a/clusters/noble/apps/traefik/values.yaml b/clusters/noble/apps/traefik/values.yaml new file mode 100644 index 0000000..e74b28c --- /dev/null +++ b/clusters/noble/apps/traefik/values.yaml @@ -0,0 +1,29 @@ +# Traefik ingress controller — noble lab +# +# Chart: traefik/traefik — pin version on the helm command (e.g. 39.0.6). +# DNS: point *.apps.noble.lab.pcenicni.dev to the LoadBalancer IP below. +# +# kubectl apply -f clusters/noble/apps/traefik/namespace.yaml +# helm repo add traefik https://traefik.github.io/charts +# helm upgrade --install traefik traefik/traefik -n traefik \ +# --version 39.0.6 -f clusters/noble/apps/traefik/values.yaml --wait + +service: + type: LoadBalancer + annotations: + metallb.io/loadBalancerIPs: 192.168.50.211 + +ingressClass: + enabled: true + isDefaultClass: true + name: traefik + +# Ingress-only; Gateway API objects from the chart are not needed here. +gateway: + enabled: false + +gatewayClass: + enabled: false + +deployment: + replicas: 1 diff --git a/clusters/noble/bootstrap/argocd/README.md b/clusters/noble/bootstrap/argocd/README.md new file mode 100644 index 0000000..dfd2433 --- /dev/null +++ b/clusters/noble/bootstrap/argocd/README.md @@ -0,0 +1,52 @@ +# Argo CD — noble (bootstrap) + +**Prerequisites:** cluster **Ready**, **MetalLB** pool **`192.168.50.210`–`229`** (Argo CD uses **`192.168.50.210`**; Traefik **`192.168.50.211`**). + +## 1. Install + +```bash +helm repo add argo https://argoproj.github.io/argo-helm +helm repo update +helm upgrade --install argocd argo/argo-cd \ + --namespace argocd \ + --create-namespace \ + --version 9.4.17 \ + -f clusters/noble/bootstrap/argocd/values.yaml \ + --wait +``` + +## 2. 
UI / CLI address + +```bash +kubectl get svc -n argocd argocd-server +``` + +**LoadBalancer** should show **`192.168.50.210`**. Log in as **`admin`**; initial password: + +```bash +kubectl -n argocd get secret argocd-initial-admin-secret \ + -o jsonpath='{.data.password}' | base64 -d +echo +``` + +Change the password in the UI or via `argocd account update-password`. + +## 3. Register this repo (if private) + +Use **Settings → Repositories** in the UI, or `argocd repo add` / a `Secret` of type `repository`. + +## 4. App-of-apps (optional) + +1. Edit **`root-application.yaml`**: set **`repoURL`** and **`targetRevision`** to this repository. +2. Commit **`Application`** manifests under **`apps/`** (see **`apps/README.md`**). +3. Apply the root: + + ```bash + kubectl apply -f clusters/noble/bootstrap/argocd/root-application.yaml + ``` + +Until **`apps/`** contains valid **`Application`** resources, the root app may show **OutOfSync** or sync nothing — that is expected. + +## Versions + +Pinned in **`values.yaml`** comments (chart **9.4.17** / Argo CD **v3.3.6** at time of writing). Bump **`--version`** when upgrading. diff --git a/clusters/noble/bootstrap/argocd/apps/README.md b/clusters/noble/bootstrap/argocd/apps/README.md new file mode 100644 index 0000000..14e4af0 --- /dev/null +++ b/clusters/noble/bootstrap/argocd/apps/README.md @@ -0,0 +1,10 @@ +# Argo CD — app-of-apps children + +Add **`Application`** manifests here (one file per workload or group). The **`noble-root`** Application in the parent directory syncs this folder. + +Example patterns: + +- **Helm:** `spec.source` with `chart`, `repoURL` (Helm repo), and `helm.valueFiles` pointing at paths in the same git repo. +- **Kustomize / plain manifests:** `spec.source.path` to a directory of YAML. + +The historical **`clusters/noble/apps/*`** tree is written for **manual `helm upgrade`**; migrating each app to an Argo CD `Application` is optional follow-up work. 
diff --git a/clusters/noble/bootstrap/argocd/root-application.yaml b/clusters/noble/bootstrap/argocd/root-application.yaml new file mode 100644 index 0000000..79dcc72 --- /dev/null +++ b/clusters/noble/bootstrap/argocd/root-application.yaml @@ -0,0 +1,30 @@ +# App-of-apps root — apply after Argo CD is running. +# +# 1. Set spec.source.repoURL (and targetRevision) to this git repository. +# 2. kubectl apply -f clusters/noble/bootstrap/argocd/root-application.yaml +# +# Syncs **Application** YAMLs under **apps/** (add workloads there). Do **not** +# point at **clusters/noble/apps/** — that tree is Helm values for manual installs. +# +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: noble-root + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git + targetRevision: main + path: clusters/noble/bootstrap/argocd/apps + destination: + server: https://kubernetes.default.svc + namespace: argocd + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/clusters/noble/bootstrap/argocd/values.yaml b/clusters/noble/bootstrap/argocd/values.yaml new file mode 100644 index 0000000..200e2fd --- /dev/null +++ b/clusters/noble/bootstrap/argocd/values.yaml @@ -0,0 +1,25 @@ +# Argo CD — noble lab (GitOps) +# +# Chart: argo/argo-cd — pin version on the helm command (e.g. 9.4.17). +# MetalLB: Argo CD UI/API uses pool IP **192.168.50.210** (Traefik stays **192.168.50.211**). 
+# +# helm repo add argo https://argoproj.github.io/argo-helm +# helm upgrade --install argocd argo/argo-cd -n argocd --create-namespace \ +# --version 9.4.17 -f clusters/noble/bootstrap/argocd/values.yaml --wait +# +# Initial admin password: kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d +# +# Optional: kubectl apply -f clusters/noble/bootstrap/argocd/root-application.yaml + +global: + domain: "" + +configs: + params: + server.insecure: false + +server: + service: + type: LoadBalancer + annotations: + metallb.io/loadBalancerIPs: 192.168.50.210 diff --git a/talos/CLUSTER-BUILD.md b/talos/CLUSTER-BUILD.md index ed2576d..05ad58a 100644 --- a/talos/CLUSTER-BUILD.md +++ b/talos/CLUSTER-BUILD.md @@ -9,7 +9,12 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - **MetalLB** Helm **0.15.3** / app **v0.15.3**; **IPAddressPool** `noble-l2` + **L2Advertisement** — pool **`192.168.50.210`–`192.168.50.229`**. - **kube-vip** DaemonSet **3/3** on control planes; VIP **`192.168.50.230`** on **`ens18`** (`vip_subnet` **`/32`** required — bare **`32`** breaks parsing). **Verified from workstation:** `kubectl config set-cluster noble --server=https://192.168.50.230:6443` then **`kubectl get --raw /healthz`** → **`ok`** (`talos/kubeconfig`; see `talos/README.md`). - **metrics-server** Helm **3.13.0** / app **v0.8.0** — `clusters/noble/apps/metrics-server/values.yaml` (`--kubelet-insecure-tls` for Talos); **`kubectl top nodes`** works. -- **Still open:** Longhorn, Traefik, cert-manager, Argo CD, observability — checklist below. +- **Longhorn** Helm **1.11.1** / app **v1.11.1** — `clusters/noble/apps/longhorn/` (PSA **privileged** namespace, `defaultDataPath` `/var/mnt/longhorn`, `preUpgradeChecker` enabled); **StorageClass** `longhorn` (default); **`nodes.longhorn.io`** all **Ready**; test **PVC** `Bound` on `longhorn`. 
+- **Traefik** Helm **39.0.6** / app **v3.6.11** — `clusters/noble/apps/traefik/`; **`Service`** **`LoadBalancer`** **`EXTERNAL-IP` `192.168.50.211`**; **`IngressClass`** **`traefik`** (default). Point **`*.apps.noble.lab.pcenicni.dev`** at **`192.168.50.211`**. MetalLB pool verification was done before replacing the temporary nginx test with Traefik. +- **cert-manager** Helm **v1.20.0** / app **v1.20.0** — `clusters/noble/apps/cert-manager/`; **`ClusterIssuer`** **`letsencrypt-staging`** and **`letsencrypt-prod`** (HTTP-01, ingress class **`traefik`**); ACME email **`certificates@noble.lab.pcenicni.dev`** (edit in manifests if you want a different mailbox). +- **Newt** Helm **1.2.0** / app **1.10.1** — `clusters/noble/apps/newt/` (**fossorial/newt**); Pangolin site tunnel — **`newt-pangolin-auth`** Secret (**`PANGOLIN_ENDPOINT`**, **`NEWT_ID`**, **`NEWT_SECRET`**). **Public DNS** is **not** automated with ExternalDNS: **CNAME** records at your DNS host per Pangolin’s domain instructions, plus **Integration API** for HTTP resources/targets — see **`clusters/noble/apps/newt/README.md`**. LAN access to Traefik can still use **`*.apps.noble.lab.pcenicni.dev`** → **`192.168.50.211`** (split horizon / local resolver). +- **Argo CD** Helm **9.4.17** / app **v3.3.6** — `clusters/noble/bootstrap/argocd/`; **`argocd-server`** **`LoadBalancer`** **`192.168.50.210`**; app-of-apps scaffold under **`bootstrap/argocd/apps/`** (edit **`root-application.yaml`** `repoURL` before applying). +- **Still open:** observability — checklist below. ## Inventory @@ -27,8 +32,9 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) | Kubernetes API VIP (kube-vip) | `192.168.50.230` (see `talos/README.md`; align with `talos/talconfig.yaml` `additionalApiServerCertSans`) | | MetalLB L2 pool | `192.168.50.210`–`192.168.50.229` | | Argo CD `LoadBalancer` | **Pick one IP** in the MetalLB pool (e.g. 
`192.168.50.210`) | -| Apps ingress DNS | `*.apps.noble.lab.pcenicni.dev` | -| ExternalDNS | Pangolin (map to supported ExternalDNS provider when documented) | +| Traefik (apps ingress) | `192.168.50.211` — **`metallb.io/loadBalancerIPs`** in `clusters/noble/apps/traefik/values.yaml` | +| Apps ingress (LAN / split horizon) | `*.apps.noble.lab.pcenicni.dev` → Traefik LB | +| Public DNS (Pangolin) | **Newt** tunnel + **CNAME** at registrar + **Integration API** — `clusters/noble/apps/newt/` | | Velero | S3-compatible URL — configure later | ## Versions @@ -39,6 +45,11 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - Cilium: **1.16.6** (Helm chart; see `clusters/noble/apps/cilium/README.md`) - MetalLB: **0.15.3** (Helm chart; app **v0.15.3**) - metrics-server: **3.13.0** (Helm chart; app **v0.8.0**) +- Longhorn: **1.11.1** (Helm chart; app **v1.11.1**) +- Traefik: **39.0.6** (Helm chart; app **v3.6.11**) +- cert-manager: **v1.20.0** (Helm chart; app **v1.20.0**) +- Newt (Fossorial): **1.2.0** (Helm chart; app **1.10.1**) +- Argo CD: **9.4.17** (Helm chart `argo/argo-cd`; app **v3.3.6**) ## Repo paths (this workspace) @@ -52,8 +63,12 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) | kube-vip (kustomize) | `clusters/noble/apps/kube-vip/` (`vip_interface` e.g. 
`ens18`) | | Cilium (Helm values) | `clusters/noble/apps/cilium/` — `values.yaml` (phase 1), optional `values-kpr.yaml`, `README.md` | | MetalLB | `clusters/noble/apps/metallb/` — `namespace.yaml` (PSA **privileged**), `ip-address-pool.yaml`, `kustomization.yaml`, `README.md` | -| Longhorn Helm values | `clusters/noble/apps/longhorn/values.yaml` | +| Longhorn | `clusters/noble/apps/longhorn/` — `values.yaml`, `namespace.yaml` (PSA **privileged**), `kustomization.yaml` | | metrics-server (Helm values) | `clusters/noble/apps/metrics-server/values.yaml` | +| Traefik (Helm values) | `clusters/noble/apps/traefik/` — `values.yaml`, `namespace.yaml`, `README.md` | +| cert-manager (Helm + ClusterIssuers) | `clusters/noble/apps/cert-manager/` — `values.yaml`, `namespace.yaml`, `kustomization.yaml`, `README.md` | +| Newt / Pangolin tunnel (Helm) | `clusters/noble/apps/newt/` — `values.yaml`, `namespace.yaml`, `README.md` | +| Argo CD (bootstrap + app-of-apps) | `clusters/noble/bootstrap/argocd/` — `values.yaml`, `root-application.yaml`, `apps/`, `README.md` | **Git vs cluster:** manifests and `talconfig` live in git; **`talhelper genconfig -o out`**, bootstrap, Helm, and `kubectl` run on your LAN. See **`talos/README.md`** for workstation reachability (lab LAN/VPN), **`talosctl kubeconfig`** vs Kubernetes `server:` (VIP vs node IP), and **`--insecure`** only in maintenance. 
@@ -91,17 +106,18 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - [x] **Cilium** (Helm **1.16.6**) — **required** before MetalLB if `cni: none` (`clusters/noble/apps/cilium/`) - [x] **metrics-server** — Helm **3.13.0**; values in `clusters/noble/apps/metrics-server/values.yaml`; verify `kubectl top nodes` -- [ ] **Longhorn** — Talos: `talconfig.with-longhorn.yaml` + `talos/README.md` §5; Helm: `clusters/noble/apps/longhorn/values.yaml` (`defaultDataPath` `/var/mnt/longhorn`) +- [x] **Longhorn** — Talos: user volume + kubelet mounts + extensions (`talos/README.md` §5); Helm **1.11.1**; `kubectl apply -k clusters/noble/apps/longhorn`; verify **`nodes.longhorn.io`** and test PVC **`Bound`** - [x] **MetalLB** — chart installed; **pool + L2** from `clusters/noble/apps/metallb/` applied (`192.168.50.210`–`229`) -- [ ] **`Service` `LoadBalancer`** test — assign an IP from `210`–`229` (e.g. dummy `LoadBalancer` or Traefik) -- [ ] **Traefik** `LoadBalancer` for `*.apps.noble.lab.pcenicni.dev` -- [ ] **cert-manager** + ClusterIssuer (staging → prod) -- [ ] **ExternalDNS** (Pangolin-compatible provider) +- [x] **`Service` `LoadBalancer`** / pool check — MetalLB assigns from `210`–`229` (validated before Traefik; temporary nginx test removed in favor of Traefik) +- [x] **Traefik** `LoadBalancer` for `*.apps.noble.lab.pcenicni.dev` — `clusters/noble/apps/traefik/`; **`192.168.50.211`** +- [x] **cert-manager** + ClusterIssuer (**`letsencrypt-staging`** / **`letsencrypt-prod`**) — `clusters/noble/apps/cert-manager/` +- [x] **Newt** (Pangolin tunnel; replaces ExternalDNS for public DNS) — `clusters/noble/apps/newt/` — **`newt-pangolin-auth`**; CNAME + **Integration API** per **`newt/README.md`** ## Phase C — GitOps -- [ ] **Argo CD** bootstrap (`clusters/noble/bootstrap/argocd`, root app) — path TBD when added -- [ ] Argo CD server **LoadBalancer** with dedicated pool IP +- [x] **Argo CD** bootstrap — `clusters/noble/bootstrap/argocd/` (`helm 
upgrade --install argocd …`) +- [x] Argo CD server **LoadBalancer** — **`192.168.50.210`** (see `values.yaml`) +- [ ] **App-of-apps** — set **`repoURL`** in **`root-application.yaml`**, add **`Application`** manifests under **`bootstrap/argocd/apps/`**, apply **`root-application.yaml`** - [ ] SSO — later ## Phase D — Observability @@ -129,9 +145,10 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - [x] `kubectl get nodes` — all **Ready** - [x] API via VIP `:6443` — **`kubectl get --raw /healthz`** → **`ok`** with kubeconfig **`server:`** `https://192.168.50.230:6443` -- [ ] Test `LoadBalancer` receives IP from `210`–`229` -- [ ] Sample Ingress + cert + ExternalDNS record -- [ ] PVC bound; Prometheus/Loki durable if configured +- [x] Ingress **`LoadBalancer`** in pool `210`–`229` (**Traefik** → **`192.168.50.211`**) +- [x] **Argo CD** UI — **`argocd-server`** **`LoadBalancer`** **`192.168.50.210`** (initial **`admin`** password from **`argocd-initial-admin-secret`**) +- [ ] Sample Ingress + cert (cert-manager ready) + Pangolin resource + CNAME +- [x] PVC **`Bound`** on **Longhorn** (`storageClassName: longhorn`); Prometheus/Loki durable when configured --- diff --git a/talos/talconfig.with-longhorn.yaml b/talos/talconfig.with-longhorn.yaml index b4d9244..2069860 100644 --- a/talos/talconfig.with-longhorn.yaml +++ b/talos/talconfig.with-longhorn.yaml @@ -10,10 +10,10 @@ # After changing schematic/extensions: regenerate configs, upgrade nodes with new installer image, then reboot if needed. # Helm must set defaultDataPath to /var/mnt/longhorn (see clusters/noble/apps/longhorn/values.yaml). # -# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer: -# factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -# After edits, run talhelper genconfig — `machine.install.image` in out/*.yaml should match this schematic (path may be metal-installer/ etc. 
on bare metal). -# Upgrade: talosctl upgrade --image -n +# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer — pinned per-node via `talosImageURL` +# (base URL only, no `:tag` — talhelper validates and appends `talosVersion`). +# After edits: `talhelper genconfig -o out` → `machine.install.image` in out/*.yaml. +# Cluster upgrade: `talosctl upgrade --image factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -n <node-ip> --wait` clusterName: noble talosVersion: v1.12.6 endpoint: https://192.168.50.230:6443 @@ -74,6 +74,10 @@ patches: name: none machine: kubelet: + # Avoid NodeIPController warnings when VIP / multiple addresses exist; pin to LAN. + nodeIP: + validSubnets: + - 192.168.50.0/24 extraMounts: - destination: /var/mnt/longhorn type: bind diff --git a/talos/talconfig.yaml b/talos/talconfig.yaml index b4d9244..2069860 100644 --- a/talos/talconfig.yaml +++ b/talos/talconfig.yaml @@ -10,10 +10,10 @@ # After changing schematic/extensions: regenerate configs, upgrade nodes with new installer image, then reboot if needed. # Helm must set defaultDataPath to /var/mnt/longhorn (see clusters/noble/apps/longhorn/values.yaml). # -# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer: -# factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -# After edits, run talhelper genconfig — `machine.install.image` in out/*.yaml should match this schematic (path may be metal-installer/ etc. on bare metal). -# Upgrade: talosctl upgrade --image -n +# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer — pinned per-node via `talosImageURL` +# (base URL only, no `:tag` — talhelper validates and appends `talosVersion`). +# After edits: `talhelper genconfig -o out` → `machine.install.image` in out/*.yaml. 
+# Cluster upgrade: `talosctl upgrade --image factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -n <node-ip> --wait` clusterName: noble talosVersion: v1.12.6 endpoint: https://192.168.50.230:6443 @@ -74,6 +74,10 @@ patches: name: none machine: kubelet: + # Avoid NodeIPController warnings when VIP / multiple addresses exist; pin to LAN. + nodeIP: + validSubnets: + - 192.168.50.0/24 extraMounts: - destination: /var/mnt/longhorn type: bind