diff --git a/clusters/noble/apps/cert-manager/README.md b/clusters/noble/apps/cert-manager/README.md new file mode 100644 index 0000000..7a31ae5 --- /dev/null +++ b/clusters/noble/apps/cert-manager/README.md @@ -0,0 +1,37 @@ +# cert-manager — noble + +**Prerequisites:** **Traefik** (ingress class **`traefik`**), DNS for **`*.apps.noble.lab.pcenicni.dev`** → Traefik LB. + +1. Create the namespace: + + ```bash + kubectl apply -f clusters/noble/apps/cert-manager/namespace.yaml + ``` + +2. Install the chart (CRDs included via `values.yaml`): + + ```bash + helm repo add jetstack https://charts.jetstack.io + helm repo update + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --version v1.20.0 \ + -f clusters/noble/apps/cert-manager/values.yaml \ + --wait + ``` + +3. Optionally edit **`spec.acme.email`** in both ClusterIssuer manifests (default **`certificates@noble.lab.pcenicni.dev`**) — Let’s Encrypt uses this for expiry and account notices. Do **not** use **`example.com`** (ACME rejects it). + +4. Apply ClusterIssuers (staging then prod, or both): + + ```bash + kubectl apply -k clusters/noble/apps/cert-manager + ``` + +5. Confirm: + + ```bash + kubectl get clusterissuer + ``` + +Use **`cert-manager.io/cluster-issuer: letsencrypt-staging`** on Ingresses while testing; switch to **`letsencrypt-prod`** when ready. diff --git a/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml new file mode 100644 index 0000000..677928b --- /dev/null +++ b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-prod.yaml @@ -0,0 +1,16 @@ +# Let's Encrypt production — trusted certificates; respect rate limits. +# Prefer a real mailbox for expiry notices; this domain is accepted by LE (edit if needed). 
+apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + email: certificates@noble.lab.pcenicni.dev + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - http01: + ingress: + class: traefik diff --git a/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml new file mode 100644 index 0000000..560d839 --- /dev/null +++ b/clusters/noble/apps/cert-manager/clusterissuer-letsencrypt-staging.yaml @@ -0,0 +1,16 @@ +# Let's Encrypt staging — use for tests (untrusted issuer in browsers). +# Prefer a real mailbox for expiry notices; this domain is accepted by LE (edit if needed). +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + email: certificates@noble.lab.pcenicni.dev + server: https://acme-staging-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: traefik diff --git a/clusters/noble/apps/cert-manager/kustomization.yaml b/clusters/noble/apps/cert-manager/kustomization.yaml new file mode 100644 index 0000000..3443eb3 --- /dev/null +++ b/clusters/noble/apps/cert-manager/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - clusterissuer-letsencrypt-staging.yaml + - clusterissuer-letsencrypt-prod.yaml diff --git a/clusters/noble/apps/cert-manager/namespace.yaml b/clusters/noble/apps/cert-manager/namespace.yaml new file mode 100644 index 0000000..3929e91 --- /dev/null +++ b/clusters/noble/apps/cert-manager/namespace.yaml @@ -0,0 +1,9 @@ +# cert-manager controller + webhook — noble lab +apiVersion: v1 +kind: Namespace +metadata: + name: cert-manager + labels: + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: baseline + 
pod-security.kubernetes.io/warn: baseline diff --git a/clusters/noble/apps/cert-manager/values.yaml b/clusters/noble/apps/cert-manager/values.yaml new file mode 100644 index 0000000..ea2a2a7 --- /dev/null +++ b/clusters/noble/apps/cert-manager/values.yaml @@ -0,0 +1,14 @@ +# cert-manager — noble lab +# +# Chart: jetstack/cert-manager — pin version on the helm command (e.g. v1.20.0). +# +# kubectl apply -f clusters/noble/apps/cert-manager/namespace.yaml +# helm repo add jetstack https://charts.jetstack.io +# helm repo update +# helm upgrade --install cert-manager jetstack/cert-manager -n cert-manager \ +# --version v1.20.0 -f clusters/noble/apps/cert-manager/values.yaml --wait +# +# kubectl apply -k clusters/noble/apps/cert-manager + +crds: + enabled: true diff --git a/clusters/noble/apps/longhorn/values.yaml b/clusters/noble/apps/longhorn/values.yaml index b7bce02..69a34a0 100644 --- a/clusters/noble/apps/longhorn/values.yaml +++ b/clusters/noble/apps/longhorn/values.yaml @@ -16,6 +16,6 @@ defaultSettings: # Default 30% reserved often makes small data disks look "full" to the scheduler. storageReservedPercentageForDefaultDisk: "10" -# Pre-upgrade Job waits for healthy managers; disable while fixing Talos image (iscsi-tools) / kubelet binds, then re-enable. +# Pre-upgrade Job: keep enabled for normal Helm upgrades (disable only if GitOps sync fights the Job). preUpgradeChecker: - jobEnabled: false + jobEnabled: true diff --git a/clusters/noble/apps/metallb/README.md b/clusters/noble/apps/metallb/README.md index cb1fc87..93b6a34 100644 --- a/clusters/noble/apps/metallb/README.md +++ b/clusters/noble/apps/metallb/README.md @@ -41,11 +41,11 @@ Then restart MetalLB pods if they were failing (`kubectl get pods -n metallb-sys kubectl apply -k clusters/noble/apps/metallb ``` -3. Confirm a test `Service` `type: LoadBalancer` receives an address in `192.168.50.210`–`192.168.50.229`. +3. 
Confirm a `Service` `type: LoadBalancer` receives an address in `192.168.50.210`–`192.168.50.229` (e.g. **`kubectl get svc -n traefik traefik`** after installing **Traefik** in `clusters/noble/apps/traefik/`). -Reserve **one** IP in that range for Argo CD (e.g. `192.168.50.210`) via `spec.loadBalancerIP` or chart values when you expose the server. +Reserve **one** IP in that range for Argo CD (e.g. `192.168.50.210`) via `spec.loadBalancerIP` or chart values when you expose the server. Traefik pins **`192.168.50.211`** in **`clusters/noble/apps/traefik/values.yaml`**. -### `Pending` MetalLB pods +## `Pending` MetalLB pods 1. `kubectl get nodes` — every node **`Ready`**? If **`NotReady`** or **`NetworkUnavailable`**, finish **CNI** install first. 2. `kubectl describe pod -n metallb-system ` — read **Events** at the bottom (`0/N nodes are available: …`). diff --git a/clusters/noble/apps/newt/README.md b/clusters/noble/apps/newt/README.md new file mode 100644 index 0000000..1bb62d8 --- /dev/null +++ b/clusters/noble/apps/newt/README.md @@ -0,0 +1,79 @@ +# Newt (Pangolin) — noble + +This is the **primary** automation path for **public** hostnames to workloads in this cluster (it **replaces** in-cluster ExternalDNS). [Newt](https://github.com/fosrl/newt) is the on-prem agent that connects your cluster to a **Pangolin** site (WireGuard tunnel). The [Fossorial Helm chart](https://github.com/fosrl/helm-charts) deploys one or more instances. + +**Secrets:** Never commit endpoint, Newt ID, or Newt secret. If credentials were pasted into chat or CI logs, **rotate them** in Pangolin and recreate the Kubernetes Secret. + +## 1. 
Create the Secret + +Keys must match `values.yaml` (`PANGOLIN_ENDPOINT`, `NEWT_ID`, `NEWT_SECRET`): + +```bash +kubectl apply -f clusters/noble/apps/newt/namespace.yaml + +kubectl -n newt create secret generic newt-pangolin-auth \ + --from-literal=PANGOLIN_ENDPOINT='https://pangolin.pcenicni.dev' \ + --from-literal=NEWT_ID='YOUR_NEWT_ID' \ + --from-literal=NEWT_SECRET='YOUR_NEWT_SECRET' +``` + +Use the Pangolin UI or [Integration API](https://docs.pangolin.net/manage/common-api-routes) (`pick-site-defaults` + `create site`) to obtain a Newt ID and secret for a new site if you are not reusing an existing pair. + +## 2. Install the chart + +```bash +helm repo add fossorial https://charts.fossorial.io +helm repo update +helm upgrade --install newt fossorial/newt \ + --namespace newt \ + --version 1.2.0 \ + -f clusters/noble/apps/newt/values.yaml \ + --wait +``` + +## 3. DNS: CNAME at your DNS host + Pangolin API for routes + +Pangolin does not replace your public DNS provider. Typical flow: + +1. **Link a domain** in Pangolin (organization **Domains**). For **CNAME**-style domains, Pangolin shows the hostname you must **CNAME** to at Cloudflare / your registrar (see [Domains](https://docs.pangolin.net/manage/common-api-routes#list-domains)). +2. **Create public HTTP resources** (and **targets** to your Newt **site**) via the [Integration API](https://docs.pangolin.net/manage/integration-api) — same flows as the UI. Swagger: `https://<pangolin-host>/v1/docs` (self-hosted: enable `enable_integration_api` and route `api.example.com` → integration port per [docs](https://docs.pangolin.net/self-host/advanced/integration-api)). 
+ +Minimal patterns (Bearer token = org or root API key): + +```bash +export API_BASE='https://api.example.com/v1' # your Pangolin Integration API base +export ORG_ID='your-org-id' +export TOKEN='your-integration-api-key' + +# Domains already linked to the org (use domainId when creating a resource) +curl -sS -H "Authorization: Bearer ${TOKEN}" \ + "${API_BASE}/org/${ORG_ID}/domains" + +# Create an HTTP resource on a domain (FQDN = subdomain + base domain for NS/wildcard domains) +curl -sS -X PUT -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' \ + "${API_BASE}/org/${ORG_ID}/resource" \ + -d '{ + "name": "Example app", + "http": true, + "domainId": "YOUR_DOMAIN_ID", + "protocol": "tcp", + "subdomain": "my-app" + }' + +# Point the resource at your Newt site backend (siteId from list sites / create site; ip:port inside the tunnel) +curl -sS -X PUT -H "Authorization: Bearer ${TOKEN}" -H 'Content-Type: application/json' \ + "${API_BASE}/resource/RESOURCE_ID/target" \ + -d '{ + "siteId": YOUR_SITE_ID, + "ip": "10.x.x.x", + "port": 443, + "method": "http" + }' +``` + +Exact JSON fields and IDs differ by domain type (**ns** vs **cname** vs **wildcard**); see [Common API routes](https://docs.pangolin.net/manage/common-api-routes) and Swagger. + +## LAN vs internet + +- **LAN / VPN:** point **`*.apps.noble.lab.pcenicni.dev`** at the Traefik **LoadBalancer** (**`192.168.50.211`**) with local or split-horizon DNS if you want direct in-lab access. +- **Internet-facing:** use Pangolin **resources** + **targets** to the Newt **site**; public names rely on **CNAME** records at your DNS provider per Pangolin’s domain setup, not on ExternalDNS in the cluster. 
diff --git a/clusters/noble/apps/newt/namespace.yaml b/clusters/noble/apps/newt/namespace.yaml new file mode 100644 index 0000000..38c4ec3 --- /dev/null +++ b/clusters/noble/apps/newt/namespace.yaml @@ -0,0 +1,9 @@ +# Newt (Pangolin site tunnel client) — noble lab +apiVersion: v1 +kind: Namespace +metadata: + name: newt + labels: + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: baseline + pod-security.kubernetes.io/warn: baseline diff --git a/clusters/noble/apps/newt/values.yaml b/clusters/noble/apps/newt/values.yaml new file mode 100644 index 0000000..e238912 --- /dev/null +++ b/clusters/noble/apps/newt/values.yaml @@ -0,0 +1,26 @@ +# Newt — noble lab (Fossorial Helm chart) +# +# Credentials MUST come from a Secret — do not put endpoint/id/secret in git. +# +# kubectl apply -f clusters/noble/apps/newt/namespace.yaml +# kubectl -n newt create secret generic newt-pangolin-auth \ +# --from-literal=PANGOLIN_ENDPOINT='https://pangolin.example.com' \ +# --from-literal=NEWT_ID='...' \ +# --from-literal=NEWT_SECRET='...' +# +# helm repo add fossorial https://charts.fossorial.io +# helm upgrade --install newt fossorial/newt -n newt \ +# --version 1.2.0 -f clusters/noble/apps/newt/values.yaml --wait +# +# See README.md for Pangolin Integration API (domains + HTTP resources + CNAME). + +newtInstances: + - name: main-tunnel + enabled: true + replicas: 1 + auth: + existingSecretName: newt-pangolin-auth + keys: + endpointKey: PANGOLIN_ENDPOINT + idKey: NEWT_ID + secretKey: NEWT_SECRET diff --git a/clusters/noble/apps/traefik/README.md b/clusters/noble/apps/traefik/README.md new file mode 100644 index 0000000..51598b0 --- /dev/null +++ b/clusters/noble/apps/traefik/README.md @@ -0,0 +1,33 @@ +# Traefik — noble + +**Prerequisites:** **Cilium**, **MetalLB** (pool + L2), nodes **Ready**. + +1. 
Create the namespace (Pod Security **baseline** — Traefik needs more than **restricted**): + + ```bash + kubectl apply -f clusters/noble/apps/traefik/namespace.yaml + ``` + +2. Install the chart (**do not** use `--create-namespace` if the namespace already exists): + + ```bash + helm repo add traefik https://traefik.github.io/charts + helm repo update + helm upgrade --install traefik traefik/traefik \ + --namespace traefik \ + --version 39.0.6 \ + -f clusters/noble/apps/traefik/values.yaml \ + --wait + ``` + +3. Confirm the Service has a pool address. On the **LAN**, **`*.apps.noble.lab.pcenicni.dev`** can resolve to this IP (split horizon / local DNS). **Public** names go through **Pangolin + Newt** (CNAME + API), not ExternalDNS — see **`clusters/noble/apps/newt/README.md`**. + + ```bash + kubectl get svc -n traefik traefik + ``` + + Values pin **`192.168.50.211`** via **`metallb.io/loadBalancerIPs`**. **`192.168.50.210`** stays free for Argo CD. + +4. Create **Ingress** resources with **`ingressClassName: traefik`** (or rely on the default class). **TLS:** add **`cert-manager.io/cluster-issuer: letsencrypt-staging`** (or **`letsencrypt-prod`**) and **`tls`** hosts — see **`clusters/noble/apps/cert-manager/README.md`**. + +5. **Public DNS:** use **Newt** + Pangolin (**CNAME** at your DNS host + **Integration API** for resources/targets) — **`clusters/noble/apps/newt/README.md`**. diff --git a/clusters/noble/apps/traefik/namespace.yaml b/clusters/noble/apps/traefik/namespace.yaml new file mode 100644 index 0000000..b758f9a --- /dev/null +++ b/clusters/noble/apps/traefik/namespace.yaml @@ -0,0 +1,10 @@ +# Traefik controller — apply before Helm (omit --create-namespace on install). +# Ingress controller needs capabilities beyond "restricted"; use baseline. 
+apiVersion: v1 +kind: Namespace +metadata: + name: traefik + labels: + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/audit: baseline + pod-security.kubernetes.io/warn: baseline diff --git a/clusters/noble/apps/traefik/values.yaml b/clusters/noble/apps/traefik/values.yaml new file mode 100644 index 0000000..e74b28c --- /dev/null +++ b/clusters/noble/apps/traefik/values.yaml @@ -0,0 +1,29 @@ +# Traefik ingress controller — noble lab +# +# Chart: traefik/traefik — pin version on the helm command (e.g. 39.0.6). +# DNS: point *.apps.noble.lab.pcenicni.dev to the LoadBalancer IP below. +# +# kubectl apply -f clusters/noble/apps/traefik/namespace.yaml +# helm repo add traefik https://traefik.github.io/charts +# helm upgrade --install traefik traefik/traefik -n traefik \ +# --version 39.0.6 -f clusters/noble/apps/traefik/values.yaml --wait + +service: + type: LoadBalancer + annotations: + metallb.io/loadBalancerIPs: 192.168.50.211 + +ingressClass: + enabled: true + isDefaultClass: true + name: traefik + +# Ingress-only; Gateway API objects from the chart are not needed here. +gateway: + enabled: false + +gatewayClass: + enabled: false + +deployment: + replicas: 1 diff --git a/clusters/noble/bootstrap/argocd/README.md b/clusters/noble/bootstrap/argocd/README.md new file mode 100644 index 0000000..dfd2433 --- /dev/null +++ b/clusters/noble/bootstrap/argocd/README.md @@ -0,0 +1,52 @@ +# Argo CD — noble (bootstrap) + +**Prerequisites:** cluster **Ready**, **MetalLB** pool **`192.168.50.210`–`229`** (Argo CD uses **`192.168.50.210`**; Traefik **`192.168.50.211`**). + +## 1. Install + +```bash +helm repo add argo https://argoproj.github.io/argo-helm +helm repo update +helm upgrade --install argocd argo/argo-cd \ + --namespace argocd \ + --create-namespace \ + --version 9.4.17 \ + -f clusters/noble/bootstrap/argocd/values.yaml \ + --wait +``` + +## 2. 
UI / CLI address + +```bash +kubectl get svc -n argocd argocd-server +``` + +**LoadBalancer** should show **`192.168.50.210`**. Log in as **`admin`**; initial password: + +```bash +kubectl -n argocd get secret argocd-initial-admin-secret \ + -o jsonpath='{.data.password}' | base64 -d +echo +``` + +Change the password in the UI or via `argocd account update-password`. + +## 3. Register this repo (if private) + +Use **Settings → Repositories** in the UI, or `argocd repo add` / a `Secret` of type `repository`. + +## 4. App-of-apps (optional) + +1. Edit **`root-application.yaml`**: set **`repoURL`** and **`targetRevision`** to this repository. +2. Commit **`Application`** manifests under **`apps/`** (see **`apps/README.md`**). +3. Apply the root: + + ```bash + kubectl apply -f clusters/noble/bootstrap/argocd/root-application.yaml + ``` + +Until **`apps/`** contains valid **`Application`** resources, the root app may show **OutOfSync** or sync nothing — that is expected. + +## Versions + +Pinned in **`values.yaml`** comments (chart **9.4.17** / Argo CD **v3.3.6** at time of writing). Bump **`--version`** when upgrading. diff --git a/clusters/noble/bootstrap/argocd/apps/README.md b/clusters/noble/bootstrap/argocd/apps/README.md new file mode 100644 index 0000000..14e4af0 --- /dev/null +++ b/clusters/noble/bootstrap/argocd/apps/README.md @@ -0,0 +1,10 @@ +# Argo CD — app-of-apps children + +Add **`Application`** manifests here (one file per workload or group). The **`noble-root`** Application in the parent directory syncs this folder. + +Example patterns: + +- **Helm:** `spec.source` with `chart`, `repoURL` (Helm repo), and `helm.valueFiles` pointing at paths in the same git repo. +- **Kustomize / plain manifests:** `spec.source.path` to a directory of YAML. + +The historical **`clusters/noble/apps/*`** tree is written for **manual `helm upgrade`**; migrating each app to an Argo CD `Application` is optional follow-up work. 
diff --git a/clusters/noble/bootstrap/argocd/root-application.yaml b/clusters/noble/bootstrap/argocd/root-application.yaml new file mode 100644 index 0000000..79dcc72 --- /dev/null +++ b/clusters/noble/bootstrap/argocd/root-application.yaml @@ -0,0 +1,30 @@ +# App-of-apps root — apply after Argo CD is running. +# +# 1. Set spec.source.repoURL (and targetRevision) to this git repository. +# 2. kubectl apply -f clusters/noble/bootstrap/argocd/root-application.yaml +# +# Syncs **Application** YAMLs under **apps/** (add workloads there). Do **not** +# point at **clusters/noble/apps/** — that tree is Helm values for manual installs. +# +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: noble-root + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://gitea.pcenicni.ca/gsdavidp/home-server.git + targetRevision: main + path: clusters/noble/bootstrap/argocd/apps + destination: + server: https://kubernetes.default.svc + namespace: argocd + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/clusters/noble/bootstrap/argocd/values.yaml b/clusters/noble/bootstrap/argocd/values.yaml new file mode 100644 index 0000000..200e2fd --- /dev/null +++ b/clusters/noble/bootstrap/argocd/values.yaml @@ -0,0 +1,25 @@ +# Argo CD — noble lab (GitOps) +# +# Chart: argo/argo-cd — pin version on the helm command (e.g. 9.4.17). +# MetalLB: Argo CD UI/API uses pool IP **192.168.50.210** (Traefik stays **192.168.50.211**). 
+# +# helm repo add argo https://argoproj.github.io/argo-helm +# helm upgrade --install argocd argo/argo-cd -n argocd --create-namespace \ +# --version 9.4.17 -f clusters/noble/bootstrap/argocd/values.yaml --wait +# +# Initial admin password: kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d +# +# Optional: kubectl apply -f clusters/noble/bootstrap/argocd/root-application.yaml + +global: + domain: "" + +configs: + params: + server.insecure: false + +server: + service: + type: LoadBalancer + annotations: + metallb.io/loadBalancerIPs: 192.168.50.210 diff --git a/talos/CLUSTER-BUILD.md b/talos/CLUSTER-BUILD.md index ed2576d..05ad58a 100644 --- a/talos/CLUSTER-BUILD.md +++ b/talos/CLUSTER-BUILD.md @@ -9,7 +9,12 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - **MetalLB** Helm **0.15.3** / app **v0.15.3**; **IPAddressPool** `noble-l2` + **L2Advertisement** — pool **`192.168.50.210`–`192.168.50.229`**. - **kube-vip** DaemonSet **3/3** on control planes; VIP **`192.168.50.230`** on **`ens18`** (`vip_subnet` **`/32`** required — bare **`32`** breaks parsing). **Verified from workstation:** `kubectl config set-cluster noble --server=https://192.168.50.230:6443` then **`kubectl get --raw /healthz`** → **`ok`** (`talos/kubeconfig`; see `talos/README.md`). - **metrics-server** Helm **3.13.0** / app **v0.8.0** — `clusters/noble/apps/metrics-server/values.yaml` (`--kubelet-insecure-tls` for Talos); **`kubectl top nodes`** works. -- **Still open:** Longhorn, Traefik, cert-manager, Argo CD, observability — checklist below. +- **Longhorn** Helm **1.11.1** / app **v1.11.1** — `clusters/noble/apps/longhorn/` (PSA **privileged** namespace, `defaultDataPath` `/var/mnt/longhorn`, `preUpgradeChecker` enabled); **StorageClass** `longhorn` (default); **`nodes.longhorn.io`** all **Ready**; test **PVC** `Bound` on `longhorn`. 
+- **Traefik** Helm **39.0.6** / app **v3.6.11** — `clusters/noble/apps/traefik/`; **`Service`** **`LoadBalancer`** **`EXTERNAL-IP` `192.168.50.211`**; **`IngressClass`** **`traefik`** (default). Point **`*.apps.noble.lab.pcenicni.dev`** at **`192.168.50.211`**. MetalLB pool verification was done before replacing the temporary nginx test with Traefik. +- **cert-manager** Helm **v1.20.0** / app **v1.20.0** — `clusters/noble/apps/cert-manager/`; **`ClusterIssuer`** **`letsencrypt-staging`** and **`letsencrypt-prod`** (HTTP-01, ingress class **`traefik`**); ACME email **`certificates@noble.lab.pcenicni.dev`** (edit in manifests if you want a different mailbox). +- **Newt** Helm **1.2.0** / app **1.10.1** — `clusters/noble/apps/newt/` (**fossorial/newt**); Pangolin site tunnel — **`newt-pangolin-auth`** Secret (**`PANGOLIN_ENDPOINT`**, **`NEWT_ID`**, **`NEWT_SECRET`**). **Public DNS** is **not** automated with ExternalDNS: **CNAME** records at your DNS host per Pangolin’s domain instructions, plus **Integration API** for HTTP resources/targets — see **`clusters/noble/apps/newt/README.md`**. LAN access to Traefik can still use **`*.apps.noble.lab.pcenicni.dev`** → **`192.168.50.211`** (split horizon / local resolver). +- **Argo CD** Helm **9.4.17** / app **v3.3.6** — `clusters/noble/bootstrap/argocd/`; **`argocd-server`** **`LoadBalancer`** **`192.168.50.210`**; app-of-apps scaffold under **`bootstrap/argocd/apps/`** (edit **`root-application.yaml`** `repoURL` before applying). +- **Still open:** observability — checklist below. ## Inventory @@ -27,8 +32,9 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) | Kubernetes API VIP (kube-vip) | `192.168.50.230` (see `talos/README.md`; align with `talos/talconfig.yaml` `additionalApiServerCertSans`) | | MetalLB L2 pool | `192.168.50.210`–`192.168.50.229` | | Argo CD `LoadBalancer` | **Pick one IP** in the MetalLB pool (e.g. 
`192.168.50.210`) | -| Apps ingress DNS | `*.apps.noble.lab.pcenicni.dev` | -| ExternalDNS | Pangolin (map to supported ExternalDNS provider when documented) | +| Traefik (apps ingress) | `192.168.50.211` — **`metallb.io/loadBalancerIPs`** in `clusters/noble/apps/traefik/values.yaml` | +| Apps ingress (LAN / split horizon) | `*.apps.noble.lab.pcenicni.dev` → Traefik LB | +| Public DNS (Pangolin) | **Newt** tunnel + **CNAME** at registrar + **Integration API** — `clusters/noble/apps/newt/` | | Velero | S3-compatible URL — configure later | ## Versions @@ -39,6 +45,11 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - Cilium: **1.16.6** (Helm chart; see `clusters/noble/apps/cilium/README.md`) - MetalLB: **0.15.3** (Helm chart; app **v0.15.3**) - metrics-server: **3.13.0** (Helm chart; app **v0.8.0**) +- Longhorn: **1.11.1** (Helm chart; app **v1.11.1**) +- Traefik: **39.0.6** (Helm chart; app **v3.6.11**) +- cert-manager: **v1.20.0** (Helm chart; app **v1.20.0**) +- Newt (Fossorial): **1.2.0** (Helm chart; app **1.10.1**) +- Argo CD: **9.4.17** (Helm chart `argo/argo-cd`; app **v3.3.6**) ## Repo paths (this workspace) @@ -52,8 +63,12 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) | kube-vip (kustomize) | `clusters/noble/apps/kube-vip/` (`vip_interface` e.g. 
`ens18`) | | Cilium (Helm values) | `clusters/noble/apps/cilium/` — `values.yaml` (phase 1), optional `values-kpr.yaml`, `README.md` | | MetalLB | `clusters/noble/apps/metallb/` — `namespace.yaml` (PSA **privileged**), `ip-address-pool.yaml`, `kustomization.yaml`, `README.md` | -| Longhorn Helm values | `clusters/noble/apps/longhorn/values.yaml` | +| Longhorn | `clusters/noble/apps/longhorn/` — `values.yaml`, `namespace.yaml` (PSA **privileged**), `kustomization.yaml` | | metrics-server (Helm values) | `clusters/noble/apps/metrics-server/values.yaml` | +| Traefik (Helm values) | `clusters/noble/apps/traefik/` — `values.yaml`, `namespace.yaml`, `README.md` | +| cert-manager (Helm + ClusterIssuers) | `clusters/noble/apps/cert-manager/` — `values.yaml`, `namespace.yaml`, `kustomization.yaml`, `README.md` | +| Newt / Pangolin tunnel (Helm) | `clusters/noble/apps/newt/` — `values.yaml`, `namespace.yaml`, `README.md` | +| Argo CD (bootstrap + app-of-apps) | `clusters/noble/bootstrap/argocd/` — `values.yaml`, `root-application.yaml`, `apps/`, `README.md` | **Git vs cluster:** manifests and `talconfig` live in git; **`talhelper genconfig -o out`**, bootstrap, Helm, and `kubectl` run on your LAN. See **`talos/README.md`** for workstation reachability (lab LAN/VPN), **`talosctl kubeconfig`** vs Kubernetes `server:` (VIP vs node IP), and **`--insecure`** only in maintenance. 
@@ -91,17 +106,18 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - [x] **Cilium** (Helm **1.16.6**) — **required** before MetalLB if `cni: none` (`clusters/noble/apps/cilium/`) - [x] **metrics-server** — Helm **3.13.0**; values in `clusters/noble/apps/metrics-server/values.yaml`; verify `kubectl top nodes` -- [ ] **Longhorn** — Talos: `talconfig.with-longhorn.yaml` + `talos/README.md` §5; Helm: `clusters/noble/apps/longhorn/values.yaml` (`defaultDataPath` `/var/mnt/longhorn`) +- [x] **Longhorn** — Talos: user volume + kubelet mounts + extensions (`talos/README.md` §5); Helm **1.11.1**; `kubectl apply -k clusters/noble/apps/longhorn`; verify **`nodes.longhorn.io`** and test PVC **`Bound`** - [x] **MetalLB** — chart installed; **pool + L2** from `clusters/noble/apps/metallb/` applied (`192.168.50.210`–`229`) -- [ ] **`Service` `LoadBalancer`** test — assign an IP from `210`–`229` (e.g. dummy `LoadBalancer` or Traefik) -- [ ] **Traefik** `LoadBalancer` for `*.apps.noble.lab.pcenicni.dev` -- [ ] **cert-manager** + ClusterIssuer (staging → prod) -- [ ] **ExternalDNS** (Pangolin-compatible provider) +- [x] **`Service` `LoadBalancer`** / pool check — MetalLB assigns from `210`–`229` (validated before Traefik; temporary nginx test removed in favor of Traefik) +- [x] **Traefik** `LoadBalancer` for `*.apps.noble.lab.pcenicni.dev` — `clusters/noble/apps/traefik/`; **`192.168.50.211`** +- [x] **cert-manager** + ClusterIssuer (**`letsencrypt-staging`** / **`letsencrypt-prod`**) — `clusters/noble/apps/cert-manager/` +- [x] **Newt** (Pangolin tunnel; replaces ExternalDNS for public DNS) — `clusters/noble/apps/newt/` — **`newt-pangolin-auth`**; CNAME + **Integration API** per **`newt/README.md`** ## Phase C — GitOps -- [ ] **Argo CD** bootstrap (`clusters/noble/bootstrap/argocd`, root app) — path TBD when added -- [ ] Argo CD server **LoadBalancer** with dedicated pool IP +- [x] **Argo CD** bootstrap — `clusters/noble/bootstrap/argocd/` (`helm 
upgrade --install argocd …`) +- [x] Argo CD server **LoadBalancer** — **`192.168.50.210`** (see `values.yaml`) +- [ ] **App-of-apps** — set **`repoURL`** in **`root-application.yaml`**, add **`Application`** manifests under **`bootstrap/argocd/apps/`**, apply **`root-application.yaml`** - [ ] SSO — later ## Phase D — Observability @@ -129,9 +145,10 @@ This document is the **exported TODO** for the **noble** Talos cluster (4 nodes) - [x] `kubectl get nodes` — all **Ready** - [x] API via VIP `:6443` — **`kubectl get --raw /healthz`** → **`ok`** with kubeconfig **`server:`** `https://192.168.50.230:6443` -- [ ] Test `LoadBalancer` receives IP from `210`–`229` -- [ ] Sample Ingress + cert + ExternalDNS record -- [ ] PVC bound; Prometheus/Loki durable if configured +- [x] Ingress **`LoadBalancer`** in pool `210`–`229` (**Traefik** → **`192.168.50.211`**) +- [x] **Argo CD** UI — **`argocd-server`** **`LoadBalancer`** **`192.168.50.210`** (initial **`admin`** password from **`argocd-initial-admin-secret`**) +- [ ] Sample Ingress + cert (cert-manager ready) + Pangolin resource + CNAME +- [x] PVC **`Bound`** on **Longhorn** (`storageClassName: longhorn`); Prometheus/Loki durable when configured --- diff --git a/talos/talconfig.with-longhorn.yaml b/talos/talconfig.with-longhorn.yaml index b4d9244..2069860 100644 --- a/talos/talconfig.with-longhorn.yaml +++ b/talos/talconfig.with-longhorn.yaml @@ -10,10 +10,10 @@ # After changing schematic/extensions: regenerate configs, upgrade nodes with new installer image, then reboot if needed. # Helm must set defaultDataPath to /var/mnt/longhorn (see clusters/noble/apps/longhorn/values.yaml). # -# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer: -# factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -# After edits, run talhelper genconfig — `machine.install.image` in out/*.yaml should match this schematic (path may be metal-installer/ etc. 
on bare metal). -# Upgrade: talosctl upgrade --image -n +# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer — pinned per-node via `talosImageURL` +# (base URL only, no `:tag` — talhelper validates and appends `talosVersion`). +# After edits: `talhelper genconfig -o out` → `machine.install.image` in out/*.yaml. +# Cluster upgrade: `talosctl upgrade --image factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -n <node-ip> --wait` clusterName: noble talosVersion: v1.12.6 endpoint: https://192.168.50.230:6443 @@ -74,6 +74,10 @@ patches: name: none machine: kubelet: + # Avoid NodeIPController warnings when VIP / multiple addresses exist; pin to LAN. + nodeIP: + validSubnets: + - 192.168.50.0/24 extraMounts: - destination: /var/mnt/longhorn type: bind diff --git a/talos/talconfig.yaml b/talos/talconfig.yaml index b4d9244..2069860 100644 --- a/talos/talconfig.yaml +++ b/talos/talconfig.yaml @@ -10,10 +10,10 @@ # After changing schematic/extensions: regenerate configs, upgrade nodes with new installer image, then reboot if needed. # Helm must set defaultDataPath to /var/mnt/longhorn (see clusters/noble/apps/longhorn/values.yaml). # -# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer: -# factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -# After edits, run talhelper genconfig — `machine.install.image` in out/*.yaml should match this schematic (path may be metal-installer/ etc. on bare metal). -# Upgrade: talosctl upgrade --image -n +# Image Factory schematic (iscsi-tools + util-linux-tools), nocloud installer — pinned per-node via `talosImageURL` +# (base URL only, no `:tag` — talhelper validates and appends `talosVersion`). +# After edits: `talhelper genconfig -o out` → `machine.install.image` in out/*.yaml. 
+# Cluster upgrade: `talosctl upgrade --image factory.talos.dev/nocloud-installer/249d9135de54962744e917cfe654117000cba369f9152fbab9d055a00aa3664f:v1.12.6 -n <node-ip> --wait` clusterName: noble talosVersion: v1.12.6 endpoint: https://192.168.50.230:6443 @@ -74,6 +74,10 @@ patches: name: none machine: kubelet: + # Avoid NodeIPController warnings when VIP / multiple addresses exist; pin to LAN. + nodeIP: + validSubnets: + - 192.168.50.0/24 extraMounts: - destination: /var/mnt/longhorn type: bind