diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..3f77ae8 --- /dev/null +++ b/.env.sample @@ -0,0 +1,13 @@ +# Copy to **.env** in this repository root (`.env` is gitignored). +# Ansible **noble_cert_manager** role sources `.env` after cert-manager Helm install and creates +# **cert-manager/cloudflare-dns-api-token** when **CLOUDFLARE_DNS_API_TOKEN** is set. +# +# Cloudflare: Zone → DNS → Edit and Zone → Zone → Read for **pcenicni.dev** (see clusters/noble/apps/cert-manager/README.md). +CLOUDFLARE_DNS_API_TOKEN= + +# --- Optional: other deploy-time values (documented for manual use or future automation) --- + +# Pangolin / Newt — with **noble_newt_install=true**, Ansible creates **newt/newt-pangolin-auth** when all are set (see clusters/noble/apps/newt/README.md). +PANGOLIN_ENDPOINT= +NEWT_ID= +NEWT_SECRET= diff --git a/.gitignore b/.gitignore index eda42ff..aef5604 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ talos/out/ talos/kubeconfig # Local secrets -age-key.txt \ No newline at end of file +age-key.txt +.env \ No newline at end of file diff --git a/ansible/README.md b/ansible/README.md index bb78743..33bdf99 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -17,6 +17,10 @@ cd ansible ansible-playbook playbooks/deploy.yml ``` +## Deploy secrets (`.env`) + +Copy **`.env.sample`** to **`.env`** at the repository root (`.env` is gitignored). At minimum set **`CLOUDFLARE_DNS_API_TOKEN`** for cert-manager DNS-01. The **cert-manager** role applies it automatically during **`noble.yml`**. See **`.env.sample`** for optional placeholders (e.g. Newt/Pangolin). + ## Prerequisites - `talosctl` (matches node Talos version), `talhelper`, `helm`, `kubectl`. 
@@ -73,7 +77,7 @@ ansible-playbook playbooks/noble.yml --skip-tags newt |------|----------| | `talos_phase_a` | Talos genconfig, apply-config, bootstrap, kubeconfig | | `helm_repos` | `helm repo add` / `update` | -| `noble_*` | Cilium, metrics-server, Longhorn, MetalLB, kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack | +| `noble_*` | Cilium, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack | | `noble_post_deploy` | Post-install reminders | | `talos_bootstrap` | Genconfig-only (used by older playbook) | diff --git a/ansible/playbooks/noble.yml b/ansible/playbooks/noble.yml index 9479482..f459cb8 100644 --- a/ansible/playbooks/noble.yml +++ b/ansible/playbooks/noble.yml @@ -107,7 +107,7 @@ - --request-timeout=15s environment: KUBECONFIG: "{{ noble_kubeconfig }}" - register: noble_k8s_health + register: noble_k8s_health_first failed_when: false changed_when: false tags: [always] @@ -119,9 +119,9 @@ - noble_k8s_api_server_auto_fallback | default(true) | bool - noble_k8s_api_server_override | default('') | length == 0 - not (noble_skip_k8s_health_check | default(false) | bool) - - noble_k8s_health.rc != 0 or (noble_k8s_health.stdout | default('') | trim) != 'ok' - - ('network is unreachable' in (noble_k8s_health.stderr | default('') | lower)) or - ('no route to host' in (noble_k8s_health.stderr | default('') | lower)) + - (noble_k8s_health_first.rc | default(1)) != 0 or (noble_k8s_health_first.stdout | default('') | trim) != 'ok' + - ('network is unreachable' in (noble_k8s_health_first.stderr | default('') | lower)) or + ('no route to host' in (noble_k8s_health_first.stderr | default('') | lower)) block: - name: Ensure temp dir for kubeconfig auto-fallback ansible.builtin.file: @@ -174,16 +174,27 @@ - --request-timeout=15s environment: KUBECONFIG: "{{ noble_kubeconfig }}" - register: noble_k8s_health + register: noble_k8s_health_after_fallback failed_when: false 
changed_when: false + - name: Mark that API was re-checked after kubeconfig fallback + ansible.builtin.set_fact: + noble_k8s_api_fallback_used: true + + - name: Normalize API health result for preflight (scalars; avoids dict merge / set_fact stringification) + ansible.builtin.set_fact: + noble_k8s_health_rc: "{{ noble_k8s_health_after_fallback.rc | default(1) if (noble_k8s_api_fallback_used | default(false) | bool) else (noble_k8s_health_first.rc | default(1)) }}" + noble_k8s_health_stdout: "{{ noble_k8s_health_after_fallback.stdout | default('') if (noble_k8s_api_fallback_used | default(false) | bool) else (noble_k8s_health_first.stdout | default('')) }}" + noble_k8s_health_stderr: "{{ noble_k8s_health_after_fallback.stderr | default('') if (noble_k8s_api_fallback_used | default(false) | bool) else (noble_k8s_health_first.stderr | default('')) }}" + tags: [always] + - name: Fail when API check did not return ok ansible.builtin.fail: msg: "{{ lookup('template', 'templates/api_health_hint.j2') }}" when: - not (noble_skip_k8s_health_check | default(false) | bool) - - noble_k8s_health.rc != 0 or (noble_k8s_health.stdout | default('') | trim) != 'ok' + - (noble_k8s_health_rc | int) != 0 or (noble_k8s_health_stdout | default('') | trim) != 'ok' tags: [always] roles: diff --git a/ansible/playbooks/templates/api_health_hint.j2 b/ansible/playbooks/templates/api_health_hint.j2 index 53d157a..ccc8aed 100644 --- a/ansible/playbooks/templates/api_health_hint.j2 +++ b/ansible/playbooks/templates/api_health_hint.j2 @@ -1,9 +1,9 @@ {# Error output for noble.yml API preflight when kubectl /healthz fails #} Cannot use the Kubernetes API from this host (kubectl get --raw /healthz). 
-rc={{ noble_k8s_health.rc }} -stderr: {{ noble_k8s_health.stderr | default('') | trim }} +rc={{ noble_k8s_health_rc | default('n/a') }} +stderr: {{ noble_k8s_health_stderr | default('') | trim }} -{% set err = (noble_k8s_health.stderr | default('')) | lower %} +{% set err = (noble_k8s_health_stderr | default('')) | lower %} {% if 'connection refused' in err %} Connection refused: the TCP path to that host works, but nothing is accepting HTTPS on port 6443 there. • **Not bootstrapped yet?** Finish Talos first: `talosctl bootstrap` (once on a control plane), then `talosctl kubeconfig`, then confirm `kubectl get nodes`. See talos/README.md §2–§3 and CLUSTER-BUILD.md Phase A. **Do not run this playbook before the Kubernetes API exists.** diff --git a/ansible/roles/noble_cert_manager/defaults/main.yml b/ansible/roles/noble_cert_manager/defaults/main.yml new file mode 100644 index 0000000..0d73278 --- /dev/null +++ b/ansible/roles/noble_cert_manager/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# Warn when **cloudflare-dns-api-token** is missing after apply (also set in **group_vars/all.yml** when loaded). +noble_cert_manager_require_cloudflare_secret: true diff --git a/ansible/roles/noble_cert_manager/tasks/from_env.yml b/ansible/roles/noble_cert_manager/tasks/from_env.yml new file mode 100644 index 0000000..f4364ba --- /dev/null +++ b/ansible/roles/noble_cert_manager/tasks/from_env.yml @@ -0,0 +1,31 @@ +--- +# See repository **.env.sample** — copy to **.env** (gitignored). +- name: Stat repository .env for deploy secrets + ansible.builtin.stat: + path: "{{ noble_repo_root }}/.env" + register: noble_deploy_env_file + changed_when: false + +- name: Create cert-manager Cloudflare DNS secret from .env + ansible.builtin.shell: | + set -euo pipefail + set -a + . 
"{{ noble_repo_root }}/.env" + set +a + if [ -z "${CLOUDFLARE_DNS_API_TOKEN:-}" ]; then + echo NO_TOKEN + exit 0 + fi + kubectl -n cert-manager create secret generic cloudflare-dns-api-token \ + --from-literal=api-token="${CLOUDFLARE_DNS_API_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + echo APPLIED + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + args: + # Force bash: "set -o pipefail" is not POSIX and fails where /bin/sh is dash (error would be hidden by no_log) + executable: /bin/bash + when: noble_deploy_env_file.stat.exists | default(false) + no_log: true + register: noble_cf_secret_from_env + changed_when: "'APPLIED' in (noble_cf_secret_from_env.stdout | default(''))" diff --git a/ansible/roles/noble_cert_manager/tasks/main.yml b/ansible/roles/noble_cert_manager/tasks/main.yml index fd0a88c..be5bd02 100644 --- a/ansible/roles/noble_cert_manager/tasks/main.yml +++ b/ansible/roles/noble_cert_manager/tasks/main.yml @@ -29,6 +29,9 @@ KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true +- name: Apply secrets from repository .env (optional) + ansible.builtin.include_tasks: from_env.yml + - name: Check Cloudflare DNS API token Secret (required for ClusterIssuers) ansible.builtin.command: argv: @@ -50,7 +53,7 @@ Secret cert-manager/cloudflare-dns-api-token not found. Create it per clusters/noble/apps/cert-manager/README.md before ClusterIssuers can succeed. when: - - noble_cert_manager_require_cloudflare_secret | bool + - noble_cert_manager_require_cloudflare_secret | default(true) | bool - noble_cf_secret.rc != 0 - name: Apply ClusterIssuers (staging + prod) diff --git a/ansible/roles/noble_metallb/defaults/main.yml b/ansible/roles/noble_metallb/defaults/main.yml new file mode 100644 index 0000000..7b0c14b --- /dev/null +++ b/ansible/roles/noble_metallb/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# Helm **--wait** default is often too short when images pull slowly or nodes are busy. 
+noble_helm_metallb_wait_timeout: 20m diff --git a/ansible/roles/noble_metallb/tasks/main.yml b/ansible/roles/noble_metallb/tasks/main.yml index 5ef2a29..24fad06 100644 --- a/ansible/roles/noble_metallb/tasks/main.yml +++ b/ansible/roles/noble_metallb/tasks/main.yml @@ -21,6 +21,8 @@ - --namespace - metallb-system - --wait + - --timeout + - "{{ noble_helm_metallb_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true diff --git a/ansible/roles/noble_newt/defaults/main.yml b/ansible/roles/noble_newt/defaults/main.yml new file mode 100644 index 0000000..b95cae1 --- /dev/null +++ b/ansible/roles/noble_newt/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# Deploy Newt when true; needs the newt-pangolin-auth Secret (set PANGOLIN_ENDPOINT, NEWT_ID, NEWT_SECRET in repo .env, or create it manually). +noble_newt_install: true diff --git a/ansible/roles/noble_newt/tasks/from_env.yml b/ansible/roles/noble_newt/tasks/from_env.yml new file mode 100644 index 0000000..04118a7 --- /dev/null +++ b/ansible/roles/noble_newt/tasks/from_env.yml @@ -0,0 +1,33 @@ +--- +# See repository **.env.sample** — copy to **.env** (gitignored). +- name: Stat repository .env for deploy secrets + ansible.builtin.stat: + path: "{{ noble_repo_root }}/.env" + register: noble_deploy_env_file + changed_when: false + +- name: Create newt-pangolin-auth Secret from .env + ansible.builtin.shell: | + set -euo pipefail + set -a + . 
"{{ noble_repo_root }}/.env" + set +a + if [ -z "${PANGOLIN_ENDPOINT:-}" ] || [ -z "${NEWT_ID:-}" ] || [ -z "${NEWT_SECRET:-}" ]; then + echo NO_VARS + exit 0 + fi + kubectl -n newt create secret generic newt-pangolin-auth \ + --from-literal=PANGOLIN_ENDPOINT="${PANGOLIN_ENDPOINT}" \ + --from-literal=NEWT_ID="${NEWT_ID}" \ + --from-literal=NEWT_SECRET="${NEWT_SECRET}" \ + --dry-run=client -o yaml | kubectl apply -f - + echo APPLIED + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + args: + # Force bash: "set -o pipefail" is not POSIX and fails where /bin/sh is dash (error would be hidden by no_log) + executable: /bin/bash + when: noble_deploy_env_file.stat.exists | default(false) + no_log: true + register: noble_newt_secret_from_env + changed_when: "'APPLIED' in (noble_newt_secret_from_env.stdout | default(''))" diff --git a/ansible/roles/noble_newt/tasks/main.yml b/ansible/roles/noble_newt/tasks/main.yml index 4393504..3bde6a3 100644 --- a/ansible/roles/noble_newt/tasks/main.yml +++ b/ansible/roles/noble_newt/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Skip Newt when not enabled ansible.builtin.debug: - msg: "noble_newt_install is false — create newt-pangolin-auth Secret and set noble_newt_install=true to deploy Newt." + msg: "noble_newt_install is false — set PANGOLIN_ENDPOINT, NEWT_ID, NEWT_SECRET in repo .env (or create the Secret manually) and set noble_newt_install=true to deploy Newt." when: not (noble_newt_install | bool) - name: Create Newt namespace @@ -16,6 +16,10 @@ when: noble_newt_install | bool changed_when: true +- name: Apply Newt Pangolin auth Secret from repository .env (optional) + ansible.builtin.include_tasks: from_env.yml + when: noble_newt_install | bool + - name: Install Newt chart ansible.builtin.command: argv: diff --git a/clusters/noble/apps/cert-manager/README.md b/clusters/noble/apps/cert-manager/README.md index bbe68a9..1085df7 100644 --- a/clusters/noble/apps/cert-manager/README.md +++ b/clusters/noble/apps/cert-manager/README.md @@ -4,6 +4,10 @@ **ACME (Let’s Encrypt)** uses **DNS-01** via **Cloudflare** for zone **`pcenicni.dev`**. 
Create an API token with **Zone → DNS → Edit** and **Zone → Zone → Read** (or use the “Edit zone DNS” template), then: +**Option A — Ansible:** copy **`.env.sample`** to **`.env`** in the repo root, set **`CLOUDFLARE_DNS_API_TOKEN`**, run **`ansible/playbooks/noble.yml`** (or **`deploy.yml`**). The **cert-manager** role creates **cloudflare-dns-api-token** from `.env` after the chart installs. + +**Option B — kubectl:** + ```bash kubectl -n cert-manager create secret generic cloudflare-dns-api-token \ --from-literal=api-token='YOUR_CLOUDFLARE_API_TOKEN' \ diff --git a/clusters/noble/apps/metallb/README.md b/clusters/noble/apps/metallb/README.md index 93b6a34..3af5a74 100644 --- a/clusters/noble/apps/metallb/README.md +++ b/clusters/noble/apps/metallb/README.md @@ -32,7 +32,7 @@ Then restart MetalLB pods if they were failing (`kubectl get pods -n metallb-sys helm repo update helm upgrade --install metallb metallb/metallb \ --namespace metallb-system \ - --wait + --wait --timeout 20m ``` 2. Apply this folder’s pool and L2 advertisement: