Update .gitignore to include .env file and enhance README.md with instructions for deploying secrets. Refactor noble.yml to improve Kubernetes health check handling and update templates for error reporting. Modify cert-manager and metallb tasks to apply secrets from .env and adjust timeout settings. Clarify Newt installation requirements in tasks. These changes aim to streamline deployment processes and improve documentation clarity.

This commit is contained in:
Nikholas Pcenicni
2026-03-28 15:36:52 -04:00
parent 46cedc965f
commit a48ac16c14
15 changed files with 123 additions and 14 deletions

13
.env.sample Normal file
View File

@@ -0,0 +1,13 @@
# Copy to **.env** in this repository root (`.env` is gitignored).
# Ansible **noble_cert_manager** role sources `.env` after cert-manager Helm install and creates
# **cert-manager/cloudflare-dns-api-token** when **CLOUDFLARE_DNS_API_TOKEN** is set.
#
# Cloudflare: API token with Zone → DNS → Edit and Zone → Zone → Read for **pcenicni.dev** (see clusters/noble/apps/cert-manager/README.md).
CLOUDFLARE_DNS_API_TOKEN=
# --- Optional: other deploy-time values (documented for manual use or future automation) ---
# Pangolin / Newt — with **noble_newt_install=true**, Ansible creates **newt/newt-pangolin-auth** when all are set (see clusters/noble/apps/newt/README.md).
PANGOLIN_ENDPOINT=
NEWT_ID=
NEWT_SECRET=

3
.gitignore vendored
View File

@@ -4,4 +4,5 @@ talos/out/
talos/kubeconfig
# Local secrets
age-key.txt
age-key.txt
.env

View File

@@ -17,6 +17,10 @@ cd ansible
ansible-playbook playbooks/deploy.yml
```
## Deploy secrets (`.env`)
Copy **`.env.sample`** to **`.env`** at the repository root (`.env` is gitignored). At minimum set **`CLOUDFLARE_DNS_API_TOKEN`** for cert-manager DNS-01. The **cert-manager** role applies it automatically during **`noble.yml`**. See **`.env.sample`** for optional placeholders (e.g. Newt/Pangolin).
## Prerequisites
- `talosctl` (matches node Talos version), `talhelper`, `helm`, `kubectl`.
@@ -73,7 +77,7 @@ ansible-playbook playbooks/noble.yml --skip-tags newt
|------|----------|
| `talos_phase_a` | Talos genconfig, apply-config, bootstrap, kubeconfig |
| `helm_repos` | `helm repo add` / `update` |
| `noble_*` | Cilium, metrics-server, Longhorn, MetalLB, kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack |
| `noble_*` | Cilium, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack |
| `noble_post_deploy` | Post-install reminders |
| `talos_bootstrap` | Genconfig-only (used by older playbook) |

View File

@@ -107,7 +107,7 @@
- --request-timeout=15s
environment:
KUBECONFIG: "{{ noble_kubeconfig }}"
register: noble_k8s_health
register: noble_k8s_health_first
failed_when: false
changed_when: false
tags: [always]
@@ -119,9 +119,9 @@
- noble_k8s_api_server_auto_fallback | default(true) | bool
- noble_k8s_api_server_override | default('') | length == 0
- not (noble_skip_k8s_health_check | default(false) | bool)
- noble_k8s_health.rc != 0 or (noble_k8s_health.stdout | default('') | trim) != 'ok'
- ('network is unreachable' in (noble_k8s_health.stderr | default('') | lower)) or
('no route to host' in (noble_k8s_health.stderr | default('') | lower))
- (noble_k8s_health_first.rc | default(1)) != 0 or (noble_k8s_health_first.stdout | default('') | trim) != 'ok'
- ('network is unreachable' in (noble_k8s_health_first.stderr | default('') | lower)) or
('no route to host' in (noble_k8s_health_first.stderr | default('') | lower))
block:
- name: Ensure temp dir for kubeconfig auto-fallback
ansible.builtin.file:
@@ -174,16 +174,27 @@
- --request-timeout=15s
environment:
KUBECONFIG: "{{ noble_kubeconfig }}"
register: noble_k8s_health
register: noble_k8s_health_after_fallback
failed_when: false
changed_when: false
- name: Mark that API was re-checked after kubeconfig fallback
ansible.builtin.set_fact:
noble_k8s_api_fallback_used: true
- name: Normalize API health result for preflight (scalars; avoids dict merge / set_fact stringification)
ansible.builtin.set_fact:
noble_k8s_health_rc: "{{ noble_k8s_health_after_fallback.rc | default(1) if (noble_k8s_api_fallback_used | default(false) | bool) else (noble_k8s_health_first.rc | default(1)) }}"
noble_k8s_health_stdout: "{{ noble_k8s_health_after_fallback.stdout | default('') if (noble_k8s_api_fallback_used | default(false) | bool) else (noble_k8s_health_first.stdout | default('')) }}"
noble_k8s_health_stderr: "{{ noble_k8s_health_after_fallback.stderr | default('') if (noble_k8s_api_fallback_used | default(false) | bool) else (noble_k8s_health_first.stderr | default('')) }}"
tags: [always]
- name: Fail when API check did not return ok
ansible.builtin.fail:
msg: "{{ lookup('template', 'templates/api_health_hint.j2') }}"
when:
- not (noble_skip_k8s_health_check | default(false) | bool)
- noble_k8s_health.rc != 0 or (noble_k8s_health.stdout | default('') | trim) != 'ok'
- (noble_k8s_health_rc | int) != 0 or (noble_k8s_health_stdout | default('') | trim) != 'ok'
tags: [always]
roles:

View File

@@ -1,9 +1,9 @@
{# Error output for noble.yml API preflight when kubectl /healthz fails #}
Cannot use the Kubernetes API from this host (kubectl get --raw /healthz).
rc={{ noble_k8s_health.rc }}
stderr: {{ noble_k8s_health.stderr | default('') | trim }}
rc={{ noble_k8s_health_rc | default('n/a') }}
stderr: {{ noble_k8s_health_stderr | default('') | trim }}
{% set err = (noble_k8s_health.stderr | default('')) | lower %}
{% set err = (noble_k8s_health_stderr | default('')) | lower %}
{% if 'connection refused' in err %}
Connection refused: the TCP path to that host works, but nothing is accepting HTTPS on port 6443 there.
• **Not bootstrapped yet?** Finish Talos first: `talosctl bootstrap` (once on a control plane), then `talosctl kubeconfig`, then confirm `kubectl get nodes`. See talos/README.md §2§3 and CLUSTER-BUILD.md Phase A. **Do not run this playbook before the Kubernetes API exists.**

View File

@@ -0,0 +1,3 @@
---
# Warn when **cloudflare-dns-api-token** is missing after apply (also set in **group_vars/all.yml** when loaded).
# The role checks for the Secret after the optional `.env` secrets step and before it applies the ClusterIssuers.
noble_cert_manager_require_cloudflare_secret: true

View File

@@ -0,0 +1,28 @@
---
# Optional deploy-time secrets for cert-manager, read from the repository .env.
# See repository **.env.sample** — copy to **.env** (gitignored).

- name: Stat repository .env for deploy secrets
  ansible.builtin.stat:
    path: "{{ noble_repo_root }}/.env"
  register: noble_deploy_env_file
  changed_when: false

# Sources .env (`set -a` exports every variable it assigns) and applies the Secret
# cert-manager/cloudflare-dns-api-token with key `api-token`.
# Prints NO_TOKEN and exits 0 when CLOUDFLARE_DNS_API_TOKEN is empty or unset, so a
# missing token does not fail this task; prints APPLIED after a successful apply.
# `create --dry-run=client -o yaml | kubectl apply` keeps the step re-runnable when
# the Secret already exists.
# NOTE(review): --from-literal places the token on kubectl's command line, where it
# can be visible to other local users via the process list.
- name: Create cert-manager Cloudflare DNS secret from .env
  ansible.builtin.shell: |
    set -euo pipefail
    set -a
    . "{{ noble_repo_root }}/.env"
    set +a
    if [ -z "${CLOUDFLARE_DNS_API_TOKEN:-}" ]; then
      echo NO_TOKEN
      exit 0
    fi
    kubectl -n cert-manager create secret generic cloudflare-dns-api-token \
      --from-literal=api-token="${CLOUDFLARE_DNS_API_TOKEN}" \
      --dry-run=client -o yaml | kubectl apply -f -
    echo APPLIED
  args:
    # `set -o pipefail` is not available in every /bin/sh (the shell module's default
    # executable); e.g. dash rejects it and the task fails. Pin bash explicitly.
    executable: /bin/bash
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  when: noble_deploy_env_file.stat.exists | default(false)
  # The script handles a secret value; keep task input/output out of logs.
  no_log: true
  register: noble_cf_secret_from_env
  changed_when: "'APPLIED' in (noble_cf_secret_from_env.stdout | default(''))"

View File

@@ -29,6 +29,9 @@
KUBECONFIG: "{{ noble_kubeconfig }}"
changed_when: true
- name: Apply secrets from repository .env (optional)
ansible.builtin.include_tasks: from_env.yml
- name: Check Cloudflare DNS API token Secret (required for ClusterIssuers)
ansible.builtin.command:
argv:
@@ -50,7 +53,7 @@
Secret cert-manager/cloudflare-dns-api-token not found.
Create it per clusters/noble/apps/cert-manager/README.md before ClusterIssuers can succeed.
when:
- noble_cert_manager_require_cloudflare_secret | bool
- noble_cert_manager_require_cloudflare_secret | default(true) | bool
- noble_cf_secret.rc != 0
- name: Apply ClusterIssuers (staging + prod)

View File

@@ -0,0 +1,3 @@
---
# Helm **--wait** default is often too short when images pull slowly or nodes are busy.
# Passed as the **--timeout** value of the MetalLB Helm install (namespace metallb-system).
noble_helm_metallb_wait_timeout: 20m

View File

@@ -21,6 +21,8 @@
- --namespace
- metallb-system
- --wait
- --timeout
- "{{ noble_helm_metallb_wait_timeout }}"
environment:
KUBECONFIG: "{{ noble_kubeconfig }}"
changed_when: true

View File

@@ -0,0 +1,3 @@
---
# Enables the Newt tasks (namespace, auth Secret from .env, chart install).
# With this set to true the role applies the newt/newt-pangolin-auth Secret from the repository
# .env when PANGOLIN_ENDPOINT, NEWT_ID and NEWT_SECRET are all set; otherwise create the Secret
# manually (see role / cluster docs).
noble_newt_install: true

View File

@@ -0,0 +1,30 @@
---
# Optional deploy-time secrets for Newt / Pangolin, read from the repository .env.
# See repository **.env.sample** — copy to **.env** (gitignored).

- name: Stat repository .env for deploy secrets
  ansible.builtin.stat:
    path: "{{ noble_repo_root }}/.env"
  register: noble_deploy_env_file
  changed_when: false

# Sources .env (`set -a` exports every variable it assigns) and applies the Secret
# newt/newt-pangolin-auth with keys PANGOLIN_ENDPOINT, NEWT_ID and NEWT_SECRET.
# Prints NO_VARS and exits 0 unless all three variables are non-empty, so an
# incomplete .env does not fail this task; prints APPLIED after a successful apply.
# `create --dry-run=client -o yaml | kubectl apply` keeps the step re-runnable when
# the Secret already exists.
# NOTE(review): --from-literal places the values on kubectl's command line, where they
# can be visible to other local users via the process list.
- name: Create newt-pangolin-auth Secret from .env
  ansible.builtin.shell: |
    set -euo pipefail
    set -a
    . "{{ noble_repo_root }}/.env"
    set +a
    if [ -z "${PANGOLIN_ENDPOINT:-}" ] || [ -z "${NEWT_ID:-}" ] || [ -z "${NEWT_SECRET:-}" ]; then
      echo NO_VARS
      exit 0
    fi
    kubectl -n newt create secret generic newt-pangolin-auth \
      --from-literal=PANGOLIN_ENDPOINT="${PANGOLIN_ENDPOINT}" \
      --from-literal=NEWT_ID="${NEWT_ID}" \
      --from-literal=NEWT_SECRET="${NEWT_SECRET}" \
      --dry-run=client -o yaml | kubectl apply -f -
    echo APPLIED
  args:
    # `set -o pipefail` is not available in every /bin/sh (the shell module's default
    # executable); e.g. dash rejects it and the task fails. Pin bash explicitly.
    executable: /bin/bash
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  when: noble_deploy_env_file.stat.exists | default(false)
  # The script handles secret values; keep task input/output out of logs.
  no_log: true
  register: noble_newt_secret_from_env
  changed_when: "'APPLIED' in (noble_newt_secret_from_env.stdout | default(''))"

View File

@@ -1,7 +1,7 @@
---
- name: Skip Newt when not enabled
ansible.builtin.debug:
msg: "noble_newt_install is false — create newt-pangolin-auth Secret and set noble_newt_install=true to deploy Newt."
msg: "noble_newt_install is false — set PANGOLIN_ENDPOINT, NEWT_ID, NEWT_SECRET in repo .env (or create the Secret manually) and set noble_newt_install=true to deploy Newt."
when: not (noble_newt_install | bool)
- name: Create Newt namespace
@@ -16,6 +16,10 @@
when: noble_newt_install | bool
changed_when: true
- name: Apply Newt Pangolin auth Secret from repository .env (optional)
ansible.builtin.include_tasks: from_env.yml
when: noble_newt_install | bool
- name: Install Newt chart
ansible.builtin.command:
argv:

View File

@@ -4,6 +4,10 @@
**ACME (Let's Encrypt)** uses **DNS-01** via **Cloudflare** for zone **`pcenicni.dev`**. Create an API token with **Zone → DNS → Edit** and **Zone → Zone → Read** (or use the “Edit zone DNS” template), then:
**Option A — Ansible:** copy **`.env.sample`** to **`.env`** in the repo root, set **`CLOUDFLARE_DNS_API_TOKEN`**, run **`ansible/playbooks/noble.yml`** (or **`deploy.yml`**). The **cert-manager** role creates **cloudflare-dns-api-token** from `.env` after the chart installs.
**Option B — kubectl:**
```bash
kubectl -n cert-manager create secret generic cloudflare-dns-api-token \
--from-literal=api-token='YOUR_CLOUDFLARE_API_TOKEN' \

View File

@@ -32,7 +32,7 @@ Then restart MetalLB pods if they were failing (`kubectl get pods -n metallb-sys
helm repo update
helm upgrade --install metallb metallb/metallb \
--namespace metallb-system \
--wait
--wait --timeout 20m
```
2. Apply this folder's pool and L2 advertisement: