From 0e8eaa2f0d9697eaaf781d5f20bd1af6ec576603 Mon Sep 17 00:00:00 2001 From: Nikholas Pcenicni <82239765+nikpcenicni@users.noreply.github.com> Date: Sat, 28 Mar 2026 16:32:21 -0400 Subject: [PATCH] Update .gitignore to include generated noble-lab-ui-urls.md and enhance README.md with new role documentation. Refactor noble.yml to incorporate noble_landing_urls role for improved URL management. Add ingress configurations for alertmanager, prometheus, longhorn, and vault to support TLS termination via Traefik. Update network policies and values.yaml for vault to allow traffic from Traefik. These changes aim to streamline deployment and enhance service accessibility. --- .gitignore | 5 +- ansible/README.md | 1 + ansible/playbooks/noble.yml | 2 + .../noble_landing_urls/defaults/main.yml | 43 +++++++++++++++ .../tasks/fetch_credentials.yml | 55 +++++++++++++++++++ .../roles/noble_landing_urls/tasks/main.yml | 20 +++++++ .../templates/noble-lab-ui-urls.md.j2 | 50 +++++++++++++++++ .../roles/noble_platform/defaults/main.yml | 8 +++ ansible/roles/noble_platform/tasks/main.yml | 21 +++++++ .../apps/kube-prometheus-stack/values.yaml | 30 ++++++++++ clusters/noble/apps/longhorn/values.yaml | 13 +++++ .../apps/vault/cilium-network-policy.yaml | 7 +++ clusters/noble/apps/vault/values.yaml | 14 +++++ clusters/noble/bootstrap/argocd/README.md | 11 ++++ clusters/noble/bootstrap/argocd/values.yaml | 7 ++- 15 files changed, 284 insertions(+), 3 deletions(-) create mode 100644 ansible/roles/noble_landing_urls/defaults/main.yml create mode 100644 ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml create mode 100644 ansible/roles/noble_landing_urls/tasks/main.yml create mode 100644 ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 create mode 100644 ansible/roles/noble_platform/defaults/main.yml diff --git a/.gitignore b/.gitignore index aef5604..b99a881 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ talos/kubeconfig # Local secrets age-key.txt -.env 
\ No newline at end of file +.env + +# Generated by ansible noble_landing_urls +ansible/output/noble-lab-ui-urls.md \ No newline at end of file diff --git a/ansible/README.md b/ansible/README.md index 33bdf99..4378260 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -78,6 +78,7 @@ ansible-playbook playbooks/noble.yml --skip-tags newt | `talos_phase_a` | Talos genconfig, apply-config, bootstrap, kubeconfig | | `helm_repos` | `helm repo add` / `update` | | `noble_*` | Cilium, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack | +| `noble_landing_urls` | Writes **`ansible/output/noble-lab-ui-urls.md`** — URLs, service names, and (optional) Argo/Grafana passwords from Secrets | | `noble_post_deploy` | Post-install reminders | | `talos_bootstrap` | Genconfig-only (used by older playbook) | diff --git a/ansible/playbooks/noble.yml b/ansible/playbooks/noble.yml index f459cb8..1986799 100644 --- a/ansible/playbooks/noble.yml +++ b/ansible/playbooks/noble.yml @@ -224,3 +224,5 @@ tags: [kyverno_policies, policy] - role: noble_platform tags: [platform, observability, apps] + - role: noble_landing_urls + tags: [landing, platform, observability, apps] diff --git a/ansible/roles/noble_landing_urls/defaults/main.yml b/ansible/roles/noble_landing_urls/defaults/main.yml new file mode 100644 index 0000000..84c1fe8 --- /dev/null +++ b/ansible/roles/noble_landing_urls/defaults/main.yml @@ -0,0 +1,43 @@ +--- +# Regenerated when **noble_landing_urls** runs (after platform stack). Paths match Traefik + cert-manager Ingresses. +noble_landing_urls_dest: "{{ noble_repo_root }}/ansible/output/noble-lab-ui-urls.md" + +# When true, run kubectl against the cluster to fill Argo CD / Grafana passwords in the markdown (requires working kubeconfig). 
+noble_landing_urls_fetch_credentials: true + +noble_lab_ui_entries: + - name: Argo CD + description: GitOps UI (sync, apps, repos) + namespace: argocd + service: argocd-server + url: https://argo.apps.noble.lab.pcenicni.dev + - name: Grafana + description: Dashboards, Loki explore (logs) + namespace: monitoring + service: kube-prometheus-grafana + url: https://grafana.apps.noble.lab.pcenicni.dev + - name: Prometheus + description: Prometheus UI (queries, targets) — lab; protect in production + namespace: monitoring + service: kube-prometheus-kube-prome-prometheus + url: https://prometheus.apps.noble.lab.pcenicni.dev + - name: Alertmanager + description: Alertmanager UI (silences, status) + namespace: monitoring + service: kube-prometheus-kube-prome-alertmanager + url: https://alertmanager.apps.noble.lab.pcenicni.dev + - name: Headlamp + description: Kubernetes UI (cluster resources) + namespace: headlamp + service: headlamp + url: https://headlamp.apps.noble.lab.pcenicni.dev + - name: Longhorn + description: Storage volumes, nodes, backups + namespace: longhorn-system + service: longhorn-frontend + url: https://longhorn.apps.noble.lab.pcenicni.dev + - name: Vault + description: Secrets engine UI (after init/unseal) + namespace: vault + service: vault + url: https://vault.apps.noble.lab.pcenicni.dev diff --git a/ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml b/ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml new file mode 100644 index 0000000..8bfa4f4 --- /dev/null +++ b/ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml @@ -0,0 +1,55 @@ +--- +# Populates template variables from Secrets (no_log on kubectl to avoid leaking into Ansible stdout). 
+- name: Fetch Argo CD initial admin password (base64) + ansible.builtin.command: + argv: + - kubectl + - -n + - argocd + - get + - secret + - argocd-initial-admin-secret + - -o + - jsonpath={.data.password} + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_fetch_argocd_pw_b64 + failed_when: false  # best-effort: template prints manual commands when rc != 0 + changed_when: false + no_log: true + +- name: Fetch Grafana admin user (base64) + ansible.builtin.command: + argv: + - kubectl + - -n + - monitoring + - get + - secret + - kube-prometheus-grafana + - -o + - jsonpath={.data.admin-user} + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_fetch_grafana_user_b64 + failed_when: false  # best-effort: template prints manual commands when rc != 0 + changed_when: false + no_log: true + +- name: Fetch Grafana admin password (base64) + ansible.builtin.command: + argv: + - kubectl + - -n + - monitoring + - get + - secret + - kube-prometheus-grafana + - -o + - jsonpath={.data.admin-password} + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_fetch_grafana_pw_b64 + failed_when: false  # best-effort: template prints manual commands when rc != 0 + changed_when: false + no_log: true diff --git a/ansible/roles/noble_landing_urls/tasks/main.yml b/ansible/roles/noble_landing_urls/tasks/main.yml new file mode 100644 index 0000000..aff57dd --- /dev/null +++ b/ansible/roles/noble_landing_urls/tasks/main.yml @@ -0,0 +1,20 @@ +--- +- name: Ensure output directory for generated landing page + ansible.builtin.file: + path: "{{ noble_repo_root }}/ansible/output" + state: directory + mode: "0755" + +- name: Fetch initial credentials from cluster Secrets (optional) + ansible.builtin.include_tasks: fetch_credentials.yml + when: noble_landing_urls_fetch_credentials | default(true) | bool + +- name: Write noble lab UI URLs (markdown landing page) + ansible.builtin.template: + src: noble-lab-ui-urls.md.j2 + dest: "{{ noble_landing_urls_dest }}" + mode: "0600"  # owner-only: rendered file may contain passwords decoded from Secrets + +- name: Show landing page path + ansible.builtin.debug: + msg: "Noble lab UI list written to {{ noble_landing_urls_dest }}" diff --git 
a/ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 b/ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 new file mode 100644 index 0000000..e7ca91f --- /dev/null +++ b/ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 @@ -0,0 +1,50 @@ +# Noble lab — web UIs (LAN) + +> **Sensitive:** This file may include **passwords read from Kubernetes Secrets** when credential fetch ran. It is **gitignored** — do not commit or share. + +**DNS:** point **`*.apps.noble.lab.pcenicni.dev`** at the Traefik **LoadBalancer** (MetalLB **`192.168.50.211`** by default — see `clusters/noble/apps/traefik/values.yaml`). + +**TLS:** **cert-manager** + **`letsencrypt-prod`** on each Ingress (public **DNS-01** for **`pcenicni.dev`**). + +This file is **generated** by Ansible (`noble_landing_urls` role). Use it as a temporary landing page to find services after deploy. + +| UI | What | Kubernetes service | Namespace | URL | +|----|------|----------------------|-----------|-----| +{% for e in noble_lab_ui_entries %} +| {{ e.name }} | {{ e.description }} | `{{ e.service }}` | `{{ e.namespace }}` | [{{ e.url }}]({{ e.url }}) | +{% endfor %} + +## Initial access (logins) + +| App | Username / identity | Password / secret | +|-----|---------------------|-------------------| +| **Argo CD** | `admin` | {% if (noble_fetch_argocd_pw_b64 is defined) and (noble_fetch_argocd_pw_b64.rc | default(1) == 0) and (noble_fetch_argocd_pw_b64.stdout | default('') | length > 0) %}`{{ noble_fetch_argocd_pw_b64.stdout | b64decode }}`{% else %}*(not fetched — use commands below)*{% endif %} | +| **Grafana** | {% if (noble_fetch_grafana_user_b64 is defined) and (noble_fetch_grafana_user_b64.rc | default(1) == 0) and (noble_fetch_grafana_user_b64.stdout | default('') | length > 0) %}`{{ noble_fetch_grafana_user_b64.stdout | b64decode }}`{% else %}*(from Secret — use commands below)*{% endif %} | {% if (noble_fetch_grafana_pw_b64 is defined) and 
(noble_fetch_grafana_pw_b64.rc | default(1) == 0) and (noble_fetch_grafana_pw_b64.stdout | default('') | length > 0) %}`{{ noble_fetch_grafana_pw_b64.stdout | b64decode }}`{% else %}*(not fetched — use commands below)*{% endif %} | +| **Headlamp** | ServiceAccount token | No fixed password. Sign in with a SA token, or configure OIDC — `clusters/noble/apps/headlamp/README.md`. | +| **Prometheus** | — | No auth in default install (lab). | +| **Alertmanager** | — | No auth in default install (lab). | +| **Longhorn** | — | No default login unless you enable access control in the UI settings. | +| **Vault** | Token | Root token is only from **`vault operator init`** (not stored in git). See `clusters/noble/apps/vault/README.md`. | + +### Commands to retrieve passwords (if not filled above) + +```bash +# Argo CD initial admin (Secret removed after you change password) +kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d +echo + +# Grafana admin user / password +kubectl -n monitoring get secret kube-prometheus-grafana -o jsonpath='{.data.admin-user}' | base64 -d +echo +kubectl -n monitoring get secret kube-prometheus-grafana -o jsonpath='{.data.admin-password}' | base64 -d +echo +``` + +To generate this file **without** calling kubectl, run Ansible with **`-e noble_landing_urls_fetch_credentials=false`**. + +## Notes + +- **Argo CD** `argocd-initial-admin-secret` disappears after you change the admin password. +- **Grafana** password is random unless you set `grafana.adminPassword` in chart values. +- **Vault** UI needs **unsealed** Vault; tokens come from your chosen auth method. +- **Prometheus / Alertmanager** UIs are unauthenticated by default — restrict when hardening (`talos/CLUSTER-BUILD.md` Phase G). 
diff --git a/ansible/roles/noble_platform/defaults/main.yml b/ansible/roles/noble_platform/defaults/main.yml new file mode 100644 index 0000000..0e72b05 --- /dev/null +++ b/ansible/roles/noble_platform/defaults/main.yml @@ -0,0 +1,8 @@ +--- +# kubectl apply -k can hit transient etcd timeouts under load; retries + longer API deadline help. +noble_platform_kubectl_request_timeout: 120s +noble_platform_kustomize_retries: 5 +noble_platform_kustomize_delay: 20 + +# Vault: injector (vault-k8s) owns MutatingWebhookConfiguration.caBundle; Helm upgrade can SSA-conflict. Delete webhook so Helm can recreate it. +noble_vault_delete_injector_webhook_before_helm: true diff --git a/ansible/roles/noble_platform/tasks/main.yml b/ansible/roles/noble_platform/tasks/main.yml index 5bc6f08..802344e 100644 --- a/ansible/roles/noble_platform/tasks/main.yml +++ b/ansible/roles/noble_platform/tasks/main.yml @@ -5,10 +5,15 @@ argv: - kubectl - apply + - "--request-timeout={{ noble_platform_kubectl_request_timeout }}" - -k - "{{ noble_repo_root }}/clusters/noble/apps" environment: KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_platform_kustomize + retries: "{{ noble_platform_kustomize_retries | int }}" + delay: "{{ noble_platform_kustomize_delay | int }}" + until: noble_platform_kustomize.rc == 0 changed_when: true - name: Install Sealed Secrets @@ -49,6 +54,21 @@ KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true +# vault-k8s patches webhook CA after install; Helm 3/4 SSA then conflicts on upgrade. Removing the MWC lets Helm re-apply cleanly; injector repopulates caBundle. 
+- name: Delete Vault agent injector MutatingWebhookConfiguration before Helm (avoids caBundle field conflict) + ansible.builtin.command: + argv: + - kubectl + - delete + - mutatingwebhookconfiguration + - vault-agent-injector-cfg + - --ignore-not-found + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_vault_mwc_delete + when: noble_vault_delete_injector_webhook_before_helm | default(true) | bool + changed_when: "'deleted' in (noble_vault_mwc_delete.stdout | default(''))" + - name: Install Vault ansible.builtin.command: argv: @@ -66,6 +86,7 @@ - --wait environment: KUBECONFIG: "{{ noble_kubeconfig }}" + HELM_SERVER_SIDE_APPLY: "false" changed_when: true - name: Install kube-prometheus-stack diff --git a/clusters/noble/apps/kube-prometheus-stack/values.yaml b/clusters/noble/apps/kube-prometheus-stack/values.yaml index de02707..9dc9077 100644 --- a/clusters/noble/apps/kube-prometheus-stack/values.yaml +++ b/clusters/noble/apps/kube-prometheus-stack/values.yaml @@ -35,6 +35,20 @@ alertmanager: resources: requests: storage: 5Gi + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - alertmanager.apps.noble.lab.pcenicni.dev + paths: + - / + pathType: Prefix + tls: + - secretName: alertmanager-apps-noble-tls + hosts: + - alertmanager.apps.noble.lab.pcenicni.dev prometheus: prometheusSpec: @@ -48,6 +62,20 @@ prometheus: resources: requests: storage: 30Gi + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - prometheus.apps.noble.lab.pcenicni.dev + paths: + - / + pathType: Prefix + tls: + - secretName: prometheus-apps-noble-tls + hosts: + - prometheus.apps.noble.lab.pcenicni.dev grafana: persistence: @@ -78,5 +106,7 @@ grafana: server: domain: grafana.apps.noble.lab.pcenicni.dev root_url: https://grafana.apps.noble.lab.pcenicni.dev/ + # Traefik sets X-Forwarded-*; required for correct redirects 
and cookies behind the ingress. + use_proxy_headers: true # Loki datasource: apply `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml` (sidecar ConfigMap) instead of additionalDataSources here. diff --git a/clusters/noble/apps/longhorn/values.yaml b/clusters/noble/apps/longhorn/values.yaml index 69a34a0..2af82f8 100644 --- a/clusters/noble/apps/longhorn/values.yaml +++ b/clusters/noble/apps/longhorn/values.yaml @@ -16,6 +16,19 @@ defaultSettings: # Default 30% reserved often makes small data disks look "full" to the scheduler. storageReservedPercentageForDefaultDisk: "10" +# Longhorn UI — same *.apps.noble.lab.pcenicni.dev pattern as Grafana / Headlamp (Traefik LB → cert-manager TLS). +ingress: + enabled: true + ingressClassName: traefik + host: longhorn.apps.noble.lab.pcenicni.dev + path: / + pathType: Prefix + tls: true + tlsSecret: longhorn-apps-noble-tls + secureBackends: false + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # Pre-upgrade Job: keep enabled for normal Helm upgrades (disable only if GitOps sync fights the Job). 
preUpgradeChecker: jobEnabled: true diff --git a/clusters/noble/apps/vault/cilium-network-policy.yaml b/clusters/noble/apps/vault/cilium-network-policy.yaml index 9a910eb..c381899 100644 --- a/clusters/noble/apps/vault/cilium-network-policy.yaml +++ b/clusters/noble/apps/vault/cilium-network-policy.yaml @@ -24,6 +24,13 @@ spec: - ports: - port: "8200" protocol: TCP + - fromEndpoints: + - matchLabels: + "k8s:io.kubernetes.pod.namespace": traefik + toPorts: + - ports: + - port: "8200" + protocol: TCP - fromEndpoints: - matchLabels: "k8s:io.kubernetes.pod.namespace": vault diff --git a/clusters/noble/apps/vault/values.yaml b/clusters/noble/apps/vault/values.yaml index 4c179e5..9bf9945 100644 --- a/clusters/noble/apps/vault/values.yaml +++ b/clusters/noble/apps/vault/values.yaml @@ -44,5 +44,19 @@ server: path: "/v1/sys/health?uninitcode=204&sealedcode=204&standbyok=true" port: 8200 + # LAN: TLS terminates at Traefik + cert-manager; listener stays HTTP (global.tlsDisable). + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - host: vault.apps.noble.lab.pcenicni.dev + paths: [] + tls: + - secretName: vault-apps-noble-tls + hosts: + - vault.apps.noble.lab.pcenicni.dev + ui: enabled: true diff --git a/clusters/noble/bootstrap/argocd/README.md b/clusters/noble/bootstrap/argocd/README.md index edae846..8b7dec4 100644 --- a/clusters/noble/bootstrap/argocd/README.md +++ b/clusters/noble/bootstrap/argocd/README.md @@ -35,6 +35,17 @@ echo Change the password in the UI or via `argocd account update-password`. +### TLS: changing ClusterIssuer (e.g. staging → prod) + +If **`helm upgrade --wait`** fails with *Secret was previously issued by `letsencrypt-staging`* (or another issuer), cert-manager will not replace the TLS Secret in place. 
Remove the old cert material once, then upgrade again: + +```bash +kubectl -n argocd delete certificate argocd-server --ignore-not-found +kubectl -n argocd delete secret argocd-server-tls --ignore-not-found +helm upgrade --install argocd argo/argo-cd -n argocd --create-namespace \ + --version 9.4.17 -f clusters/noble/bootstrap/argocd/values.yaml --wait +``` + ## 3. Register this repo (if private) Use **Settings → Repositories** in the UI, or `argocd repo add` / a `Secret` of type `repository`. diff --git a/clusters/noble/bootstrap/argocd/values.yaml b/clusters/noble/bootstrap/argocd/values.yaml index b606dab..1055da1 100644 --- a/clusters/noble/bootstrap/argocd/values.yaml +++ b/clusters/noble/bootstrap/argocd/values.yaml @@ -32,17 +32,20 @@ server: certificate: enabled: true domain: argo.apps.noble.lab.pcenicni.dev + # If you change issuer.name, delete Certificate/Secret once so cert-manager can re-issue (see README.md). issuer: group: cert-manager.io kind: ClusterIssuer - name: letsencrypt-staging + name: letsencrypt-prod ingress: enabled: true ingressClassName: traefik hostname: argo.apps.noble.lab.pcenicni.dev tls: true - annotations: {} + # Traefik terminates TLS; Argo serves HTTP/2 cleartext (insecure). Without h2c, UI/API can 404 or fail gRPC. + annotations: + traefik.ingress.kubernetes.io/service.serversscheme: h2c service: type: ClusterIP