diff --git a/.gitignore b/.gitignore index aef5604..b99a881 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ talos/kubeconfig # Local secrets age-key.txt -.env \ No newline at end of file +.env + +# Generated by ansible noble_landing_urls +ansible/output/noble-lab-ui-urls.md \ No newline at end of file diff --git a/ansible/README.md b/ansible/README.md index 33bdf99..4378260 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -78,6 +78,7 @@ ansible-playbook playbooks/noble.yml --skip-tags newt | `talos_phase_a` | Talos genconfig, apply-config, bootstrap, kubeconfig | | `helm_repos` | `helm repo add` / `update` | | `noble_*` | Cilium, metrics-server, Longhorn, MetalLB (20m Helm wait), kube-vip, Traefik, cert-manager, Newt, Argo CD, Kyverno, platform stack | +| `noble_landing_urls` | Writes **`ansible/output/noble-lab-ui-urls.md`** — URLs, service names, and (optional) Argo/Grafana passwords from Secrets | | `noble_post_deploy` | Post-install reminders | | `talos_bootstrap` | Genconfig-only (used by older playbook) | diff --git a/ansible/playbooks/noble.yml b/ansible/playbooks/noble.yml index f459cb8..1986799 100644 --- a/ansible/playbooks/noble.yml +++ b/ansible/playbooks/noble.yml @@ -224,3 +224,5 @@ tags: [kyverno_policies, policy] - role: noble_platform tags: [platform, observability, apps] + - role: noble_landing_urls + tags: [landing, platform, observability, apps] diff --git a/ansible/roles/noble_landing_urls/defaults/main.yml b/ansible/roles/noble_landing_urls/defaults/main.yml new file mode 100644 index 0000000..84c1fe8 --- /dev/null +++ b/ansible/roles/noble_landing_urls/defaults/main.yml @@ -0,0 +1,43 @@ +--- +# Regenerated when **noble_landing_urls** runs (after platform stack). Paths match Traefik + cert-manager Ingresses. +noble_landing_urls_dest: "{{ noble_repo_root }}/ansible/output/noble-lab-ui-urls.md" + +# When true, run kubectl against the cluster to fill Argo CD / Grafana passwords in the markdown (requires working kubeconfig). 
+noble_landing_urls_fetch_credentials: true + +noble_lab_ui_entries: + - name: Argo CD + description: GitOps UI (sync, apps, repos) + namespace: argocd + service: argocd-server + url: https://argo.apps.noble.lab.pcenicni.dev + - name: Grafana + description: Dashboards, Loki explore (logs) + namespace: monitoring + service: kube-prometheus-grafana + url: https://grafana.apps.noble.lab.pcenicni.dev + - name: Prometheus + description: Prometheus UI (queries, targets) — lab; protect in production + namespace: monitoring + service: kube-prometheus-kube-prome-prometheus + url: https://prometheus.apps.noble.lab.pcenicni.dev + - name: Alertmanager + description: Alertmanager UI (silences, status) + namespace: monitoring + service: kube-prometheus-kube-prome-alertmanager + url: https://alertmanager.apps.noble.lab.pcenicni.dev + - name: Headlamp + description: Kubernetes UI (cluster resources) + namespace: headlamp + service: headlamp + url: https://headlamp.apps.noble.lab.pcenicni.dev + - name: Longhorn + description: Storage volumes, nodes, backups + namespace: longhorn-system + service: longhorn-frontend + url: https://longhorn.apps.noble.lab.pcenicni.dev + - name: Vault + description: Secrets engine UI (after init/unseal) + namespace: vault + service: vault + url: https://vault.apps.noble.lab.pcenicni.dev diff --git a/ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml b/ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml new file mode 100644 index 0000000..8bfa4f4 --- /dev/null +++ b/ansible/roles/noble_landing_urls/tasks/fetch_credentials.yml @@ -0,0 +1,55 @@ +--- +# Populates template variables from Secrets (no_log on kubectl to avoid leaking into Ansible stdout). 
+- name: Fetch Argo CD initial admin password (base64)
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - -n
+      - argocd
+      - get
+      - secret
+      - argocd-initial-admin-secret
+      - -o
+      - jsonpath={.data.password}
+  environment:
+    KUBECONFIG: "{{ noble_kubeconfig }}"
+  register: noble_fetch_argocd_pw_b64
+  failed_when: false
+  changed_when: false
+  no_log: true
+
+- name: Fetch Grafana admin user (base64)
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - -n
+      - monitoring
+      - get
+      - secret
+      - kube-prometheus-grafana
+      - -o
+      - jsonpath={.data.admin-user}
+  environment:
+    KUBECONFIG: "{{ noble_kubeconfig }}"
+  register: noble_fetch_grafana_user_b64
+  failed_when: false
+  changed_when: false
+  no_log: true
+
+- name: Fetch Grafana admin password (base64)
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - -n
+      - monitoring
+      - get
+      - secret
+      - kube-prometheus-grafana
+      - -o
+      - jsonpath={.data.admin-password}
+  environment:
+    KUBECONFIG: "{{ noble_kubeconfig }}"
+  register: noble_fetch_grafana_pw_b64
+  failed_when: false
+  changed_when: false
+  no_log: true
diff --git a/ansible/roles/noble_landing_urls/tasks/main.yml b/ansible/roles/noble_landing_urls/tasks/main.yml
new file mode 100644
index 0000000..aff57dd
--- /dev/null
+++ b/ansible/roles/noble_landing_urls/tasks/main.yml
@@ -0,0 +1,24 @@
+---
+# NOTE: this path is fixed; if noble_landing_urls_dest is overridden to another
+# directory, that directory must already exist (it is not created here).
+- name: Ensure output directory for generated landing page
+  ansible.builtin.file:
+    path: "{{ noble_repo_root }}/ansible/output"
+    state: directory
+    mode: "0755"
+
+- name: Fetch initial credentials from cluster Secrets (optional)
+  ansible.builtin.include_tasks: fetch_credentials.yml
+  when: noble_landing_urls_fetch_credentials | default(true) | bool
+
+# Owner-only (0600): the rendered markdown may embed decoded Argo CD / Grafana
+# passwords, so it must not be readable by other local users.
+- name: Write noble lab UI URLs (markdown landing page)
+  ansible.builtin.template:
+    src: noble-lab-ui-urls.md.j2
+    dest: "{{ noble_landing_urls_dest }}"
+    mode: "0600"
+
+- name: Show landing page path
+  ansible.builtin.debug:
+    msg: "Noble lab UI list written to {{ noble_landing_urls_dest }}"
diff --git 
a/ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 b/ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 new file mode 100644 index 0000000..e7ca91f --- /dev/null +++ b/ansible/roles/noble_landing_urls/templates/noble-lab-ui-urls.md.j2 @@ -0,0 +1,50 @@ +# Noble lab — web UIs (LAN) + +> **Sensitive:** This file may include **passwords read from Kubernetes Secrets** when credential fetch ran. It is **gitignored** — do not commit or share. + +**DNS:** point **`*.apps.noble.lab.pcenicni.dev`** at the Traefik **LoadBalancer** (MetalLB **`192.168.50.211`** by default — see `clusters/noble/apps/traefik/values.yaml`). + +**TLS:** **cert-manager** + **`letsencrypt-prod`** on each Ingress (public **DNS-01** for **`pcenicni.dev`**). + +This file is **generated** by Ansible (`noble_landing_urls` role). Use it as a temporary landing page to find services after deploy. + +| UI | What | Kubernetes service | Namespace | URL | +|----|------|----------------------|-----------|-----| +{% for e in noble_lab_ui_entries %} +| {{ e.name }} | {{ e.description }} | `{{ e.service }}` | `{{ e.namespace }}` | [{{ e.url }}]({{ e.url }}) | +{% endfor %} + +## Initial access (logins) + +| App | Username / identity | Password / secret | +|-----|---------------------|-------------------| +| **Argo CD** | `admin` | {% if (noble_fetch_argocd_pw_b64 is defined) and (noble_fetch_argocd_pw_b64.rc | default(1) == 0) and (noble_fetch_argocd_pw_b64.stdout | default('') | length > 0) %}`{{ noble_fetch_argocd_pw_b64.stdout | b64decode }}`{% else %}*(not fetched — use commands below)*{% endif %} | +| **Grafana** | {% if (noble_fetch_grafana_user_b64 is defined) and (noble_fetch_grafana_user_b64.rc | default(1) == 0) and (noble_fetch_grafana_user_b64.stdout | default('') | length > 0) %}`{{ noble_fetch_grafana_user_b64.stdout | b64decode }}`{% else %}*(from Secret — use commands below)*{% endif %} | {% if (noble_fetch_grafana_pw_b64 is defined) and 
(noble_fetch_grafana_pw_b64.rc | default(1) == 0) and (noble_fetch_grafana_pw_b64.stdout | default('') | length > 0) %}`{{ noble_fetch_grafana_pw_b64.stdout | b64decode }}`{% else %}*(not fetched — use commands below)*{% endif %} | +| **Headlamp** | ServiceAccount token | No fixed password. Sign in with a SA token, or configure OIDC — `clusters/noble/apps/headlamp/README.md`. | +| **Prometheus** | — | No auth in default install (lab). | +| **Alertmanager** | — | No auth in default install (lab). | +| **Longhorn** | — | No default login unless you enable access control in the UI settings. | +| **Vault** | Token | Root token is only from **`vault operator init`** (not stored in git). See `clusters/noble/apps/vault/README.md`. | + +### Commands to retrieve passwords (if not filled above) + +```bash +# Argo CD initial admin (Secret removed after you change password) +kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d +echo + +# Grafana admin user / password +kubectl -n monitoring get secret kube-prometheus-grafana -o jsonpath='{.data.admin-user}' | base64 -d +echo +kubectl -n monitoring get secret kube-prometheus-grafana -o jsonpath='{.data.admin-password}' | base64 -d +echo +``` + +To generate this file **without** calling kubectl, run Ansible with **`-e noble_landing_urls_fetch_credentials=false`**. + +## Notes + +- **Argo CD** `argocd-initial-admin-secret` disappears after you change the admin password. +- **Grafana** password is random unless you set `grafana.adminPassword` in chart values. +- **Vault** UI needs **unsealed** Vault; tokens come from your chosen auth method. +- **Prometheus / Alertmanager** UIs are unauthenticated by default — restrict when hardening (`talos/CLUSTER-BUILD.md` Phase G). 
diff --git a/ansible/roles/noble_platform/defaults/main.yml b/ansible/roles/noble_platform/defaults/main.yml new file mode 100644 index 0000000..0e72b05 --- /dev/null +++ b/ansible/roles/noble_platform/defaults/main.yml @@ -0,0 +1,8 @@ +--- +# kubectl apply -k can hit transient etcd timeouts under load; retries + longer API deadline help. +noble_platform_kubectl_request_timeout: 120s +noble_platform_kustomize_retries: 5 +noble_platform_kustomize_delay: 20 + +# Vault: injector (vault-k8s) owns MutatingWebhookConfiguration.caBundle; Helm upgrade can SSA-conflict. Delete webhook so Helm can recreate it. +noble_vault_delete_injector_webhook_before_helm: true diff --git a/ansible/roles/noble_platform/tasks/main.yml b/ansible/roles/noble_platform/tasks/main.yml index 5bc6f08..802344e 100644 --- a/ansible/roles/noble_platform/tasks/main.yml +++ b/ansible/roles/noble_platform/tasks/main.yml @@ -5,10 +5,15 @@ argv: - kubectl - apply + - "--request-timeout={{ noble_platform_kubectl_request_timeout }}" - -k - "{{ noble_repo_root }}/clusters/noble/apps" environment: KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_platform_kustomize + retries: "{{ noble_platform_kustomize_retries | int }}" + delay: "{{ noble_platform_kustomize_delay | int }}" + until: noble_platform_kustomize.rc == 0 changed_when: true - name: Install Sealed Secrets @@ -49,6 +54,21 @@ KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true +# vault-k8s patches webhook CA after install; Helm 3/4 SSA then conflicts on upgrade. Removing the MWC lets Helm re-apply cleanly; injector repopulates caBundle. 
+- name: Delete Vault agent injector MutatingWebhookConfiguration before Helm (avoids caBundle field conflict) + ansible.builtin.command: + argv: + - kubectl + - delete + - mutatingwebhookconfiguration + - vault-agent-injector-cfg + - --ignore-not-found + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + register: noble_vault_mwc_delete + when: noble_vault_delete_injector_webhook_before_helm | default(true) | bool + changed_when: "'deleted' in (noble_vault_mwc_delete.stdout | default(''))" + - name: Install Vault ansible.builtin.command: argv: @@ -66,6 +86,7 @@ - --wait environment: KUBECONFIG: "{{ noble_kubeconfig }}" + HELM_SERVER_SIDE_APPLY: "false" changed_when: true - name: Install kube-prometheus-stack diff --git a/clusters/noble/apps/kube-prometheus-stack/values.yaml b/clusters/noble/apps/kube-prometheus-stack/values.yaml index de02707..9dc9077 100644 --- a/clusters/noble/apps/kube-prometheus-stack/values.yaml +++ b/clusters/noble/apps/kube-prometheus-stack/values.yaml @@ -35,6 +35,20 @@ alertmanager: resources: requests: storage: 5Gi + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - alertmanager.apps.noble.lab.pcenicni.dev + paths: + - / + pathType: Prefix + tls: + - secretName: alertmanager-apps-noble-tls + hosts: + - alertmanager.apps.noble.lab.pcenicni.dev prometheus: prometheusSpec: @@ -48,6 +62,20 @@ prometheus: resources: requests: storage: 30Gi + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - prometheus.apps.noble.lab.pcenicni.dev + paths: + - / + pathType: Prefix + tls: + - secretName: prometheus-apps-noble-tls + hosts: + - prometheus.apps.noble.lab.pcenicni.dev grafana: persistence: @@ -78,5 +106,7 @@ grafana: server: domain: grafana.apps.noble.lab.pcenicni.dev root_url: https://grafana.apps.noble.lab.pcenicni.dev/ + # Traefik sets X-Forwarded-*; required for correct redirects 
and cookies behind the ingress. + use_proxy_headers: true # Loki datasource: apply `clusters/noble/apps/grafana-loki-datasource/loki-datasource.yaml` (sidecar ConfigMap) instead of additionalDataSources here. diff --git a/clusters/noble/apps/longhorn/values.yaml b/clusters/noble/apps/longhorn/values.yaml index 69a34a0..2af82f8 100644 --- a/clusters/noble/apps/longhorn/values.yaml +++ b/clusters/noble/apps/longhorn/values.yaml @@ -16,6 +16,19 @@ defaultSettings: # Default 30% reserved often makes small data disks look "full" to the scheduler. storageReservedPercentageForDefaultDisk: "10" +# Longhorn UI — same *.apps.noble.lab.pcenicni.dev pattern as Grafana / Headlamp (Traefik LB → cert-manager TLS). +ingress: + enabled: true + ingressClassName: traefik + host: longhorn.apps.noble.lab.pcenicni.dev + path: / + pathType: Prefix + tls: true + tlsSecret: longhorn-apps-noble-tls + secureBackends: false + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + # Pre-upgrade Job: keep enabled for normal Helm upgrades (disable only if GitOps sync fights the Job). 
preUpgradeChecker: jobEnabled: true diff --git a/clusters/noble/apps/vault/cilium-network-policy.yaml b/clusters/noble/apps/vault/cilium-network-policy.yaml index 9a910eb..c381899 100644 --- a/clusters/noble/apps/vault/cilium-network-policy.yaml +++ b/clusters/noble/apps/vault/cilium-network-policy.yaml @@ -24,6 +24,13 @@ spec: - ports: - port: "8200" protocol: TCP + - fromEndpoints: + - matchLabels: + "k8s:io.kubernetes.pod.namespace": traefik + toPorts: + - ports: + - port: "8200" + protocol: TCP - fromEndpoints: - matchLabels: "k8s:io.kubernetes.pod.namespace": vault diff --git a/clusters/noble/apps/vault/values.yaml b/clusters/noble/apps/vault/values.yaml index 4c179e5..9bf9945 100644 --- a/clusters/noble/apps/vault/values.yaml +++ b/clusters/noble/apps/vault/values.yaml @@ -44,5 +44,19 @@ server: path: "/v1/sys/health?uninitcode=204&sealedcode=204&standbyok=true" port: 8200 + # LAN: TLS terminates at Traefik + cert-manager; listener stays HTTP (global.tlsDisable). + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - host: vault.apps.noble.lab.pcenicni.dev + paths: [] + tls: + - secretName: vault-apps-noble-tls + hosts: + - vault.apps.noble.lab.pcenicni.dev + ui: enabled: true diff --git a/clusters/noble/bootstrap/argocd/README.md b/clusters/noble/bootstrap/argocd/README.md index edae846..8b7dec4 100644 --- a/clusters/noble/bootstrap/argocd/README.md +++ b/clusters/noble/bootstrap/argocd/README.md @@ -35,6 +35,17 @@ echo Change the password in the UI or via `argocd account update-password`. +### TLS: changing ClusterIssuer (e.g. staging → prod) + +If **`helm upgrade --wait`** fails with *Secret was previously issued by `letsencrypt-staging`* (or another issuer), cert-manager will not replace the TLS Secret in place. 
Remove the old cert material once, then upgrade again: + +```bash +kubectl -n argocd delete certificate argocd-server --ignore-not-found +kubectl -n argocd delete secret argocd-server-tls --ignore-not-found +helm upgrade --install argocd argo/argo-cd -n argocd --create-namespace \ + --version 9.4.17 -f clusters/noble/bootstrap/argocd/values.yaml --wait +``` + ## 3. Register this repo (if private) Use **Settings → Repositories** in the UI, or `argocd repo add` / a `Secret` of type `repository`. diff --git a/clusters/noble/bootstrap/argocd/values.yaml b/clusters/noble/bootstrap/argocd/values.yaml index b606dab..1055da1 100644 --- a/clusters/noble/bootstrap/argocd/values.yaml +++ b/clusters/noble/bootstrap/argocd/values.yaml @@ -32,17 +32,20 @@ server: certificate: enabled: true domain: argo.apps.noble.lab.pcenicni.dev + # If you change issuer.name, delete Certificate/Secret once so cert-manager can re-issue (see README.md). issuer: group: cert-manager.io kind: ClusterIssuer - name: letsencrypt-staging + name: letsencrypt-prod ingress: enabled: true ingressClassName: traefik hostname: argo.apps.noble.lab.pcenicni.dev tls: true - annotations: {} + # Traefik terminates TLS; Argo serves HTTP/2 cleartext (insecure). Without h2c, UI/API can 404 or fail gRPC. NOTE(review): Traefik documents service.serversscheme as a Service annotation - confirm it takes effect when set on the Ingress. + annotations: + traefik.ingress.kubernetes.io/service.serversscheme: h2c service: type: ClusterIP