diff --git a/ansible/README.md b/ansible/README.md index 49bf470..f269465 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -82,7 +82,7 @@ ansible-playbook playbooks/noble.yml --tags authentik -e noble_authentik_install ### Variables — `inventory/group_vars/` and role defaults - **`inventory/group_vars/all.yml`:** **`noble_newt_install`**, **`noble_velero_install`**, **`noble_authentik_install`**, **`noble_cert_manager_require_cloudflare_secret`**, **`noble_argocd_apply_root_application`**, **`noble_argocd_apply_bootstrap_root_application`**, **`noble_k8s_api_server_override`**, **`noble_k8s_api_server_auto_fallback`**, **`noble_k8s_api_server_fallback`**, **`noble_skip_k8s_health_check`** -- **`roles/noble_platform/defaults/main.yml`:** **`noble_apply_sops_secrets`**, **`noble_sops_age_key_file`** (SOPS secrets under **`clusters/noble/secrets/`**) +- **`roles/noble_platform/defaults/main.yml`:** **`noble_apply_sops_secrets`**, **`noble_sops_age_key_file`**, **`noble_platform_loki_helm_wait_timeout`**, **`noble_platform_wait_longhorn_csi_before_loki`**, **`noble_platform_longhorn_csi_rollout_timeout`** ## Roles diff --git a/ansible/roles/noble_authentik/defaults/main.yml b/ansible/roles/noble_authentik/defaults/main.yml index 35b040a..baa32df 100644 --- a/ansible/roles/noble_authentik/defaults/main.yml +++ b/ansible/roles/noble_authentik/defaults/main.yml @@ -14,6 +14,8 @@ noble_authentik_namespace: authentik # Helm release name (deployments: **{release}-server**, **{release}-worker**). noble_authentik_release_name: authentik noble_authentik_oauth2_proxy_chart_version: "10.4.3" +# Helm **--wait** timeout for **oauth2-proxy**; raise above the **10m** default if the first image pull / API checks take longer. 
+noble_authentik_oauth2_proxy_helm_wait_timeout: 10m noble_authentik_host: auth.apps.noble.lab.pcenicni.dev noble_authentik_public_url: "https://{{ noble_authentik_host }}" diff --git a/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py b/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py index ab40040..ff991a4 100644 --- a/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py +++ b/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py @@ -51,5 +51,4 @@ def main() -> None: print("worker: bootstrap user group membership updated", flush=True) -if __name__ == "__main__": - main() +main() diff --git a/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py b/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py index 4c6b229..23da009 100644 --- a/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py +++ b/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py @@ -69,5 +69,4 @@ def main() -> None: ) -if __name__ == "__main__": - main() +main() diff --git a/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py b/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py index 0b0c0c8..22f63ec 100644 --- a/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py +++ b/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py @@ -106,5 +106,4 @@ def main() -> None: print("worker: OAuth2 providers + applications upserted", flush=True) -if __name__ == "__main__": - main() +main() diff --git a/ansible/roles/noble_authentik/tasks/main.yml b/ansible/roles/noble_authentik/tasks/main.yml index db98aef..2a272e2 100644 --- a/ansible/roles/noble_authentik/tasks/main.yml +++ b/ansible/roles/noble_authentik/tasks/main.yml @@ -276,7 +276,15 @@ environment: KUBECONFIG: "{{ noble_kubeconfig }}" register: noble_authentik_worker_admin_access - changed_when: true + changed_when: >- + "worker:" in 
(noble_authentik_worker_admin_access.stdout | default("")) + and "authentik Admins" in (noble_authentik_worker_admin_access.stdout | default("")) + failed_when: >- + (noble_authentik_worker_admin_access.rc | default(-1)) != 0 + or ( + "worker:" not in (noble_authentik_worker_admin_access.stdout | default("")) + or "authentik Admins" not in (noble_authentik_worker_admin_access.stdout | default("")) + ) when: - noble_authentik_configure_idp | default(true) | bool - noble_authentik_ensure_admin_ui_access | default(true) | bool @@ -321,7 +329,15 @@ environment: KUBECONFIG: "{{ noble_kubeconfig }}" register: noble_authentik_worker_oidc_upsert - changed_when: true + changed_when: >- + "worker: OAuth2 providers + applications upserted" + in (noble_authentik_worker_oidc_upsert.stdout | default("")) + failed_when: >- + (noble_authentik_worker_oidc_upsert.rc | default(-1)) != 0 + or ( + "worker: OAuth2 providers + applications upserted" + not in (noble_authentik_worker_oidc_upsert.stdout | default("")) + ) when: - noble_authentik_configure_idp | default(true) | bool - (noble_authentik_oidc_provision_via | default('worker') | lower) == 'worker' @@ -366,7 +382,10 @@ environment: KUBECONFIG: "{{ noble_kubeconfig }}" register: noble_authentik_worker_user_groups - changed_when: true + changed_when: >- + "worker: bootstrap user group membership updated" + in (noble_authentik_worker_user_groups.stdout | default("")) + failed_when: (noble_authentik_worker_user_groups.rc | default(-1)) != 0 when: - noble_authentik_configure_idp | default(true) | bool - (noble_authentik_oidc_provision_via | default('worker') | lower) == 'worker' @@ -467,7 +486,7 @@ - --force-conflicts - --wait - --timeout - - 10m + - "{{ noble_authentik_oauth2_proxy_helm_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true diff --git a/ansible/roles/noble_platform/defaults/main.yml b/ansible/roles/noble_platform/defaults/main.yml index 15a3947..bbcfd33 100644 --- 
a/ansible/roles/noble_platform/defaults/main.yml +++ b/ansible/roles/noble_platform/defaults/main.yml @@ -11,6 +11,11 @@ noble_platform_kube_prometheus_operator_wait_retries: 60 noble_platform_kube_prometheus_operator_wait_delay: 5 # Longhorn PVCs + full stack often need 45-60m; node-exporter DaemonSet can be last at 3/4 until one node catches up. noble_platform_kube_prometheus_helm_wait_timeout: 60m +# Loki SingleBinary + Longhorn PVC: Helm **--wait** can exceed Helm's **5m** default timeout; raise this if Longhorn attach is slow. +noble_platform_loki_helm_wait_timeout: 30m +# Before Loki (first Longhorn PVC workload), ensure CSI plugin DaemonSet is fully rolled out (avoids **FailedMount** / backend timeouts). +noble_platform_wait_longhorn_csi_before_loki: true +noble_platform_longhorn_csi_rollout_timeout: 15m # Decrypt **clusters/noble/secrets/*.yaml** with SOPS and kubectl apply (requires **sops**, **age**, and **age-key.txt**). noble_apply_sops_secrets: true diff --git a/ansible/roles/noble_platform/tasks/main.yml b/ansible/roles/noble_platform/tasks/main.yml index ce5d273..0027dd4 100644 --- a/ansible/roles/noble_platform/tasks/main.yml +++ b/ansible/roles/noble_platform/tasks/main.yml @@ -131,6 +131,21 @@ KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true +- name: Wait for Longhorn CSI plugin before Loki (PVC attach) + ansible.builtin.command: + argv: + - kubectl + - rollout + - status + - daemonset/longhorn-csi-plugin + - -n + - longhorn-system + - --timeout={{ noble_platform_longhorn_csi_rollout_timeout }} + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + when: noble_platform_wait_longhorn_csi_before_loki | default(true) | bool + changed_when: false + - name: Install Loki ansible.builtin.command: argv: @@ -147,6 +162,8 @@ - "{{ noble_repo_root }}/clusters/noble/bootstrap/loki/values.yaml" - --force-conflicts - --wait + - --timeout + - "{{ noble_platform_loki_helm_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true diff --git 
a/clusters/noble/bootstrap/loki/values.yaml b/clusters/noble/bootstrap/loki/values.yaml index c1fe007..c6355d6 100644 --- a/clusters/noble/bootstrap/loki/values.yaml +++ b/clusters/noble/bootstrap/loki/values.yaml @@ -9,6 +9,12 @@ # --version 6.55.0 -f clusters/noble/bootstrap/loki/values.yaml --wait --timeout 30m # # Query/push URL for Grafana + Fluent Bit: http://loki-gateway.loki.svc.cluster.local:80 +# +# Troubleshooting: if **helm --wait** times out with **StatefulSet/loki/loki not ready**, run +# **kubectl -n loki describe pod loki-0**. **FailedMount** + **longhorn-backend** / **hasn't been attached yet** +# is a **Longhorn CSI** issue (not Loki config): confirm **kubectl -n longhorn-system rollout status +# daemonset/longhorn-csi-plugin** succeeds, check Longhorn UI → Volume, and consider **kubectl delete pod -n loki loki-0** +# to recreate the pod after storage is healthy. deploymentMode: SingleBinary