From 4bc8da02255d72594062f9af23645044e0010d2f Mon Sep 17 00:00:00 2001 From: Nikholas Pcenicni <82239765+nikpcenicni@users.noreply.github.com> Date: Thu, 14 May 2026 15:52:42 -0400 Subject: [PATCH] Enhance Authentik role by adding Helm wait timeout for oauth2-proxy and improving task conditions for admin access and OAuth2 provider upserts. Update README with new variable descriptions and adjust Longhorn deployment tasks to ensure proper rollout before Loki installation, enhancing overall deployment reliability. --- ansible/README.md | 2 +- .../roles/noble_authentik/defaults/main.yml | 2 ++ .../files/worker_add_bootstrap_user_groups.py | 3 +-- .../worker_ensure_authentik_admin_access.py | 3 +-- .../files/worker_upsert_oauth_oidc.py | 3 +-- ansible/roles/noble_authentik/tasks/main.yml | 27 ++++++++++++++++--- .../roles/noble_platform/defaults/main.yml | 5 ++++ ansible/roles/noble_platform/tasks/main.yml | 17 ++++++++++++ clusters/noble/bootstrap/loki/values.yaml | 6 +++++ 9 files changed, 57 insertions(+), 11 deletions(-) diff --git a/ansible/README.md b/ansible/README.md index 49bf470..f269465 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -82,7 +82,7 @@ ansible-playbook playbooks/noble.yml --tags authentik -e noble_authentik_install ### Variables — `inventory/group_vars/` and role defaults - **`inventory/group_vars/all.yml`:** **`noble_newt_install`**, **`noble_velero_install`**, **`noble_authentik_install`**, **`noble_cert_manager_require_cloudflare_secret`**, **`noble_argocd_apply_root_application`**, **`noble_argocd_apply_bootstrap_root_application`**, **`noble_k8s_api_server_override`**, **`noble_k8s_api_server_auto_fallback`**, **`noble_k8s_api_server_fallback`**, **`noble_skip_k8s_health_check`** -- **`roles/noble_platform/defaults/main.yml`:** **`noble_apply_sops_secrets`**, **`noble_sops_age_key_file`** (SOPS secrets under **`clusters/noble/secrets/`**) +- **`roles/noble_platform/defaults/main.yml`:** **`noble_apply_sops_secrets`**, 
**`noble_sops_age_key_file`**, **`noble_platform_loki_helm_wait_timeout`**, **`noble_platform_wait_longhorn_csi_before_loki`**, **`noble_platform_longhorn_csi_rollout_timeout`** ## Roles diff --git a/ansible/roles/noble_authentik/defaults/main.yml b/ansible/roles/noble_authentik/defaults/main.yml index 35b040a..baa32df 100644 --- a/ansible/roles/noble_authentik/defaults/main.yml +++ b/ansible/roles/noble_authentik/defaults/main.yml @@ -14,6 +14,8 @@ noble_authentik_namespace: authentik # Helm release name (deployments: **{release}-server**, **{release}-worker**). noble_authentik_release_name: authentik noble_authentik_oauth2_proxy_chart_version: "10.4.3" +# Helm **--wait** timeout for **oauth2-proxy** (default 10m; raise if the first pull / API checks take longer). +noble_authentik_oauth2_proxy_helm_wait_timeout: 10m noble_authentik_host: auth.apps.noble.lab.pcenicni.dev noble_authentik_public_url: "https://{{ noble_authentik_host }}" diff --git a/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py b/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py index ab40040..ff991a4 100644 --- a/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py +++ b/ansible/roles/noble_authentik/files/worker_add_bootstrap_user_groups.py @@ -51,5 +51,4 @@ def main() -> None: print("worker: bootstrap user group membership updated", flush=True) -if __name__ == "__main__": - main() +main() diff --git a/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py b/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py index 4c6b229..23da009 100644 --- a/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py +++ b/ansible/roles/noble_authentik/files/worker_ensure_authentik_admin_access.py @@ -69,5 +69,4 @@ def main() -> None: ) -if __name__ == "__main__": - main() +main() diff --git a/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py
b/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py index 0b0c0c8..22f63ec 100644 --- a/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py +++ b/ansible/roles/noble_authentik/files/worker_upsert_oauth_oidc.py @@ -106,5 +106,4 @@ def main() -> None: print("worker: OAuth2 providers + applications upserted", flush=True) -if __name__ == "__main__": - main() +main() diff --git a/ansible/roles/noble_authentik/tasks/main.yml b/ansible/roles/noble_authentik/tasks/main.yml index db98aef..2a272e2 100644 --- a/ansible/roles/noble_authentik/tasks/main.yml +++ b/ansible/roles/noble_authentik/tasks/main.yml @@ -276,7 +276,15 @@ environment: KUBECONFIG: "{{ noble_kubeconfig }}" register: noble_authentik_worker_admin_access - changed_when: true + changed_when: >- + "worker:" in (noble_authentik_worker_admin_access.stdout | default("")) + and "authentik Admins" in (noble_authentik_worker_admin_access.stdout | default("")) + failed_when: >- + (noble_authentik_worker_admin_access.rc | default(-1)) != 0 + or ( + "worker:" not in (noble_authentik_worker_admin_access.stdout | default("")) + or "authentik Admins" not in (noble_authentik_worker_admin_access.stdout | default("")) + ) when: - noble_authentik_configure_idp | default(true) | bool - noble_authentik_ensure_admin_ui_access | default(true) | bool @@ -321,7 +329,15 @@ environment: KUBECONFIG: "{{ noble_kubeconfig }}" register: noble_authentik_worker_oidc_upsert - changed_when: true + changed_when: >- + "worker: OAuth2 providers + applications upserted" + in (noble_authentik_worker_oidc_upsert.stdout | default("")) + failed_when: >- + (noble_authentik_worker_oidc_upsert.rc | default(-1)) != 0 + or ( + "worker: OAuth2 providers + applications upserted" + not in (noble_authentik_worker_oidc_upsert.stdout | default("")) + ) when: - noble_authentik_configure_idp | default(true) | bool - (noble_authentik_oidc_provision_via | default('worker') | lower) == 'worker' @@ -366,7 +382,10 @@ environment: KUBECONFIG: 
"{{ noble_kubeconfig }}" register: noble_authentik_worker_user_groups - changed_when: true + changed_when: >- + "worker: bootstrap user group membership updated" + in (noble_authentik_worker_user_groups.stdout | default("")) + failed_when: (noble_authentik_worker_user_groups.rc | default(-1)) != 0 when: - noble_authentik_configure_idp | default(true) | bool - (noble_authentik_oidc_provision_via | default('worker') | lower) == 'worker' @@ -467,7 +486,7 @@ - --force-conflicts - --wait - --timeout - - 10m + - "{{ noble_authentik_oauth2_proxy_helm_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true diff --git a/ansible/roles/noble_platform/defaults/main.yml b/ansible/roles/noble_platform/defaults/main.yml index 15a3947..bbcfd33 100644 --- a/ansible/roles/noble_platform/defaults/main.yml +++ b/ansible/roles/noble_platform/defaults/main.yml @@ -11,6 +11,11 @@ noble_platform_kube_prometheus_operator_wait_retries: 60 noble_platform_kube_prometheus_operator_wait_delay: 5 # Longhorn PVCs + full stack often need 45-60m; node-exporter DaemonSet can be last at 3/4 until one node catches up. noble_platform_kube_prometheus_helm_wait_timeout: 60m +# Loki SingleBinary + Longhorn PVC: Helm **--wait** can exceed **5m** defaults; raise if Longhorn attach is slow. +noble_platform_loki_helm_wait_timeout: 30m +# Before Loki (first Longhorn PVC workload), ensure CSI plugin DaemonSet is fully rolled out (avoids **FailedMount** / backend timeouts). +noble_platform_wait_longhorn_csi_before_loki: true +noble_platform_longhorn_csi_rollout_timeout: 15m # Decrypt **clusters/noble/secrets/*.yaml** with SOPS and kubectl apply (requires **sops**, **age**, and **age-key.txt**). 
noble_apply_sops_secrets: true diff --git a/ansible/roles/noble_platform/tasks/main.yml b/ansible/roles/noble_platform/tasks/main.yml index ce5d273..0027dd4 100644 --- a/ansible/roles/noble_platform/tasks/main.yml +++ b/ansible/roles/noble_platform/tasks/main.yml @@ -131,6 +131,21 @@ KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true +- name: Wait for Longhorn CSI plugin before Loki (PVC attach) + ansible.builtin.command: + argv: + - kubectl + - rollout + - status + - daemonset/longhorn-csi-plugin + - -n + - longhorn-system + - --timeout={{ noble_platform_longhorn_csi_rollout_timeout }} + environment: + KUBECONFIG: "{{ noble_kubeconfig }}" + when: noble_platform_wait_longhorn_csi_before_loki | default(true) | bool + changed_when: false + - name: Install Loki ansible.builtin.command: argv: @@ -147,6 +162,8 @@ - "{{ noble_repo_root }}/clusters/noble/bootstrap/loki/values.yaml" - --force-conflicts - --wait + - --timeout + - "{{ noble_platform_loki_helm_wait_timeout }}" environment: KUBECONFIG: "{{ noble_kubeconfig }}" changed_when: true diff --git a/clusters/noble/bootstrap/loki/values.yaml b/clusters/noble/bootstrap/loki/values.yaml index c1fe007..c6355d6 100644 --- a/clusters/noble/bootstrap/loki/values.yaml +++ b/clusters/noble/bootstrap/loki/values.yaml @@ -9,6 +9,12 @@ # --version 6.55.0 -f clusters/noble/bootstrap/loki/values.yaml --wait --timeout 30m # # Query/push URL for Grafana + Fluent Bit: http://loki-gateway.loki.svc.cluster.local:80 +# +# Troubleshooting: if **helm --wait** times out with **StatefulSet/loki/loki not ready**, run +# **kubectl -n loki describe pod loki-0**. **FailedMount** + **longhorn-backend** / **hasn't been attached yet** +# is a **Longhorn CSI** issue (not Loki config): confirm **kubectl -n longhorn-system rollout status +# daemonset/longhorn-csi-plugin** succeeds, check Longhorn UI → Volume, and consider **kubectl delete pod -n loki loki-0** +# to recreate the pod after storage is healthy. deploymentMode: SingleBinary