Files
home-server/ansible/roles/noble_platform/tasks/main.yml

221 lines
6.6 KiB
YAML

---
# Mirrors former **noble-platform** Argo Application: Helm releases + plain manifests under clusters/noble/bootstrap.
- name: Apply clusters/noble/bootstrap kustomize (namespaces, Grafana Loki datasource)
  ansible.builtin.command:
    argv:
      - kubectl
      - apply
      - "--request-timeout={{ noble_platform_kubectl_request_timeout }}"
      - -k
      - "{{ noble_repo_root }}/clusters/noble/bootstrap"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  register: noble_platform_kustomize
  # Re-run the apply until kubectl exits 0; attempt count and pause between
  # attempts are taken from the two role variables below.
  retries: "{{ noble_platform_kustomize_retries | int }}"
  delay: "{{ noble_platform_kustomize_delay | int }}"
  until: noble_platform_kustomize.rc == 0
  # kubectl apply prints one "<kind>/<name> created|configured|unchanged" line
  # per object. Report "changed" only when something was created or configured
  # instead of unconditionally, so repeat runs show an accurate status.
  changed_when: >-
    noble_platform_kustomize.stdout is search(' (created|configured)')
- name: Stat SOPS age private key (age-key.txt)
  # The registered result is read by the SOPS secrets task, which only runs
  # when `.stat.exists` is true.
  register: noble_sops_age_key_stat
  ansible.builtin.stat:
    path: "{{ noble_sops_age_key_file }}"
- name: Apply SOPS-encrypted cluster secrets (clusters/noble/secrets/*.yaml)
  # Decrypts every *.yaml in the secrets directory with sops and pipes the
  # plaintext directly into `kubectl apply`.
  ansible.builtin.shell:
    cmd: |
      set -euo pipefail
      # nullglob: an empty secrets directory yields zero loop iterations
      # rather than a literal "*.yaml" argument.
      shopt -s nullglob
      for f in "$NOBLE_SECRETS_DIR"/*.yaml; do
        sops -d "$f" | kubectl apply -f -
      done
    executable: /bin/bash
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
    SOPS_AGE_KEY_FILE: "{{ noble_sops_age_key_file }}"
    # The directory is handed to the script via the environment instead of being
    # templated into the script text, so quotes, `$` or backticks in
    # noble_repo_root cannot alter the shell command.
    NOBLE_SECRETS_DIR: "{{ noble_repo_root }}/clusters/noble/secrets"
  when:
    # Skipped when disabled by variable or when the age key file is absent.
    - noble_apply_sops_secrets | default(true) | bool
    - noble_sops_age_key_stat.stat.exists
  changed_when: true
# Helm --wait alone cannot extend the operator Deployment's progressDeadlineSeconds (default 10m).
# This first pass only submits the release (--wait=false); the tasks that follow
# patch the operator Deployment before a second, waiting Helm pass.
- name: Install kube-prometheus-stack (apply without Helm wait)
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      - kube-prometheus
      - prometheus-community/kube-prometheus-stack
      - -n
      - monitoring
      - --version
      - '85.0.3'
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml"
      - --force-conflicts
      - --wait=false
  changed_when: true
- name: Wait for prometheus-operator Deployment object
  # Polls `kubectl get` until it exits 0; read-only, so never reported as changed.
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - deployment/kube-prometheus-kube-prome-operator
      - -n
      - monitoring
  register: noble_kube_prom_operator_deploy
  retries: "{{ noble_platform_kube_prometheus_operator_wait_retries | int }}"
  delay: "{{ noble_platform_kube_prometheus_operator_wait_delay | int }}"
  until: noble_kube_prom_operator_deploy.rc == 0
  changed_when: false
- name: Extend prometheus-operator Deployment progress deadline
  # Merge-patches spec.progressDeadlineSeconds on the operator Deployment with
  # the integer value of the role variable.
  ansible.builtin.command:
    argv:
      - kubectl
      - patch
      - deployment/kube-prometheus-kube-prome-operator
      - -n
      - monitoring
      - --type=merge
      - -p
      # The patch body is built as a dict and serialised with to_json so the
      # value is emitted as a JSON number.
      - "{{ {'spec': {'progressDeadlineSeconds': (noble_platform_kube_prometheus_operator_progress_deadline_seconds | int)}} | to_json }}"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  register: noble_kube_prom_operator_patch
  # kubectl prints "... patched (no change)" when the object already matches the
  # patch; only report "changed" when the deadline was actually modified.
  changed_when: "'(no change)' not in noble_kube_prom_operator_patch.stdout"
- name: Restart prometheus-operator if Deployment already hit progress deadline
  # Reads the reason and message of the Deployment's "Progressing" condition and
  # triggers a rollout restart only when they indicate an exceeded progress
  # deadline. The script prints "restarted" in that case, which drives
  # changed_when below.
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail
      dep=kube-prometheus-kube-prome-operator
      msg=$(kubectl get deployment "$dep" -n monitoring -o jsonpath='{.status.conditions[?(@.type=="Progressing")].message}' 2>/dev/null || true)
      reason=$(kubectl get deployment "$dep" -n monitoring -o jsonpath='{.status.conditions[?(@.type=="Progressing")].reason}' 2>/dev/null || true)
      combined="${reason}${msg}"
      if printf '%s' "$combined" | grep -qiE 'ProgressDeadlineExceeded|progress[[:space:]]*deadline[[:space:]]*exceeded'; then
        kubectl rollout restart deployment/"$dep" -n monitoring
        echo restarted
      fi
  register: noble_kube_prom_operator_restart
  changed_when: >-
    'restarted' in noble_kube_prom_operator_restart.stdout
# Helm --wait prints nothing until done or timeout; override noble_platform_kube_prometheus_helm_wait_timeout if needed.
# Second pass over the same release and chart version, this time blocking on --wait.
- name: Install kube-prometheus-stack (Helm wait for full release; often 30-60m silent - watch kubectl -n monitoring get pods,ds,pvc)
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      - kube-prometheus
      - prometheus-community/kube-prometheus-stack
      - -n
      - monitoring
      - --version
      - '85.0.3'
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/kube-prometheus-stack/values.yaml"
      - --force-conflicts
      - --wait
      - --timeout
      - "{{ noble_platform_kube_prometheus_helm_wait_timeout }}"
  changed_when: true
- name: Wait for Longhorn CSI plugin before Loki (PVC attach)
  # Blocks on the longhorn-csi-plugin DaemonSet rollout; can be switched off
  # with noble_platform_wait_longhorn_csi_before_loki (defaults to true).
  when: noble_platform_wait_longhorn_csi_before_loki | default(true) | bool
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - kubectl
      - rollout
      - status
      - daemonset/longhorn-csi-plugin
      - -n
      - longhorn-system
      - "--timeout={{ noble_platform_longhorn_csi_rollout_timeout }}"
  changed_when: false
- name: Install Loki
  # Installs or upgrades the grafana/loki chart into the loki namespace and
  # waits up to noble_platform_loki_helm_wait_timeout for it to settle.
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      - loki
      - grafana/loki
      - -n
      - loki
      - --version
      - '7.0.0'
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/loki/values.yaml"
      - --force-conflicts
      - --wait
      - --timeout
      - "{{ noble_platform_loki_helm_wait_timeout }}"
  changed_when: true
- name: Install Fluent Bit
  # Installs or upgrades the fluent/fluent-bit chart into the logging namespace
  # and waits for the release.
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      - fluent-bit
      - fluent/fluent-bit
      - -n
      - logging
      - --version
      - "0.57.5"
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/fluent-bit/values.yaml"
      - --force-conflicts
      - --wait
      # Make the wait timeout overridable like the Loki and kube-prometheus
      # releases. The fallback equals Helm's own default (5m0s), so behaviour is
      # unchanged unless the variable is set.
      - --timeout
      - "{{ noble_platform_fluent_bit_helm_wait_timeout | default('5m0s') }}"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  changed_when: true
- name: Install Headlamp
  # Installs or upgrades the headlamp/headlamp chart into the headlamp namespace
  # and waits for the release. Flag order follows the other Helm tasks in this
  # file (-n, --version, -f).
  ansible.builtin.command:
    argv:
      - helm
      - upgrade
      - --install
      - headlamp
      - headlamp/headlamp
      - -n
      - headlamp
      - --version
      - "0.42.0"
      - -f
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/headlamp/values.yaml"
      - --force-conflicts
      - --wait
      # Make the wait timeout overridable like the Loki and kube-prometheus
      # releases. The fallback equals Helm's own default (5m0s), so behaviour is
      # unchanged unless the variable is set.
      - --timeout
      - "{{ noble_platform_headlamp_helm_wait_timeout | default('5m0s') }}"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  changed_when: true
- name: Apply Headlamp static manifests (metrics RBAC + OIDC group binding when used)
  # Applies the kustomization in clusters/noble/bootstrap/headlamp.
  ansible.builtin.command:
    argv:
      - kubectl
      - apply
      - -k
      - "{{ noble_repo_root }}/clusters/noble/bootstrap/headlamp"
  environment:
    KUBECONFIG: "{{ noble_kubeconfig }}"
  register: noble_headlamp_kustomize
  # kubectl apply prints one "<kind>/<name> created|configured|unchanged" line
  # per object. Report "changed" only when something was created or configured
  # instead of unconditionally, so repeat runs show an accurate status.
  changed_when: >-
    noble_headlamp_kustomize.stdout is search(' (created|configured)')