From 26183c2523d056daf265b3c8a6833e9689e85f7b Mon Sep 17 00:00:00 2001
From: David Louks <2402775+dlouks@users.noreply.github.com>
Date: Fri, 15 Jan 2021 15:17:43 -0600
Subject: [PATCH] Remove ignore_errors from drain tasks and enable retries
 (#7151)

* Remove ignore_errors from drain tasks and enable retries

* Fix lint error by checking if stdout length is not 0, i.e. string is not empty.

(cherry picked from commit ccd3aeebbc5c4da85155b365bab66d6441dc3e81)
---
Reviewer notes (below the three-dash line, so not part of the commit message):

  * defaults/main.yml gains drain_retries (3) and
    drain_retry_delay_seconds (10).
  * The cordon task is removed. Two new tasks build `nodes_to_drain`: each
    requested name is matched with jq against the names printed by
    `kubectl get nodes -o json`, and only results with non-empty stdout are
    appended to the list.
  * The drain task now loops over `nodes_to_drain`, no longer sets
    ignore_errors, and retries until rc == 0 or allow_ungraceful_removal.
  * NOTE(review): the new shell task calls `jq` on the delegated host
    (first kube-master) -- confirm it is installed there.
  * NOTE(review): the node name is interpolated unescaped into the jq regex,
    so a `.` in a hostname matches any character -- confirm this is acceptable.
  * NOTE(review): the drain command still evaluates hostvars[item] although
    item is now a node name reported by kubectl -- confirm this resolves when
    kube_override_hostname differs from the inventory name.

 .../remove-node/pre-remove/defaults/main.yml  |  2 ++
 roles/remove-node/pre-remove/tasks/main.yml   | 35 +++++++++++++------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/roles/remove-node/pre-remove/defaults/main.yml b/roles/remove-node/pre-remove/defaults/main.yml
index 1e5b849cc..deaa8afb7 100644
--- a/roles/remove-node/pre-remove/defaults/main.yml
+++ b/roles/remove-node/pre-remove/defaults/main.yml
@@ -2,3 +2,5 @@
 allow_ungraceful_removal: false
 drain_grace_period: 300
 drain_timeout: 360s
+drain_retries: 3
+drain_retry_delay_seconds: 10
diff --git a/roles/remove-node/pre-remove/tasks/main.yml b/roles/remove-node/pre-remove/tasks/main.yml
index 563fa036e..42316e209 100644
--- a/roles/remove-node/pre-remove/tasks/main.yml
+++ b/roles/remove-node/pre-remove/tasks/main.yml
@@ -1,14 +1,26 @@
 ---
-- name: cordon-node | Mark all nodes as unschedulable before drain  # noqa 301
-  command: >-
-    {{ bin_dir }}/kubectl cordon {{ hostvars[item]['kube_override_hostname']|default(item) }}
-  with_items:
-    - "{{ node.split(',') | default(groups['kube-node']) }}"
-  register: result
-  failed_when: result.rc != 0 and not allow_ungraceful_removal
+- name: remove-node | Set `nodes_to_drain` as empty list
+  set_fact:
+    nodes_to_drain: []
+
+- name: remove-node | Identify nodes to drain, ignore non-cluster nodes
+  shell: |
+    set -o pipefail
+    {{ bin_dir }}/kubectl get nodes -o json \
+      | jq .items[].metadata.name \
+      | jq "select(. | test(\"^{{ hostvars[item]['kube_override_hostname']|default(item) }}$\"))"
+  loop: "{{ node.split(',') | default(groups['kube-node']) }}"
+  register: nodes
   delegate_to: "{{ groups['kube-master']|first }}"
+  changed_when: false
+  run_once: true
+
+- name: remove-node | Generate list of nodes to drain
+  set_fact:
+    nodes_to_drain: "{{ nodes_to_drain }} + [ '{{ item.stdout | regex_replace('\"', '') }}' ]"
+  loop: "{{ nodes.results }}"
+  when: item.stdout | length != 0
   run_once: true
-  ignore_errors: yes
 
 - name: remove-node | Drain node except daemonsets resource  # noqa 301
   command: >-
@@ -18,10 +30,11 @@
       --grace-period {{ drain_grace_period }}
       --timeout {{ drain_timeout }}
       --delete-local-data {{ hostvars[item]['kube_override_hostname']|default(item) }}
-  with_items:
-    - "{{ node.split(',') | default(groups['kube-node']) }}"
+  loop: "{{ nodes_to_drain }}"
   register: result
   failed_when: result.rc != 0 and not allow_ungraceful_removal
   delegate_to: "{{ groups['kube-master']|first }}"
   run_once: true
-  ignore_errors: yes
+  until: result.rc == 0 or allow_ungraceful_removal
+  retries: "{{ drain_retries }}"
+  delay: "{{ drain_retry_delay_seconds }}"