Add retries to drain during upgrade. Allow leaving nodes cordoned after drain failure. Allow continuing upgrade if drain fails. (#7206)

This commit is contained in:
David Louks 2021-01-26 13:10:31 -06:00 committed by GitHub
parent 9007d6621a
commit d378d789cf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 2 deletions

View file

@@ -3,6 +3,11 @@ drain_grace_period: 300
# Value passed to the --timeout option of the node drain command.
drain_timeout: 360s
# When non-empty, added to the drain command as --pod-selector '<value>'.
drain_pod_selector: ""
# The drain task only runs when this is true.
drain_nodes: true
# Number of retries for the drain task; it is retried until the command
# returns rc 0.
drain_retries: 3
# Delay, in seconds, between drain retries.
drain_retry_delay_seconds: 10
# When true, the rescue step uncordons the node after a failed drain;
# when false, the node is left cordoned.
upgrade_node_uncordon_after_drain_failure: true
# When true, the rescue step fails the host with a "Failed to drain node"
# message; when false, the upgrade continues despite the drain failure.
upgrade_node_fail_if_drain_fails: true
# NOTE(review): the tasks consuming the next two variables are not visible
# here; presumably they gate a manual confirmation and a pause before a node
# is upgraded — confirm against the upgrade tasks.
upgrade_node_confirm: false
upgrade_node_pause_seconds: 0

View file

@@ -77,14 +77,19 @@
--timeout {{ drain_timeout }}
--delete-local-data {{ kube_override_hostname|default(inventory_hostname) }}
{% if drain_pod_selector %}--pod-selector '{{ drain_pod_selector }}'{% endif %}
when:
- drain_nodes
when: drain_nodes
register: result
until: result.rc == 0
retries: "{{ drain_retries }}"
delay: "{{ drain_retry_delay_seconds }}"
rescue:
# Rescue step: make the node schedulable again after a failed drain.
# The node is addressed with the same name expression the drain command uses
# (kube_override_hostname, falling back to inventory_hostname), so the
# uncordon still finds the node when a hostname override is configured.
# Skipped when upgrade_node_uncordon_after_drain_failure is false, which
# leaves the node cordoned.
- name: Set node back to schedulable
  command: >-
    {{ bin_dir }}/kubectl --kubeconfig /etc/kubernetes/admin.conf
    uncordon {{ kube_override_hostname|default(inventory_hostname) }}
  when: upgrade_node_uncordon_after_drain_failure
# Rescue step: abort for this host once the drain has failed.
# Skipped when upgrade_node_fail_if_drain_fails is false, so the upgrade
# carries on even though the drain did not succeed.
- name: Fail after rescue
  when: upgrade_node_fail_if_drain_fails
  fail:
    msg: "Failed to drain node {{ inventory_hostname }}"
delegate_to: "{{ groups['kube-master'][0] }}"
when:
- needs_cordoning