c12s-kubespray/roles/recover_control_plane/etcd/tasks/main.yml

---
# Probe the etcd endpoints with `etcdctl endpoint health` and keep the result
# in `etcd_endpoint_health`. Failures are tolerated because the following
# tasks inspect stderr of this command to decide how to proceed.
- name: Get etcd endpoint health
  command: "{{ bin_dir }}/etcdctl endpoint health"
  register: etcd_endpoint_health
  ignore_errors: true  # noqa ignore-errors
  # Read-only probe: never report "changed".
  changed_when: false
  # Execute even under --check; later tasks read the registered result.
  check_mode: false
  environment:
    # Quoted so YAML passes a string, not an integer, into the environment.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  when:
    - groups['broken_etcd']

# Derive `healthy` from the health probe. The cluster is considered healthy
# only when the command succeeded AND stderr does not carry the
# "Error: unhealthy cluster" marker. `search` (not `match`) is used because
# the marker may be preceded by per-endpoint lines on stderr. The `and`
# short-circuits, so stderr is not dereferenced when the command itself failed.
# Later tasks gate member removal on `not healthy`.
- name: Set healthy fact
  set_fact:
    healthy: >-
      {{ etcd_endpoint_health is succeeded and
      not (etcd_endpoint_health.stderr is search('Error: unhealthy cluster')) }}
  when:
    - groups['broken_etcd']

# On error etcdctl prints everything to stderr_lines, including the
# "is healthy" messages, so both kinds of line are counted from there.
# `has_quorum` is true while the healthy count is at least the unhealthy
# count (this includes the case where neither kind of line is present).
- name: Set has_quorum fact
  set_fact:
    has_quorum: >-
      {{ (etcd_endpoint_health.stderr_lines | select('match', '.*is healthy.*') | list | length)
      >= (etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length) }}
  when:
    - groups['broken_etcd']

# When the health probe indicates quorum is lost, run the separate task file
# recover_lost_quorum.yml.
- name: Recover etcd cluster that lost quorum
  include_tasks: recover_lost_quorum.yml
  when:
    - groups['broken_etcd']
    - not has_quorum

# Wipe the etcd data directory on every host listed in `broken_etcd`.
# Each loop iteration is delegated to the broken host itself; errors are
# ignored so one failing host does not abort the run.
- name: Remove etcd data dir
  when:
    - groups['broken_etcd']
    - has_quorum
  loop: "{{ groups['broken_etcd'] }}"
  delegate_to: "{{ item }}"
  file:
    state: absent
    path: "{{ etcd_data_dir }}"
  ignore_errors: true  # noqa ignore-errors

# Remove every file under etcd_cert_dir whose name contains the broken host's
# inventory name. A shell is needed for the glob. Failures are ignored here and
# evaluated by the follow-up task through the registered per-item results.
- name: Delete old certificates
  # noqa 302 ignore-error - rm is ok here for now
  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
  loop: "{{ groups['broken_etcd'] }}"
  ignore_errors: true
  # NOTE(review): the variable name is misspelled ("cerificates"); it is kept
  # because the next task reads it under this exact name.
  register: delete_old_cerificates
  when:
    - groups['broken_etcd']

# Abort when removing certificates failed for any reason other than the files
# already being absent ("No such file or directory" on stderr is tolerated).
- name: Fail if unable to delete old certificates
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  loop: "{{ delete_old_cerificates.results }}"
  changed_when: false
  when:
    - groups['broken_etcd']
    # The list entries are ANDed in order, so stderr is only inspected for
    # results that actually returned a non-zero exit code.
    - item.rc != 0
    - "'No such file or directory' not in item.stderr"

# List the current cluster members with `etcdctl member list`; the output
# lines are consumed by the member-removal task. Only runs when the cluster is
# unhealthy but still has quorum.
- name: Get etcd cluster members
  command: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  # Read-only query: never report "changed".
  changed_when: false
  # Execute even under --check; the removal task reads the registered result.
  check_mode: false
  environment:
    # Quoted so YAML passes a string, not an integer, into the environment.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum

# Remove each broken host's membership from the cluster.
# The loop pairs every host in `broken_etcd` (item[0]) with every line of the
# `member list` output (item[1]). Each line is stripped of spaces and split on
# commas: field 0 is passed to `member remove`, and field 2 is compared with
# the host's `etcd_member_name`, so only the matching pair runs the command.
# NOTE(review): presumably the line format is "ID, status, name, ..." —
# confirm against the etcdctl version in use.
- name: Remove broken cluster members
  command: "{{ bin_dir }}/etcdctl member remove {{ item[1].replace(' ','').split(',')[0] }}"
  environment:
    # Quoted so YAML passes a string, not an integer, into the environment.
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
  with_nested:
    - "{{ groups['broken_etcd'] }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum
    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]