c12s-kubespray/roles/recover_control_plane/etcd/tasks/main.yml

---
# Probe the etcd endpoints with the v3 etcdctl API. A failing probe does not
# stop the play; the registered stdout/stderr is inspected by later tasks.
- name: Get etcd endpoint health
  shell: >-
    {{ bin_dir }}/etcdctl
    --cacert {{ etcd_cert_dir }}/ca.pem
    --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem
    --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem
    --endpoints={{ etcd_access_addresses }}
    endpoint health
  environment:
    ETCDCTL_API: "3"
  register: etcd_endpoint_health
  # Read-only probe: never reports a change and still runs in check mode.
  changed_when: false
  check_mode: false
  ignore_errors: true
  when:
    - groups['broken_etcd']

# Derive the `healthy` fact from the stderr of the endpoint health probe.
# NOTE(review): the fact becomes true when stderr begins with
# 'Error: unhealthy cluster', which reads as inverted relative to the fact's
# name. Later tasks gate on `not healthy`, so the semantics are deliberately
# left unchanged here - confirm the intent before flipping it.
- name: Set healthy fact
  set_fact:
    # `match` is a Jinja test and must be applied with `is`; applying tests
    # with the filter syntax (`| match(...)`) is no longer supported by Ansible.
    healthy: "{{ etcd_endpoint_health.stderr is match('Error: unhealthy cluster') }}"
  when:
    - groups['broken_etcd']

# `has_quorum` is true while at least as many probe stdout lines contain
# "is healthy" as probe stderr lines contain "is unhealthy".
- name: Set has_quorum fact
  set_fact:
    has_quorum: "{{ etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length >= etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length }}"
  # The health probe is skipped when broken_etcd is empty, in which case its
  # registered result carries no stdout_lines/stderr_lines. Apply the same
  # guard as the sibling tasks so this task is skipped instead of failing.
  when:
    - groups['broken_etcd']

# Run the tasks from recover_lost_quorum.yml only when broken members are
# listed and the has_quorum fact is false.
- name: Recover etcd cluster that lost quorum
  include_tasks: recover_lost_quorum.yml
  when:
    - groups['broken_etcd']
    - not has_quorum

# While has_quorum is true, delete the etcd data directory once per host in
# broken_etcd; each iteration is delegated to the host being iterated.
- name: Remove etcd data dir
  file:
    state: absent
    path: "{{ etcd_data_dir }}"
  loop: "{{ groups['broken_etcd'] }}"
  delegate_to: "{{ item }}"
  when:
    - groups['broken_etcd']
    - has_quorum

# Remove certificate files whose names contain a broken member's inventory
# name. Errors are tolerated here; the registered per-item results are
# evaluated by the task that follows. The registered variable name (sic) is
# kept as spelled because that task references it.
- name: Delete old certificates
  # noqa 302 - rm is ok here for now
  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
  loop: "{{ groups['broken_etcd'] }}"
  ignore_errors: true
  register: delete_old_cerificates
  when:
    - groups['broken_etcd']

# Abort for any certificate deletion that returned a non-zero code for a
# reason other than the files already being absent.
- name: Fail if unable to delete old certificates
  loop: "{{ delete_old_cerificates.results }}"
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  changed_when: false
  # Conditions are ANDed and checked in order, so stderr is only inspected
  # for results whose return code is non-zero.
  when:
    - groups['broken_etcd']
    - item.rc != 0
    - "'No such file or directory' not in item.stderr"

# Capture the etcdctl (v3 API) member list so broken members can be matched
# by name in the next task.
# NOTE(review): unlike the other etcdctl calls in this file, no --endpoints
# flag is passed, so etcdctl falls back to its default endpoint - confirm
# this is intended.
- name: Get etcd cluster members
  shell: >-
    {{ bin_dir }}/etcdctl
    --cacert {{ etcd_cert_dir }}/ca.pem
    --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem
    --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem
    member list
  environment:
    ETCDCTL_API: "3"
  register: member_list
  # Read-only query: never reports a change and still runs in check mode.
  changed_when: false
  check_mode: false
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum

# Remove each listed member whose name (third comma-separated field of a
# member-list line, spaces stripped) equals the etcd_member_name of a host in
# broken_etcd. The ID handed to `member remove` is the first field of that
# same line.
- name: Remove broken cluster members
  shell: >-
    {{ bin_dir }}/etcdctl
    --cacert {{ etcd_cert_dir }}/ca.pem
    --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem
    --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem
    --endpoints={{ etcd_access_addresses }}
    member remove {{ item[1].replace(' ','').split(',')[0] }}
  environment:
    ETCDCTL_API: "3"
  # item[0] is a broken_etcd host name, item[1] one line of member-list output.
  with_nested:
    - "{{ groups['broken_etcd'] }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum
    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]