From ac2135e450c731be0d855317cc32d8d8d1e1bf92 Mon Sep 17 00:00:00 2001
From: qvicksilver
Date: Tue, 11 Feb 2020 10:38:01 +0100
Subject: [PATCH] Fix recover-control-plane to work with etcd 3.3.x and add CI
 (#5500)

* Fix recover-control-plane to work with etcd 3.3.x and add CI

* Set default values for testcase

* Add actual test jobs

* Attempt to satisfy gitlab ci linter

* Fix ansible targets

* Set etcd_member_name as stated in the docs...

* Recovering from 0 masters is not supported yet

* Add other master to broken_kube-master group as well

* Increase number of retries to see if etcd needs more time to heal

* Make number of retries for ETCD loops configurable, increase it for recovery CI and document it
---
 .gitlab-ci.yml                                |  2 +
 .gitlab-ci/packet.yml                         | 16 ++++
 docs/recover-control-plane.md                 | 30 ++------
 recover-control-plane.yml                     |  1 -
 roles/etcd/defaults/main.yml                  |  3 +
 roles/etcd/tasks/configure.yml                |  4 +-
 roles/etcd/tasks/install_docker.yml           |  2 +-
 roles/etcd/tasks/join_etcd-events_member.yml  |  2 +-
 roles/etcd/tasks/join_etcd_member.yml         |  2 +-
 .../recover_control_plane/etcd/tasks/main.yml | 77 ++++++++++++++++++-
 .../etcd/tasks/prepare.yml                    | 48 ------------
 .../etcd/tasks/recover_lost_quorum.yml        | 10 +--
 .../master/tasks/main.yml                     | 17 ++--
 .../pre-recover/defaults/main.yml             |  2 -
 .../pre-recover/tasks/main.yml                | 36 ---------
 .../roles/packet-ci/tasks/main.yml            |  2 +-
 .../roles/packet-ci/templates/inventory.j2    | 39 ++++++++++
 ...et_ubuntu18-calico-ha-recover-noquorum.yml | 10 +++
 .../packet_ubuntu18-calico-ha-recover.yml     | 10 +++
 tests/scripts/testcases_run.sh                |  6 ++
 tests/templates/inventory-aws.j2              |  6 ++
 tests/templates/inventory-do.j2               |  6 ++
 tests/templates/inventory-gce.j2              |  7 ++
 23 files changed, 204 insertions(+), 134 deletions(-)
 delete mode 100644 roles/recover_control_plane/etcd/tasks/prepare.yml
 delete mode 100644 roles/recover_control_plane/pre-recover/defaults/main.yml
 delete mode 100644 roles/recover_control_plane/pre-recover/tasks/main.yml
 create mode 100644 tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml
 create mode 100644 tests/files/packet_ubuntu18-calico-ha-recover.yml

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index faea911c8..436fdab41 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,8 @@ variables:
   RESET_CHECK: "false"
   UPGRADE_TEST: "false"
   LOG_LEVEL: "-vv"
+  RECOVER_CONTROL_PLANE_TEST: "false"
+  RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube-master[1:]"

 before_script:
   - ./tests/scripts/rebase.sh
diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml
index 9aa398ee3..86164c392 100644
--- a/.gitlab-ci/packet.yml
+++ b/.gitlab-ci/packet.yml
@@ -124,3 +124,19 @@ packet_amazon-linux-2-aio:
   stage: deploy-part2
   extends: .packet
   when: manual
+
+packet_ubuntu18-calico-ha-recover:
+  stage: deploy-part2
+  extends: .packet
+  when: on_success
+  variables:
+    RECOVER_CONTROL_PLANE_TEST: "true"
+    RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube-master[1:]"
+
+packet_ubuntu18-calico-ha-recover-noquorum:
+  stage: deploy-part2
+  extends: .packet
+  when: on_success
+  variables:
+    RECOVER_CONTROL_PLANE_TEST: "true"
+    RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[1:],kube-master[1:]"
diff --git a/docs/recover-control-plane.md b/docs/recover-control-plane.md
index 90f789589..d24a4c73a 100644
--- a/docs/recover-control-plane.md
+++ b/docs/recover-control-plane.md
@@ -17,37 +17,23 @@ Examples of what broken means in this context:

 __Note that you need at least one functional node to be able to recover using this method.__

-## If etcd quorum is intact
+## Runbook

-* Set the etcd member names of the broken node(s) in the variable "old\_etcd\_members", this variable is used to remove the broken nodes from the etcd cluster.
-```old_etcd_members=etcd2,etcd3```
-* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
-```old_etcds=etcd2.example.com,etcd3.example.com```
-* If you would like to remove the broken node objects from the kubernetes cluster add their inventory names to the variable "old\_kube\_masters"
-```old_kube_masters=master2.example.com,master3.example.com```
+* Move any broken etcd nodes into the "broken\_etcd" group and make sure the "etcd\_member\_name" variable is set.
+* Move any broken master nodes into the "broken\_kube-master" group.

-Then run the playbook with ```--limit etcd,kube-master```
+Then run the playbook with ```--limit etcd,kube-master``` and increase the number of ETCD retries by setting ```-e etcd_retries=10``` or something even larger. The number of retries required is difficult to predict.

-When finished you should have a fully working and highly available control plane again.
+When finished you should have a fully working control plane again.

-## If etcd quorum is lost
+## Recover from lost quorum

-* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
-```old_etcds=etcd2.example.com,etcd3.example.com```
-* If you would like to remove the broken node objects from the kubernetes cluster add their inventory names to the variable "old\_kube\_masters"
-```old_kube_masters=master2.example.com,master3.example.com```
+The playbook attempts to figure out if the etcd quorum is intact. If quorum is lost, it will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot, set the path to that snapshot in the "etcd\_snapshot" variable.

-Then run the playbook with ```--limit etcd,kube-master```
-
-When finished you should have a fully working and highly available control plane again.
-
-The playbook will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot set the path to that snapshot in the "etcd\_snapshot" variable.
-
-```etcd_snapshot=/tmp/etcd_snapshot```
+```-e etcd_snapshot=/tmp/etcd_snapshot```

 ## Caveats

-* The playbook has only been tested on control planes where the etcd and kube-master nodes are the same, the playbook will warn if run on a cluster with separate etcd and kube-master nodes.
 * The playbook has only been tested with fairly small etcd databases.
 * If your new control plane nodes have new ip addresses you may have to change settings in various places.
 * There may be disruptions while running the playbook.
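To make the runbook above concrete, here is a minimal sketch of the inventory changes for a three-node converged control plane where `node2` has died. All hostnames and the inventory path are hypothetical; the `broken_etcd`/`broken_kube-master` group names, `etcd_member_name`, `etcd_retries` and the `--limit` come straight from this patch:

```ini
# inventory/mycluster/hosts.ini (hypothetical layout)
[etcd]
node1 etcd_member_name=etcd1
node2 etcd_member_name=etcd2
node3 etcd_member_name=etcd3

[kube-master]
node1
node2
node3

# Broken nodes are listed again in the recovery groups;
# etcd_member_name must match the name the member was registered with.
[broken_etcd]
node2 etcd_member_name=etcd2

[broken_kube-master]
node2
```

```sh
ansible-playbook -i inventory/mycluster/hosts.ini \
  -e etcd_retries=10 \
  --limit etcd,kube-master \
  recover-control-plane.yml
```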
diff --git a/recover-control-plane.yml b/recover-control-plane.yml index cd6482efb..cd6bfde2b 100644 --- a/recover-control-plane.yml +++ b/recover-control-plane.yml @@ -22,7 +22,6 @@ - hosts: "{{ groups['etcd'] | first }}" roles: - { role: kubespray-defaults} - - { role: recover_control_plane/pre-recover } - { role: recover_control_plane/etcd } - hosts: "{{ groups['kube-master'] | first }}" diff --git a/roles/etcd/defaults/main.yml b/roles/etcd/defaults/main.yml index 48a68b61c..ac38f6d7f 100644 --- a/roles/etcd/defaults/main.yml +++ b/roles/etcd/defaults/main.yml @@ -62,3 +62,6 @@ etcd_secure_client: true # Enable peer client cert authentication etcd_peer_client_auth: true + +# Number of loop retries +etcd_retries: 4 diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml index e3f9c31dd..d87917176 100644 --- a/roles/etcd/tasks/configure.yml +++ b/roles/etcd/tasks/configure.yml @@ -67,7 +67,7 @@ shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_client_url }} cluster-health | grep -q 'cluster is healthy'" register: etcd_cluster_is_healthy until: etcd_cluster_is_healthy.rc == 0 - retries: 4 + retries: "{{ etcd_retries }}" delay: "{{ retry_stagger | random + 3 }}" ignore_errors: false changed_when: false @@ -88,7 +88,7 @@ shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_client_url }} cluster-health | grep -q 'cluster is healthy'" register: etcd_events_cluster_is_healthy until: etcd_events_cluster_is_healthy.rc == 0 - retries: 4 + retries: "{{ etcd_retries }}" delay: "{{ retry_stagger | random + 3 }}" ignore_errors: false changed_when: false diff --git a/roles/etcd/tasks/install_docker.yml b/roles/etcd/tasks/install_docker.yml index 7859134b0..6c38ad9f3 100644 --- a/roles/etcd/tasks/install_docker.yml +++ b/roles/etcd/tasks/install_docker.yml @@ -6,7 +6,7 @@ {{ docker_bin_dir }}/docker rm -f etcdctl-binarycopy" register: etcd_task_result until: etcd_task_result.rc == 0 - retries: 4 + retries: "{{ etcd_retries }}" delay: "{{ retry_stagger | random + 3 }}" changed_when: false when: etcd_cluster_setup diff --git a/roles/etcd/tasks/join_etcd-events_member.yml b/roles/etcd/tasks/join_etcd-events_member.yml index b75460c41..0f214302e 100644 --- a/roles/etcd/tasks/join_etcd-events_member.yml +++ b/roles/etcd/tasks/join_etcd-events_member.yml @@ -3,7 +3,7 @@ shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} member add {{ etcd_member_name }} {{ etcd_events_peer_url }}" register: member_add_result until: member_add_result.rc == 0 - retries: 4 + retries: "{{ etcd_retries }}" delay: "{{ retry_stagger | random + 3 }}" when: target_node == inventory_hostname environment: diff --git a/roles/etcd/tasks/join_etcd_member.yml b/roles/etcd/tasks/join_etcd_member.yml index d512eb78a..928d22642 100644 --- a/roles/etcd/tasks/join_etcd_member.yml +++ b/roles/etcd/tasks/join_etcd_member.yml @@ -3,7 +3,7 @@ shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}" register: member_add_result until: member_add_result.rc == 0 - retries: 4 + retries: "{{ etcd_retries }}" delay: "{{ retry_stagger | random + 3 }}" when: target_node == inventory_hostname environment: diff --git a/roles/recover_control_plane/etcd/tasks/main.yml b/roles/recover_control_plane/etcd/tasks/main.yml index d1d2d1fa5..92c275a1f 100644 --- a/roles/recover_control_plane/etcd/tasks/main.yml +++ b/roles/recover_control_plane/etcd/tasks/main.yml @@ -1,7 +1,78 @@ --- -- include_tasks: prepare.yml +- name: Get etcd 
endpoint health
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} endpoint health"
+  register: etcd_endpoint_health
+  ignore_errors: true
+  changed_when: false
+  check_mode: no
+  environment:
+    - ETCDCTL_API: 3
+  when:
+    - groups['broken_etcd']
+
+- name: Set healthy fact
+  set_fact:
+    healthy: "{{ etcd_endpoint_health.stderr | match('Error: unhealthy cluster') | ternary(false, true) }}"
+  when:
+    - groups['broken_etcd']
+
+- name: Set has_quorum fact
+  set_fact:
+    has_quorum: "{{ etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length >= etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length }}"

 - include_tasks: recover_lost_quorum.yml
   when:
-    - has_etcdctl
-    - not etcd_cluster_is_healthy
+    - groups['broken_etcd']
+    - not has_quorum
+
+- name: Remove etcd data dir
+  file:
+    path: "{{ etcd_data_dir }}"
+    state: absent
+  delegate_to: "{{ item }}"
+  with_items: "{{ groups['broken_etcd'] }}"
+  when:
+    - groups['broken_etcd']
+    - has_quorum
+
+- name: Delete old certificates
+  # noqa 302 - rm is ok here for now
+  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
+  with_items: "{{ groups['broken_etcd'] }}"
+  register: delete_old_certificates
+  ignore_errors: true
+  when: groups['broken_etcd']
+
+- name: Fail if unable to delete old certificates
+  fail:
+    msg: "Unable to delete old certificates for: {{ item.item }}"
+  loop: "{{ delete_old_certificates.results }}"
+  changed_when: false
+  when:
+    - groups['broken_etcd']
+    - "item.rc != 0 and not 'No such file or directory' in item.stderr"
+
+- name: Get etcd cluster members
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem member list"
+  register: member_list
+  changed_when: false
+  check_mode: no
+  environment:
+    - ETCDCTL_API: 3
+  when:
+    - groups['broken_etcd']
+    - not healthy
+    - has_quorum
+
+- name: Remove broken cluster members
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
+  environment:
+    - ETCDCTL_API: 3
+  with_nested:
+    - "{{ groups['broken_etcd'] }}"
+    - "{{ member_list.stdout_lines }}"
+  when:
+    - groups['broken_etcd']
+    - not healthy
+    - has_quorum
+    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]
diff --git a/roles/recover_control_plane/etcd/tasks/prepare.yml b/roles/recover_control_plane/etcd/tasks/prepare.yml
deleted file mode 100644
index d3cacb934..000000000
--- a/roles/recover_control_plane/etcd/tasks/prepare.yml
+++ /dev/null
@@ -1,48 +0,0 @@
----
-- name: Delete old certificates
-  # noqa 302 - rm is ok here for now
-  shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*"
-  with_items: "{{ old_etcds.split(',') }}"
-  register: delete_old_cerificates
-  ignore_errors: true
-  when: old_etcds is defined
-
-- name: Fail if unable to delete old certificates
-  fail:
-    msg: "Unable to delete old certificates for: {{ item.item }}"
-  loop: "{{ delete_old_cerificates.results }}"
-  changed_when: false
-  when:
-    - old_etcds is defined
-    - "item.rc != 0 and not 'No such file or 
directory' in item.stderr" - -- name: Get etcd cluster members - shell: "{{ bin_dir }}/etcdctl member list" - register: member_list - changed_when: false - check_mode: no - environment: - - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem - - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem" - - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem" - when: - - has_etcdctl - - etcd_cluster_is_healthy - - old_etcd_members is defined - -- name: Remove old cluster members - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}" - environment: - - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem - - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem" - - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem" - with_nested: - - "{{ old_etcd_members.split(',') }}" - - "{{ member_list.stdout_lines }}" - when: - - has_etcdctl - - etcd_cluster_is_healthy - - old_etcd_members is defined - - item[0] == item[1].replace(' ','').split(',')[2] diff --git a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml index beb8b0daf..fdd9d0b5f 100644 --- a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml +++ b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml @@ -1,11 +1,8 @@ --- - name: Save etcd snapshot - shell: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db" + shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot save /tmp/snapshot.db" environment: - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem - - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem" - - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem" when: etcd_snapshot is not defined - name: Transfer etcd snapshot to host @@ -25,12 +22,9 @@ state: absent - name: Restore etcd snapshot - shell: "{{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db --name {{ etcd_member_name }} --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }} --initial-cluster-token k8s_etcd --initial-advertise-peer-urls {{ etcd_peer_url }} --data-dir {{ etcd_data_dir }}" + shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot restore /tmp/snapshot.db --name {{ etcd_member_name }} --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }} --initial-cluster-token k8s_etcd --initial-advertise-peer-urls {{ etcd_peer_url }} --data-dir {{ etcd_data_dir }}" environment: - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem - - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem" - - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem" - name: Remove etcd snapshot file: diff --git a/roles/recover_control_plane/master/tasks/main.yml b/roles/recover_control_plane/master/tasks/main.yml index f67742c85..71a094168 100644 --- a/roles/recover_control_plane/master/tasks/main.yml +++ b/roles/recover_control_plane/master/tasks/main.yml @@ -8,21 +8,22 @@ retries: 6 delay: 10 changed_when: false + when: groups['broken_kube-master'] -- name: Delete old kube-master nodes from cluster +- name: Delete broken kube-master nodes from 
cluster shell: "{{ bin_dir }}/kubectl delete node {{ item }}" environment: - KUBECONFIG: "{{ ansible_env.HOME | default('/root') }}/.kube/config" - with_items: "{{ old_kube_masters.split(',') }}" - register: delete_old_kube_masters + with_items: "{{ groups['broken_kube-master'] }}" + register: delete_broken_kube_masters failed_when: false - when: old_kube_masters is defined + when: groups['broken_kube-master'] -- name: Fail if unable to delete old kube-master nodes from cluster +- name: Fail if unable to delete broken kube-master nodes from cluster fail: - msg: "Unable to delete old kube-master node: {{ item.item }}" - loop: "{{ delete_old_kube_masters.results }}" + msg: "Unable to delete broken kube-master node: {{ item.item }}" + loop: "{{ delete_broken_kube_masters.results }}" changed_when: false when: - - old_kube_masters is defined + - groups['broken_kube-master'] - "item.rc != 0 and not 'NotFound' in item.stderr" diff --git a/roles/recover_control_plane/pre-recover/defaults/main.yml b/roles/recover_control_plane/pre-recover/defaults/main.yml deleted file mode 100644 index a1f72dea6..000000000 --- a/roles/recover_control_plane/pre-recover/defaults/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -control_plane_is_converged: "{{ groups['etcd'] | sort == groups['kube-master'] | sort | bool }}" diff --git a/roles/recover_control_plane/pre-recover/tasks/main.yml b/roles/recover_control_plane/pre-recover/tasks/main.yml deleted file mode 100644 index 0b305ed9e..000000000 --- a/roles/recover_control_plane/pre-recover/tasks/main.yml +++ /dev/null @@ -1,36 +0,0 @@ ---- -- name: Check for etcdctl binary - raw: "test -e {{ bin_dir }}/etcdctl" - register: test_etcdctl - -- name: Set has_etcdctl fact - set_fact: - has_etcdctl: "{{ test_etcdctl.rc == 0 | bool }}" - -- name: Check if etcd cluster is healthy - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'" - register: etcd_cluster_health - ignore_errors: true - changed_when: false - check_mode: no - environment: - ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" - ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" - ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem" - when: has_etcdctl - -- name: Set etcd_cluster_is_healthy fact - set_fact: - etcd_cluster_is_healthy: "{{ etcd_cluster_health.rc == 0 | bool }}" - -- name: Abort if etcd cluster is healthy and old_etcd_members is undefined - assert: - that: "{{ old_etcd_members is defined }}" - msg: "'old_etcd_members' must be defined when the etcd cluster has quorum." - when: etcd_cluster_is_healthy - -- name: Warn for untested recovery - debug: - msg: Control plane recovery of split control planes is UNTESTED! Abort or continue at your own risk. 
- delay: 30 - when: not control_plane_is_converged diff --git a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml index 6f7d0cdf6..bf4e974e3 100644 --- a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml +++ b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml @@ -5,7 +5,7 @@ - name: Set VM count needed for CI test_id set_fact: - vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}" + vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale', 'ha-recover', 'ha-recover-noquorum'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}" - import_tasks: create-vms.yml when: diff --git a/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 b/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 index 82293e0cd..b842c97a7 100644 --- a/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 +++ b/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 @@ -45,6 +45,45 @@ instance-1 [vault] instance-1 +{% elif mode == "ha-recover" %} +[kube-master] +instance-1 +instance-2 + +[kube-node] +instance-3 + +[etcd] +instance-3 +instance-1 +instance-2 + +[broken_kube-master] +instance-2 + +[broken_etcd] +instance-2 etcd_member_name=etcd3 +{% elif mode == "ha-recover-noquorum" %} +[kube-master] +instance-3 +instance-1 +instance-2 + +[kube-node] +instance-3 + +[etcd] +instance-3 +instance-1 +instance-2 + +[broken_kube-master] +instance-1 +instance-2 + +[broken_etcd] +instance-1 etcd_member_name=etcd2 +instance-2 etcd_member_name=etcd3 {% endif %} [k8s-cluster:children] diff --git a/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml b/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml new file mode 100644 index 000000000..a011af01f --- /dev/null +++ b/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml @@ -0,0 +1,10 @@ +--- +# Instance settings +cloud_image: ubuntu-1804 +mode: ha-recover-noquorum +vm_memory: 1600Mi + +# Kubespray settings +kube_network_plugin: calico +deploy_netchecker: true +dns_min_replicas: 1 diff --git a/tests/files/packet_ubuntu18-calico-ha-recover.yml b/tests/files/packet_ubuntu18-calico-ha-recover.yml new file mode 100644 index 000000000..079440a30 --- /dev/null +++ b/tests/files/packet_ubuntu18-calico-ha-recover.yml @@ -0,0 +1,10 @@ +--- +# Instance settings +cloud_image: ubuntu-1804 +mode: ha-recover +vm_memory: 1600Mi + +# Kubespray settings +kube_network_plugin: calico +deploy_netchecker: true +dns_min_replicas: 1 diff --git a/tests/scripts/testcases_run.sh b/tests/scripts/testcases_run.sh index 69782f862..81df1a129 100755 --- a/tests/scripts/testcases_run.sh +++ b/tests/scripts/testcases_run.sh @@ -47,6 +47,12 @@ if [ "${UPGRADE_TEST}" != "false" ]; then ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} --limit "all:!fake_hosts" $PLAYBOOK fi +# Test control plane recovery +if [ "${RECOVER_CONTROL_PLANE_TEST}" != "false" ]; then + ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} --limit "${RECOVER_CONTROL_PLANE_TEST_GROUPS}:!fake_hosts" -e reset_confirmation=yes reset.yml + ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} -e etcd_retries=10 --limit etcd,kube-master:!fake_hosts 
recover-control-plane.yml
+fi
+
 # Tests Cases
 ## Test Master API
 ansible-playbook -e ansible_python_interpreter=${PYPATH} --limit "all:!fake_hosts" tests/testcases/010_check-apiserver.yml $LOG_LEVEL
diff --git a/tests/templates/inventory-aws.j2 b/tests/templates/inventory-aws.j2
index 92f107f65..3ed86eb96 100644
--- a/tests/templates/inventory-aws.j2
+++ b/tests/templates/inventory-aws.j2
@@ -25,3 +25,9 @@ kube-master
 calico-rr

 [calico-rr]
+
+[broken_kube-master]
+node2
+
+[broken_etcd]
+node2
diff --git a/tests/templates/inventory-do.j2 b/tests/templates/inventory-do.j2
index 83a749afc..ab7d95220 100644
--- a/tests/templates/inventory-do.j2
+++ b/tests/templates/inventory-do.j2
@@ -29,6 +29,12 @@
 [vault]
 {{droplets.results[1].droplet.name}}
 {{droplets.results[2].droplet.name}}
+
+[broken_kube-master]
+{{droplets.results[1].droplet.name}}
+
+[broken_etcd]
+{{droplets.results[2].droplet.name}}
 {% else %}
 [kube-master]
 {{droplets.results[0].droplet.name}}
diff --git a/tests/templates/inventory-gce.j2 b/tests/templates/inventory-gce.j2
index 503bb4091..55f67deec 100644
--- a/tests/templates/inventory-gce.j2
+++ b/tests/templates/inventory-gce.j2
@@ -37,6 +37,13 @@
 {{node1}}
 {{node2}}
 {{node3}}
+
+[broken_kube-master]
+{{node2}}
+
+[broken_etcd]
+{{node2}}
+{{node3}}
 {% elif mode == "default" %}
 [kube-master]
 {{node1}}
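For reviewers unfamiliar with the etcdctl v3 output that the new "Remove broken cluster members" task parses: `member list` prints one comma-separated line per member in the form `ID, status, name, peer URLs, client URLs`, which is why the task takes `split(',')[0]` for the member ID and `split(',')[2]` to match against `etcd_member_name`. A rough manual equivalent, run on a surviving etcd node (the member ID, hostname and addresses are illustrative; the certificate paths follow the layout used elsewhere in this patch):

```sh
export ETCDCTL_API=3

# List members; output looks like:
# 8e9e05c52164694d, started, etcd2, https://10.0.0.2:2380, https://10.0.0.2:2379
etcdctl --cacert /etc/ssl/etcd/ssl/ca.pem \
        --cert /etc/ssl/etcd/ssl/admin-node1.pem \
        --key /etc/ssl/etcd/ssl/admin-node1-key.pem \
        member list

# Remove the member whose name matches the broken node's etcd_member_name
etcdctl --cacert /etc/ssl/etcd/ssl/ca.pem \
        --cert /etc/ssl/etcd/ssl/admin-node1.pem \
        --key /etc/ssl/etcd/ssl/admin-node1-key.pem \
        member remove 8e9e05c52164694d
```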