diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index faea911c8..436fdab41 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,8 @@ variables:
   RESET_CHECK: "false"
   UPGRADE_TEST: "false"
   LOG_LEVEL: "-vv"
+  RECOVER_CONTROL_PLANE_TEST: "false"
+  RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube-master[1:]"
 
 before_script:
   - ./tests/scripts/rebase.sh
diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml
index 9aa398ee3..86164c392 100644
--- a/.gitlab-ci/packet.yml
+++ b/.gitlab-ci/packet.yml
@@ -124,3 +124,19 @@ packet_amazon-linux-2-aio:
   stage: deploy-part2
   extends: .packet
   when: manual
+
+packet_ubuntu18-calico-ha-recover:
+  stage: deploy-part2
+  extends: .packet
+  when: on_success
+  variables:
+    RECOVER_CONTROL_PLANE_TEST: "true"
+    RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube-master[1:]"
+
+packet_ubuntu18-calico-ha-recover-noquorum:
+  stage: deploy-part2
+  extends: .packet
+  when: on_success
+  variables:
+    RECOVER_CONTROL_PLANE_TEST: "true"
+    RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[1:],kube-master[1:]"
diff --git a/docs/recover-control-plane.md b/docs/recover-control-plane.md
index 90f789589..d24a4c73a 100644
--- a/docs/recover-control-plane.md
+++ b/docs/recover-control-plane.md
@@ -17,37 +17,23 @@ Examples of what broken means in this context:
 
 __Note that you need at least one functional node to be able to recover using this method.__
 
-## If etcd quorum is intact
+## Runbook
 
-* Set the etcd member names of the broken node(s) in the variable "old\_etcd\_members", this variable is used to remove the broken nodes from the etcd cluster.
-```old_etcd_members=etcd2,etcd3```
-* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
-```old_etcds=etcd2.example.com,etcd3.example.com```
-* If you would like to remove the broken node objects from the kubernetes cluster add their inventory names to the variable "old\_kube\_masters"
-```old_kube_masters=master2.example.com,master3.example.com```
+* Move any broken etcd nodes into the "broken\_etcd" group and make sure the "etcd\_member\_name" variable is set for them.
+* Move any broken master nodes into the "broken\_kube-master" group.
 
-Then run the playbook with ```--limit etcd,kube-master```
+Then run the playbook with ```--limit etcd,kube-master``` and increase the number of etcd retries by setting ```-e etcd_retries=10``` or an even larger value; the number of retries required is difficult to predict.
 
-When finished you should have a fully working and highly available control plane again.
+When finished you should have a fully working control plane again.
 
-## If etcd quorum is lost
+## Recover from lost quorum
 
-* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
-```old_etcds=etcd2.example.com,etcd3.example.com```
-* If you would like to remove the broken node objects from the kubernetes cluster add their inventory names to the variable "old\_kube\_masters"
-```old_kube_masters=master2.example.com,master3.example.com```
+The playbook attempts to figure out if the etcd quorum is intact. If quorum is lost it will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot, set the path to that snapshot in the "etcd\_snapshot" variable.
 
-Then run the playbook with ```--limit etcd,kube-master```
-
-When finished you should have a fully working and highly available control plane again.
-
-The playbook will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot set the path to that snapshot in the "etcd\_snapshot" variable.
-
-```etcd_snapshot=/tmp/etcd_snapshot```
+```-e etcd_snapshot=/tmp/etcd_snapshot```
 
 ## Caveats
 
-* The playbook has only been tested on control planes where the etcd and kube-master nodes are the same, the playbook will warn if run on a cluster with separate etcd and kube-master nodes.
 * The playbook has only been tested with fairly small etcd databases.
 * If your new control plane nodes have new ip addresses you may have to change settings in various places.
 * There may be disruptions while running the playbook.
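Reviewer note: for anyone exercising the reworked runbook by hand, the invocation implied by the doc above looks roughly like the following sketch (the inventory path and hostnames are placeholders; `etcd_retries=10` is the value the doc suggests, and the CI script further down uses the same flags):

```sh
# Broken hosts are already listed in the broken_etcd / broken_kube-master
# inventory groups, with etcd_member_name set on each broken etcd node.
ansible-playbook -i inventory/mycluster/hosts.ini \
  --limit etcd,kube-master \
  -e etcd_retries=10 \
  recover-control-plane.yml
```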
diff --git a/recover-control-plane.yml b/recover-control-plane.yml
index cd6482efb..cd6bfde2b 100644
--- a/recover-control-plane.yml
+++ b/recover-control-plane.yml
@@ -22,7 +22,6 @@
 - hosts: "{{ groups['etcd'] | first }}"
   roles:
     - { role: kubespray-defaults}
-    - { role: recover_control_plane/pre-recover }
     - { role: recover_control_plane/etcd }
 
 - hosts: "{{ groups['kube-master'] | first }}"
diff --git a/roles/etcd/defaults/main.yml b/roles/etcd/defaults/main.yml
index 48a68b61c..ac38f6d7f 100644
--- a/roles/etcd/defaults/main.yml
+++ b/roles/etcd/defaults/main.yml
@@ -62,3 +62,6 @@ etcd_secure_client: true
 
 # Enable peer client cert authentication
 etcd_peer_client_auth: true
+
+# Number of loop retries
+etcd_retries: 4
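Since the retry count is now an ordinary role default rather than a hard-coded `4`, it can be tuned per inventory. A sketch of a group_vars override (the file path is hypothetical; any group_vars location that reaches the etcd hosts would work):

```yaml
# inventory/mycluster/group_vars/etcd.yml (hypothetical path)
# Give degraded clusters a larger retry budget for the cluster-health
# checks and member-add calls that now honor etcd_retries.
etcd_retries: 10
```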
diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml
index e3f9c31dd..d87917176 100644
--- a/roles/etcd/tasks/configure.yml
+++ b/roles/etcd/tasks/configure.yml
@@ -67,7 +67,7 @@
   shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_client_url }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_cluster_is_healthy
   until: etcd_cluster_is_healthy.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   ignore_errors: false
   changed_when: false
@@ -88,7 +88,7 @@
   shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_client_url }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_events_cluster_is_healthy
   until: etcd_events_cluster_is_healthy.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   ignore_errors: false
   changed_when: false
diff --git a/roles/etcd/tasks/install_docker.yml b/roles/etcd/tasks/install_docker.yml
index 7859134b0..6c38ad9f3 100644
--- a/roles/etcd/tasks/install_docker.yml
+++ b/roles/etcd/tasks/install_docker.yml
@@ -6,7 +6,7 @@
     {{ docker_bin_dir }}/docker rm -f etcdctl-binarycopy"
   register: etcd_task_result
   until: etcd_task_result.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   changed_when: false
   when: etcd_cluster_setup
diff --git a/roles/etcd/tasks/join_etcd-events_member.yml b/roles/etcd/tasks/join_etcd-events_member.yml
index b75460c41..0f214302e 100644
--- a/roles/etcd/tasks/join_etcd-events_member.yml
+++ b/roles/etcd/tasks/join_etcd-events_member.yml
@@ -3,7 +3,7 @@
   shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} member add {{ etcd_member_name }} {{ etcd_events_peer_url }}"
   register: member_add_result
   until: member_add_result.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   when: target_node == inventory_hostname
   environment:
diff --git a/roles/etcd/tasks/join_etcd_member.yml b/roles/etcd/tasks/join_etcd_member.yml
index d512eb78a..928d22642 100644
--- a/roles/etcd/tasks/join_etcd_member.yml
+++ b/roles/etcd/tasks/join_etcd_member.yml
@@ -3,7 +3,7 @@
   shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}"
   register: member_add_result
   until: member_add_result.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   when: target_node == inventory_hostname
   environment:
diff --git a/roles/recover_control_plane/etcd/tasks/main.yml b/roles/recover_control_plane/etcd/tasks/main.yml
index d1d2d1fa5..92c275a1f 100644
--- a/roles/recover_control_plane/etcd/tasks/main.yml
+++ b/roles/recover_control_plane/etcd/tasks/main.yml
@@ -1,7 +1,78 @@
 ---
-- include_tasks: prepare.yml
+- name: Get etcd endpoint health
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} endpoint health"
+  register: etcd_endpoint_health
+  ignore_errors: true
+  changed_when: false
+  check_mode: no
+  environment:
+    - ETCDCTL_API: 3
+  when:
+    - groups['broken_etcd']
+
+- name: Set healthy fact
+  set_fact:
+    healthy: "{{ etcd_endpoint_health.stderr | match('Error: unhealthy cluster') }}"
+  when:
+    - groups['broken_etcd']
+
+- name: Set has_quorum fact
+  set_fact:
+    has_quorum: "{{ etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length >= etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length }}"
 
 - include_tasks: recover_lost_quorum.yml
   when:
-    - has_etcdctl
-    - not etcd_cluster_is_healthy
+    - groups['broken_etcd']
+    - not has_quorum
+
+- name: Remove etcd data dir
+  file:
+    path: "{{ etcd_data_dir }}"
+    state: absent
+  delegate_to: "{{ item }}"
+  with_items: "{{ groups['broken_etcd'] }}"
+  when:
+    - groups['broken_etcd']
+    - has_quorum
+
+- name: Delete old certificates
+  # noqa 302 - rm is ok here for now
+  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
+  with_items: "{{ groups['broken_etcd'] }}"
+  register: delete_old_certificates
+  ignore_errors: true
+  when: groups['broken_etcd']
+
+- name: Fail if unable to delete old certificates
+  fail:
+    msg: "Unable to delete old certificates for: {{ item.item }}"
+  loop: "{{ delete_old_certificates.results }}"
+  changed_when: false
+  when:
+    - groups['broken_etcd']
+    - "item.rc != 0 and not 'No such file or directory' in item.stderr"
+
+- name: Get etcd cluster members
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem member list"
+  register: member_list
+  changed_when: false
+  check_mode: no
+  environment:
+    - ETCDCTL_API: 3
+  when:
+    - groups['broken_etcd']
+    - not healthy
+    - has_quorum
+
+- name: Remove broken cluster members
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
+  environment:
+    - ETCDCTL_API: 3
+  with_nested:
+    - "{{ groups['broken_etcd'] }}"
+    - "{{ member_list.stdout_lines }}"
+  when:
+    - groups['broken_etcd']
+    - not healthy
+    - has_quorum
+    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]
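The `split(',')` indexing in the last task relies on the comma-separated table that `etcdctl member list` prints under `ETCDCTL_API=3`: field 0 is the member ID that gets removed, and field 2 is the member name matched against each broken host's `etcd_member_name`. Illustrative output (IDs and addresses invented):

```sh
$ ETCDCTL_API=3 etcdctl member list
1609b5a3a1303755, started, etcd1, https://10.0.0.1:2380, https://10.0.0.1:2379
6c1b0e0f5bbb7be0, started, etcd2, https://10.0.0.2:2380, https://10.0.0.2:2379
```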
diff --git a/roles/recover_control_plane/etcd/tasks/prepare.yml b/roles/recover_control_plane/etcd/tasks/prepare.yml
deleted file mode 100644
index d3cacb934..000000000
--- a/roles/recover_control_plane/etcd/tasks/prepare.yml
+++ /dev/null
@@ -1,48 +0,0 @@
----
-- name: Delete old certificates
-  # noqa 302 - rm is ok here for now
-  shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*"
-  with_items: "{{ old_etcds.split(',') }}"
-  register: delete_old_cerificates
-  ignore_errors: true
-  when: old_etcds is defined
-
-- name: Fail if unable to delete old certificates
-  fail:
-    msg: "Unable to delete old certificates for: {{ item.item }}"
-  loop: "{{ delete_old_cerificates.results }}"
-  changed_when: false
-  when:
-    - old_etcds is defined
-    - "item.rc != 0 and not 'No such file or directory' in item.stderr"
-
-- name: Get etcd cluster members
-  shell: "{{ bin_dir }}/etcdctl member list"
-  register: member_list
-  changed_when: false
-  check_mode: no
-  environment:
-    - ETCDCTL_API: 3
-    - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem
-    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
-    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
-  when:
-    - has_etcdctl
-    - etcd_cluster_is_healthy
-    - old_etcd_members is defined
-
-- name: Remove old cluster members
-  shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
-  environment:
-    - ETCDCTL_API: 3
-    - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem
-    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
-    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
-  with_nested:
-    - "{{ old_etcd_members.split(',') }}"
-    - "{{ member_list.stdout_lines }}"
-  when:
-    - has_etcdctl
-    - etcd_cluster_is_healthy
-    - old_etcd_members is defined
-    - item[0] == item[1].replace(' ','').split(',')[2]
diff --git a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
index beb8b0daf..fdd9d0b5f 100644
--- a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
+++ b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
@@ -1,11 +1,8 @@
 ---
 - name: Save etcd snapshot
-  shell: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db"
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot save /tmp/snapshot.db"
   environment:
     - ETCDCTL_API: 3
-    - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem
-    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
-    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
   when: etcd_snapshot is not defined
 
 - name: Transfer etcd snapshot to host
@@ -25,12 +22,9 @@
     state: absent
 
 - name: Restore etcd snapshot
-  shell: "{{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db --name {{ etcd_member_name }} --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }} --initial-cluster-token k8s_etcd --initial-advertise-peer-urls {{ etcd_peer_url }} --data-dir {{ etcd_data_dir }}"
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot restore /tmp/snapshot.db --name {{ etcd_member_name }} --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }} --initial-cluster-token k8s_etcd --initial-advertise-peer-urls {{ etcd_peer_url }} --data-dir {{ etcd_data_dir }}"
   environment:
     - ETCDCTL_API: 3
-    - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem
-    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
-    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
 
 - name: Remove etcd snapshot
   file:
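After a restore it is worth spot-checking that the rebuilt member is serving before the remaining nodes re-join. A hedged manual check, reusing the cert paths from the tasks above (this assumes the default `etcd_cert_dir` of `/etc/ssl/etcd/ssl` and that the local hostname matches `inventory_hostname`):

```sh
# Ask the local member for its health; exits non-zero if it is not serving.
ETCDCTL_API=3 etcdctl \
  --cacert /etc/ssl/etcd/ssl/ca.pem \
  --cert "/etc/ssl/etcd/ssl/admin-$(hostname).pem" \
  --key "/etc/ssl/etcd/ssl/admin-$(hostname)-key.pem" \
  endpoint health
```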
diff --git a/roles/recover_control_plane/master/tasks/main.yml b/roles/recover_control_plane/master/tasks/main.yml
index f67742c85..71a094168 100644
--- a/roles/recover_control_plane/master/tasks/main.yml
+++ b/roles/recover_control_plane/master/tasks/main.yml
@@ -8,21 +8,22 @@
   retries: 6
   delay: 10
   changed_when: false
+  when: groups['broken_kube-master']
 
-- name: Delete old kube-master nodes from cluster
+- name: Delete broken kube-master nodes from cluster
   shell: "{{ bin_dir }}/kubectl delete node {{ item }}"
   environment:
     - KUBECONFIG: "{{ ansible_env.HOME | default('/root') }}/.kube/config"
-  with_items: "{{ old_kube_masters.split(',') }}"
-  register: delete_old_kube_masters
+  with_items: "{{ groups['broken_kube-master'] }}"
+  register: delete_broken_kube_masters
   failed_when: false
-  when: old_kube_masters is defined
+  when: groups['broken_kube-master']
 
-- name: Fail if unable to delete old kube-master nodes from cluster
+- name: Fail if unable to delete broken kube-master nodes from cluster
   fail:
-    msg: "Unable to delete old kube-master node: {{ item.item }}"
-  loop: "{{ delete_old_kube_masters.results }}"
+    msg: "Unable to delete broken kube-master node: {{ item.item }}"
+  loop: "{{ delete_broken_kube_masters.results }}"
   changed_when: false
   when:
-    - old_kube_masters is defined
+    - groups['broken_kube-master']
     - "item.rc != 0 and not 'NotFound' in item.stderr"
diff --git a/roles/recover_control_plane/pre-recover/defaults/main.yml b/roles/recover_control_plane/pre-recover/defaults/main.yml
deleted file mode 100644
index a1f72dea6..000000000
--- a/roles/recover_control_plane/pre-recover/defaults/main.yml
+++ /dev/null
@@ -1,2 +0,0 @@
----
-control_plane_is_converged: "{{ groups['etcd'] | sort == groups['kube-master'] | sort | bool }}"
diff --git a/roles/recover_control_plane/pre-recover/tasks/main.yml b/roles/recover_control_plane/pre-recover/tasks/main.yml
deleted file mode 100644
index 0b305ed9e..000000000
--- a/roles/recover_control_plane/pre-recover/tasks/main.yml
+++ /dev/null
@@ -1,36 +0,0 @@
----
-- name: Check for etcdctl binary
-  raw: "test -e {{ bin_dir }}/etcdctl"
-  register: test_etcdctl
-
-- name: Set has_etcdctl fact
-  set_fact:
-    has_etcdctl: "{{ test_etcdctl.rc == 0 | bool }}"
-
-- name: Check if etcd cluster is healthy
-  shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
-  register: etcd_cluster_health
-  ignore_errors: true
-  changed_when: false
-  check_mode: no
-  environment:
-    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
-    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
-    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
-  when: has_etcdctl
-
-- name: Set etcd_cluster_is_healthy fact
-  set_fact:
-    etcd_cluster_is_healthy: "{{ etcd_cluster_health.rc == 0 | bool }}"
-
-- name: Abort if etcd cluster is healthy and old_etcd_members is undefined
-  assert:
-    that: "{{ old_etcd_members is defined }}"
-    msg: "'old_etcd_members' must be defined when the etcd cluster has quorum."
-  when: etcd_cluster_is_healthy
-
-- name: Warn for untested recovery
-  debug:
-    msg: Control plane recovery of split control planes is UNTESTED! Abort or continue at your own risk.
-  delay: 30
-  when: not control_plane_is_converged
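With the pre-recover role and the `old_*` variables gone, the playbook is driven purely by inventory group membership. A minimal hand-written inventory following the new convention might look like this sketch (hostnames are examples; compare the CI inventory template below):

```ini
[kube-master]
master1.example.com
master2.example.com

[etcd]
etcd1.example.com etcd_member_name=etcd1
etcd2.example.com etcd_member_name=etcd2
etcd3.example.com etcd_member_name=etcd3

; Hosts listed here must also carry their etcd_member_name
[broken_etcd]
etcd3.example.com etcd_member_name=etcd3

[broken_kube-master]
master2.example.com
```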
diff --git a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
index 6f7d0cdf6..bf4e974e3 100644
--- a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
+++ b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
@@ -5,7 +5,7 @@
 
 - name: Set VM count needed for CI test_id
   set_fact:
-    vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}"
+    vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale', 'ha-recover', 'ha-recover-noquorum'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}"
 
 - import_tasks: create-vms.yml
   when:
diff --git a/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 b/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2
index 82293e0cd..b842c97a7 100644
--- a/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2
+++ b/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2
@@ -45,6 +45,45 @@ instance-1
 
 [vault]
 instance-1
+{% elif mode == "ha-recover" %}
+[kube-master]
+instance-1
+instance-2
+
+[kube-node]
+instance-3
+
+[etcd]
+instance-3
+instance-1
+instance-2
+
+[broken_kube-master]
+instance-2
+
+[broken_etcd]
+instance-2 etcd_member_name=etcd3
+{% elif mode == "ha-recover-noquorum" %}
+[kube-master]
+instance-3
+instance-1
+instance-2
+
+[kube-node]
+instance-3
+
+[etcd]
+instance-3
+instance-1
+instance-2
+
+[broken_kube-master]
+instance-1
+instance-2
+
+[broken_etcd]
+instance-1 etcd_member_name=etcd2
+instance-2 etcd_member_name=etcd3
 {% endif %}
 
 [k8s-cluster:children]
diff --git a/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml b/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml
new file mode 100644
index 000000000..a011af01f
--- /dev/null
+++ b/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml
@@ -0,0 +1,10 @@
+---
+# Instance settings
+cloud_image: ubuntu-1804
+mode: ha-recover-noquorum
+vm_memory: 1600Mi
+
+# Kubespray settings
+kube_network_plugin: calico
+deploy_netchecker: true
+dns_min_replicas: 1
diff --git a/tests/files/packet_ubuntu18-calico-ha-recover.yml b/tests/files/packet_ubuntu18-calico-ha-recover.yml
new file mode 100644
index 000000000..079440a30
--- /dev/null
+++ b/tests/files/packet_ubuntu18-calico-ha-recover.yml
@@ -0,0 +1,10 @@
+---
+# Instance settings
+cloud_image: ubuntu-1804
+mode: ha-recover
+vm_memory: 1600Mi
+
+# Kubespray settings
+kube_network_plugin: calico
+deploy_netchecker: true
+dns_min_replicas: 1
diff --git a/tests/scripts/testcases_run.sh b/tests/scripts/testcases_run.sh
index 69782f862..81df1a129 100755
--- a/tests/scripts/testcases_run.sh
+++ b/tests/scripts/testcases_run.sh
@@ -47,6 +47,12 @@ if [ "${UPGRADE_TEST}" != "false" ]; then
   ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} --limit "all:!fake_hosts" $PLAYBOOK
 fi
 
+# Test control plane recovery
+if [ "${RECOVER_CONTROL_PLANE_TEST}" != "false" ]; then
+  ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} --limit "${RECOVER_CONTROL_PLANE_TEST_GROUPS}:!fake_hosts" -e reset_confirmation=yes reset.yml
+  ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} -e etcd_retries=10 --limit etcd,kube-master:!fake_hosts recover-control-plane.yml
+fi
+
 # Tests Cases
 ## Test Master API
 ansible-playbook -e ansible_python_interpreter=${PYPATH} --limit "all:!fake_hosts" tests/testcases/010_check-apiserver.yml $LOG_LEVEL
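To reproduce what the new CI jobs do outside GitLab, roughly the following environment is needed before calling the script (the two values are copied from the `.gitlab-ci.yml` defaults above; the script also expects the usual `CI_TEST_VARS`, `PYPATH`, and `LOG_LEVEL` setup, which is omitted here):

```sh
# Break the tail of the etcd and kube-master groups, then run the recovery.
export RECOVER_CONTROL_PLANE_TEST="true"
export RECOVER_CONTROL_PLANE_TEST_GROUPS="etcd[2:],kube-master[1:]"
./tests/scripts/testcases_run.sh
```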
diff --git a/tests/templates/inventory-aws.j2 b/tests/templates/inventory-aws.j2
index 92f107f65..3ed86eb96 100644
--- a/tests/templates/inventory-aws.j2
+++ b/tests/templates/inventory-aws.j2
@@ -25,3 +25,9 @@ kube-master
 calico-rr
 
 [calico-rr]
+
+[broken_kube-master]
+node2
+
+[broken_etcd]
+node2
diff --git a/tests/templates/inventory-do.j2 b/tests/templates/inventory-do.j2
index 83a749afc..ab7d95220 100644
--- a/tests/templates/inventory-do.j2
+++ b/tests/templates/inventory-do.j2
@@ -29,6 +29,12 @@
 [vault]
 {{droplets.results[1].droplet.name}}
 {{droplets.results[2].droplet.name}}
+
+[broken_kube-master]
+{{droplets.results[1].droplet.name}}
+
+[broken_etcd]
+{{droplets.results[2].droplet.name}}
 {% else %}
 [kube-master]
 {{droplets.results[0].droplet.name}}
diff --git a/tests/templates/inventory-gce.j2 b/tests/templates/inventory-gce.j2
index 503bb4091..55f67deec 100644
--- a/tests/templates/inventory-gce.j2
+++ b/tests/templates/inventory-gce.j2
@@ -37,6 +37,13 @@
 {{node1}}
 {{node2}}
 {{node3}}
+
+[broken_kube-master]
+{{node2}}
+
+[broken_etcd]
+{{node2}}
+{{node3}}
 {% elif mode == "default" %}
 [kube-master]
 {{node1}}