Documentation and playbook for recovering control plane from node failure (#4146)

This commit is contained in:
qvicksilver 2019-04-29 10:40:20 +02:00 committed by Kubernetes Prow Robot
parent 9335cdcebc
commit 48a182844c
9 changed files with 287 additions and 0 deletions

View file

@ -0,0 +1,57 @@
Recovering the control plane
============================
To recover from broken nodes in the control plane use the "recover\-control\-plane.yml" playbook.
* Backup what you can
* Provision new nodes to replace the broken ones
* Place the surviving nodes of the control plane first in the "etcd" and "kube-master" groups
* Add the new nodes below the surviving control plane nodes in the "etcd" and "kube-master" groups
Examples of what broken means in this context:
* One or more bare metal node(s) suffer from unrecoverable hardware failure
* One or more node(s) fail during patching or upgrading
* Etcd database corruption
* Other node related failures leaving your control plane degraded or nonfunctional
__Note that you need at least one functional node to be able to recover using this method.__
## If etcd quorum is intact
* Set the etcd member names of the broken node(s) in the variable "old\_etcd\_members". This variable is used to remove the broken nodes from the etcd cluster.
```old_etcd_members=etcd2,etcd3```
* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
```old_etcds=etcd2.example.com,etcd3.example.com```
* If you would like to remove the broken node objects from the Kubernetes cluster, add their inventory names to the variable "old\_kube\_masters".
```old_kube_masters=master2.example.com,master3.example.com```
Then run the playbook with ```--limit etcd,kube-master```
When finished you should have a fully working and highly available control plane again.
## If etcd quorum is lost
* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
```old_etcds=etcd2.example.com,etcd3.example.com```
* If you would like to remove the broken node objects from the Kubernetes cluster, add their inventory names to the variable "old\_kube\_masters".
```old_kube_masters=master2.example.com,master3.example.com```
Then run the playbook with ```--limit etcd,kube-master```
When finished you should have a fully working and highly available control plane again.
The playbook will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot set the path to that snapshot in the "etcd\_snapshot" variable.
```etcd_snapshot=/tmp/etcd_snapshot```
## Caveats
* The playbook has only been tested on control planes where the etcd and kube-master nodes are the same; it will warn if run on a cluster with separate etcd and kube-master nodes.
* The playbook has only been tested with fairly small etcd databases.
* If your new control plane nodes have new IP addresses, you may have to change settings in various places.
* There may be disruptions while running the playbook.
* There are absolutely no guarantees.
If possible, first break a test cluster in the same way that your target cluster is broken and practice recovering it before attempting recovery on the real target cluster.

37
recover-control-plane.yml Normal file
View file

@ -0,0 +1,37 @@
---
# Pre-flight play: runs locally (hosts: localhost, local connection) and
# asserts that the Ansible version in use is acceptable before any other
# play is started.
- hosts: localhost
  gather_facts: false
  vars:
    ansible_connection: local
  tasks:
    - name: "Check ansible version !=2.7.0"
      tags:
        - check
      assert:
        that:
          # Exactly 2.7.0 is rejected, as is anything older than 2.6.0.
          - ansible_version.string is version("2.7.0", "!=")
          - ansible_version.string is version("2.6.0", ">=")
        msg: "Ansible V2.7.0 can't be used until: https://github.com/ansible/ansible/issues/46600 is fixed"
# Applies the default variables and the bastion SSH configuration role to
# the first host of the "bastion" group. Facts are not gathered here.
- hosts: bastion[0]
  gather_facts: false
  roles:
    - role: kubespray-defaults
    - role: bastion-ssh-config
      tags:
        - localhost
        - bastion
# Runs on the first host listed in the "etcd" inventory group: load the
# defaults, run the pre-recovery checks, then the etcd recovery role.
- hosts: "{{ groups['etcd'] | first }}"
  roles:
    - role: kubespray-defaults
    - role: recover_control_plane/pre-recover
    - role: recover_control_plane/etcd
# Runs on the first host listed in the "kube-master" inventory group.
# kubespray-defaults is loaded first, as in the other remote plays of this
# playbook, because the master role's tasks reference "{{ bin_dir }}" and
# must not depend on the inventory happening to define it.
- hosts: "{{ groups['kube-master'] | first }}"
  roles:
    - role: kubespray-defaults
    - role: recover_control_plane/master
# Re-run the regular cluster playbook once the first etcd member and the
# first master have been recovered. Playbook-level "include" is deprecated;
# "import_playbook" is the supported form on the Ansible versions this
# playbook accepts (>= 2.6).
- import_playbook: cluster.yml
# Final play: targets every host in the "kube-master" group and applies the
# post-recovery role after loading the defaults.
- hosts: "{{ groups['kube-master'] }}"
  roles:
    - role: kubespray-defaults
    - role: recover_control_plane/post-recover

View file

@ -0,0 +1,7 @@
---
# prepare.yml is always included.
- include_tasks: prepare.yml

# The lost-quorum procedure is only included when etcdctl is available and
# the cluster health check did not report a healthy cluster.
- when:
    - has_etcdctl
    - not etcd_cluster_is_healthy
  include_tasks: recover_lost_quorum.yml

View file

@ -0,0 +1,47 @@
---
# Remove certificate files whose names contain the inventory name of each
# host listed in "old_etcds" (comma separated). Errors are ignored here and
# judged by the next task.
- name: Delete old certificates
  when: old_etcds is defined
  with_items: "{{ old_etcds.split(',') }}"
  shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*"
  ignore_errors: true
  register: delete_old_certificates

# A non-zero exit is only fatal when it was caused by something other than
# the files already being absent.
- name: Fail if unable to delete old certificates
  when:
    - old_etcds is defined
    - "item.rc != 0 and 'No such file or directory' not in item.stderr"
  loop: "{{ delete_old_certificates.results }}"
  changed_when: false
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
# List the current etcd members with the v3 API; the output is registered as
# "member_list". Only runs when etcdctl exists, the cluster is healthy and
# "old_etcd_members" was supplied.
- name: Get etcd cluster members
  shell: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  changed_when: false
  check_mode: false
  environment:
    ETCDCTL_API: "3"
    # The v3 API reads the CA bundle from ETCDCTL_CACERT; ETCDCTL_CA_FILE is
    # the v2 variable name and is not honoured when ETCDCTL_API=3.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
# Pair every name in "old_etcd_members" with every line of the registered
# member list. Each line is split on commas after removing spaces: field 2
# is compared with the requested name and field 0 is passed to
# "member remove".
- name: Remove old cluster members
  shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
  environment:
    ETCDCTL_API: "3"
    # The v3 API reads the CA bundle from ETCDCTL_CACERT; ETCDCTL_CA_FILE is
    # the v2 variable name and is not honoured when ETCDCTL_API=3.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  with_nested:
    - "{{ old_etcd_members.split(',') }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
    - item[0] == item[1].replace(' ','').split(',')[2]

View file

@ -0,0 +1,54 @@
---
# Take a fresh snapshot to /tmp/snapshot.db unless the caller supplied one
# through "etcd_snapshot".
- name: Save etcd snapshot
  shell: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db"
  environment:
    ETCDCTL_API: "3"
    # The v3 API reads the CA bundle from ETCDCTL_CACERT; ETCDCTL_CA_FILE is
    # the v2 variable name and is not honoured when ETCDCTL_API=3.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
  when: etcd_snapshot is not defined
# When "etcd_snapshot" is set, copy that file to the same path the
# "Save etcd snapshot" task would have written.
- name: Transfer etcd snapshot to host
  when: etcd_snapshot is defined
  copy:
    dest: /tmp/snapshot.db
    src: "{{ etcd_snapshot }}"

# Stop the etcd unit.
- name: Stop etcd
  systemd:
    state: stopped
    name: etcd
# Delete the existing data directory so the snapshot can be restored into a
# clean location. The file module removes the tree recursively, reports
# "changed" only when something was actually deleted, and avoids building an
# "rm -rf" shell command from a templated path.
- name: Remove etcd data-dir
  file:
    path: "{{ etcd_data_dir }}"
    state: absent
# Restore /tmp/snapshot.db into "etcd_data_dir" as a cluster whose only
# initial member is this host. The folded scalar joins the lines below with
# single spaces, yielding one command line.
- name: Restore etcd snapshot
  shell: >-
    {{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db
    --name {{ etcd_member_name }}
    --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }}
    --initial-cluster-token k8s_etcd
    --initial-advertise-peer-urls {{ etcd_peer_url }}
    --data-dir {{ etcd_data_dir }}
  environment:
    ETCDCTL_API: "3"
    # The v3 API reads the CA bundle from ETCDCTL_CACERT; ETCDCTL_CA_FILE is
    # the v2 variable name and is not honoured when ETCDCTL_API=3.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
# The temporary snapshot file is deleted at this point.
- name: Remove etcd snapshot
  file:
    state: absent
    path: /tmp/snapshot.db

# Set etcd:etcd ownership on the data directory, recursively.
- name: Change etcd data-dir owner
  file:
    path: "{{ etcd_data_dir }}"
    recurse: true
    owner: etcd
    group: etcd

# Rewrite the ETCD_INITIAL_CLUSTER line in /etc/etcd.env so that it lists
# only this member.
- name: Reconfigure etcd
  replace:
    path: /etc/etcd.env
    regexp: '^(ETCD_INITIAL_CLUSTER=).*'
    replace: '\1{{ etcd_member_name }}={{ etcd_peer_url }}'

# Bring the etcd unit back up.
- name: Start etcd
  systemd:
    state: started
    name: etcd

View file

@ -0,0 +1,28 @@
---
# Poll "kubectl get nodes" until it exits 0: up to 6 retries, 10 seconds
# apart. The command is read-only, so it never reports a change.
- name: Wait for apiserver
  environment:
    KUBECONFIG: /root/.kube/config
  shell: "{{ bin_dir }}/kubectl get nodes"
  changed_when: false
  register: apiserver_is_ready
  until: apiserver_is_ready.rc == 0
  delay: 10
  retries: 6
# Delete the node object of every host named in "old_kube_masters" (comma
# separated). Failures are suppressed here and judged by the next task.
- name: Delete old kube-master nodes from cluster
  when: old_kube_masters is defined
  with_items: "{{ old_kube_masters.split(',') }}"
  environment:
    KUBECONFIG: /root/.kube/config
  shell: "{{ bin_dir }}/kubectl delete node {{ item }}"
  failed_when: false
  register: delete_old_kube_masters

# A non-zero exit is only fatal when stderr does not contain "NotFound".
- name: Fail if unable to delete old kube-master nodes from cluster
  when:
    - old_kube_masters is defined
    - "item.rc != 0 and 'NotFound' not in item.stderr"
  loop: "{{ delete_old_kube_masters.results }}"
  changed_when: false
  fail:
    msg: "Unable to delete old kube-master node: {{ item.item }}"

View file

@ -0,0 +1,19 @@
---
# TODO: Figure out why kubeadm does not fix this
# Build a comma-separated list of https://<address>:2379 URLs, one per host
# in the "etcd" group. The address is access_ip, falling back to ip, then to
# the default IPv4 address. Jinja whitespace control strips every space and
# newline between the pieces, so the result is a single unbroken string.
- name: Set etcd-servers fact
  set_fact:
    etcd_servers: >-
      {%- for host in groups['etcd'] -%}
      https://{{ hostvars[host].access_ip | default(hostvars[host].ip | default(hostvars[host].ansible_default_ipv4['address'])) }}:2379
      {%- if not loop.last -%},{%- endif -%}
      {%- endfor -%}
# Replace whatever follows "etcd-servers=" in the kube-apiserver manifest
# with the value of "etcd_servers".
- name: Update apiserver etcd-servers list
  replace:
    regexp: '(etcd-servers=).*'
    replace: '\1{{ etcd_servers }}'
    path: /etc/kubernetes/manifests/kube-apiserver.yaml

View file

@ -0,0 +1,2 @@
---
# True when the "etcd" and "kube-master" groups contain exactly the same
# hosts. Both sides are parenthesised because Jinja filters bind tighter
# than "==": without the parentheses a trailing "| bool" applies only to
# the right-hand list, and the comparison can never be true.
control_plane_is_converged: "{{ (groups['etcd'] | sort) == (groups['kube-master'] | sort) }}"

View file

@ -0,0 +1,36 @@
---
# Probe for the etcdctl binary. A missing binary is an expected outcome that
# is recorded in "has_etcdctl", so the non-zero exit of "test" must not fail
# the task, and the probe itself never changes anything.
- name: Check for etcdctl binary
  raw: "test -e {{ bin_dir }}/etcdctl"
  register: test_etcdctl
  failed_when: false
  changed_when: false

# Compare the exit code directly; a trailing "| bool" would bind to the
# literal 0 rather than to the comparison.
- name: Set has_etcdctl fact
  set_fact:
    has_etcdctl: "{{ test_etcdctl.rc == 0 }}"
# Ask etcdctl for the cluster health and grep for the healthy marker; the
# exit code is the result. Errors are ignored because an unhealthy cluster
# is a valid state for this playbook to handle.
- name: Check if etcd cluster is healthy
  shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
  register: etcd_cluster_health
  ignore_errors: true
  changed_when: false
  check_mode: false
  environment:
    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
  when: has_etcdctl

# When the check above was skipped, the registered result has no "rc";
# treat that as "not healthy" instead of failing on an undefined attribute.
- name: Set etcd_cluster_is_healthy fact
  set_fact:
    etcd_cluster_is_healthy: "{{ etcd_cluster_health.rc is defined and etcd_cluster_health.rc == 0 }}"
# With a healthy cluster the playbook needs to know which members to remove.
# "that" takes a bare expression, like "when"; wrapping it in "{{ }}" would
# make a failure report only "False" instead of the failed expression.
- name: Abort if etcd cluster is healthy and old_etcd_members is undefined
  assert:
    that:
      - old_etcd_members is defined
    msg: "'old_etcd_members' must be defined when the etcd cluster has quorum."
  when: etcd_cluster_is_healthy
# Show the warning and hold the run for 30 seconds so the operator can abort
# (Ctrl-C) before anything is modified. The "delay" keyword only takes
# effect together with "until", so on a plain debug task it never paused.
- name: Warn for untested recovery
  pause:
    prompt: "Control plane recovery of split control planes is UNTESTED! Abort or continue at your own risk."
    seconds: 30
  when: not control_plane_is_converged