Refactor remove node to allow removing dead nodes and etcd members (#5009)

Change-Id: I1c59249f08f16d0f6fd60df6ab61f17a0a7df189
Matthew Mosesohn 2019-08-07 14:46:50 +03:00 committed by Kubernetes Prow Robot
parent 7abf6a6958
commit a44235d11b
3 changed files with 70 additions and 17 deletions


@@ -51,20 +51,27 @@ You may want to add worker, master or etcd nodes to your existing cluster. This
 Remove nodes
 ------------
 
-You may want to remove **worker** nodes to your existing cluster. This can be done by re-running the `remove-node.yml` playbook. First, all nodes will be drained, then stop some kubernetes services and delete some certificates, and finally execute the kubectl command to delete these nodes. This can be combined with the add node function, This is generally helpful when doing something like autoscaling your clusters. Of course if a node is not working, you can remove the node and install it again.
+You may want to remove **master**, **worker**, or **etcd** nodes from your
+existing cluster. This can be done by re-running the `remove-node.yml`
+playbook. First, all specified nodes are drained; then some Kubernetes
+services are stopped and some certificates are deleted; finally, the
+kubectl command is executed to delete these nodes.
+This can be combined with the add node function; it is generally helpful
+when doing something like autoscaling your clusters. Of course, if a node
+is not working, you can remove it and install it again.
 
 Add the worker nodes you want to delete to the list under kube-node (or utilize a [dynamic inventory](https://docs.ansible.com/ansible/intro_dynamic_inventory.html)).
 
 ansible-playbook -i inventory/mycluster/hosts.yml remove-node.yml -b -v \
 --private-key=~/.ssh/private_key
 
-Use `--extra-vars "node=<nodename>,<nodename2>"` to select the node you want to delete.
+Use `--extra-vars "node=<nodename>,<nodename2>"` to select the node(s) you want to delete.
 
 ```
 ansible-playbook -i inventory/mycluster/hosts.yml remove-node.yml -b -v \
 --private-key=~/.ssh/private_key \
 --extra-vars "node=nodename,nodename2"
 ```
 
+If a node is completely unreachable by SSH, add `--extra-vars reset_nodes=no`
+to skip the node reset step. If one node is unavailable but the other nodes
+you wish to remove can be reached via SSH, you can instead set
+`reset_nodes=no` as a host var in the inventory.
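For illustration (an editor's sketch, not part of this commit's diff): a host var entry in a YAML inventory could look like the following, with the hostname `node5` and its address purely hypothetical. Because the reset role's condition is evaluated per host, the remaining nodes in the same run are still reset normally.

```
# inventory/mycluster/hosts.yml (sketch; node5 and its IP are hypothetical)
all:
  hosts:
    node5:
      ansible_host: 10.0.0.15
      reset_nodes: no   # skip the reset step for this unreachable host only
```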
Connecting to Kubernetes
------------------------


@@ -1,6 +1,7 @@
 ---
 - hosts: localhost
+  become: no
   gather_facts: no
   tasks:
     - name: "Check ansible version >=2.7.8"
       assert:
@@ -12,12 +13,8 @@
   vars:
     ansible_connection: local
 
-- hosts: all
-  vars:
-    ansible_ssh_pipelining: true
-  gather_facts: true
+- hosts: "{{ node | default('etcd:k8s-cluster:calico-rr') }}"
+  gather_facts: no
   vars_prompt:
     name: "delete_nodes_confirmation"
     prompt: "Are you sure you want to delete nodes state? Type 'yes' to delete nodes."
@@ -31,16 +28,20 @@
       when: delete_nodes_confirmation != "yes"
 
 - hosts: kube-master
+  gather_facts: no
   roles:
     - { role: kubespray-defaults }
     - { role: remove-node/pre-remove, tags: pre-remove }
 
 - hosts: "{{ node | default('kube-node') }}"
+  gather_facts: no
   roles:
     - { role: kubespray-defaults }
-    - { role: reset, tags: reset }
+    - { role: reset, tags: reset, when: reset_nodes|default(True) }
 
-- hosts: kube-master
+# Currently cannot remove first master or etcd
+- hosts: "{{ node | default('kube-master[1:]:etcd[:1]') }}"
+  gather_facts: no
   roles:
     - { role: kubespray-defaults }
     - { role: remove-node/post-remove, tags: post-remove }
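As a usage sketch combining the pieces above (hostname hypothetical): removing one dead node while skipping its reset step would pass both extra vars, where `node=` overrides the play's default host pattern and `reset_nodes=no` disables the `reset_nodes|default(True)` guard on the reset role.

```
ansible-playbook -i inventory/mycluster/hosts.yml remove-node.yml -b -v \
  --private-key=~/.ssh/private_key \
  --extra-vars "node=node5 reset_nodes=no"
```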


@@ -1,9 +1,54 @@
 ---
+- name: Lookup node IP in kubernetes
+  shell: >-
+    {{ bin_dir }}/kubectl get nodes {{ node }}
+    -o jsonpath='{range.status.addresses[?(@.type=="InternalIP")]}{.address}{"\n"}{end}'
+  register: remove_node_ip
+  when:
+    - inventory_hostname in groups['etcd']
+    - ip is not defined
+    - access_ip is not defined
+  delegate_to: "{{ groups['etcd']|first }}"
+  failed_when: false
+
+- name: Set node IP
+  set_fact:
+    node_ip: "{{ ip | default(access_ip | default(remove_node_ip.stdout)) | trim }}"
+
 - name: Delete node
-  command: "{{ bin_dir }}/kubectl delete node {{ item }}"
-  with_items:
-    - "{{ node.split(',') | default(groups['kube-node']) }}"
+  command: "{{ bin_dir }}/kubectl delete node {{ inventory_hostname }}"
+  delegate_to: "{{ groups['kube-master']|first }}"
+  run_once: true
   ignore_errors: yes
+
+- name: Lookup etcd member id
+  shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep {{ node_ip }} | cut -d: -f1"
+  register: etcd_member_id
+  ignore_errors: true
+  changed_when: false
+  check_mode: no
+  tags:
+    - facts
+  environment:
+    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ groups['etcd']|first }}.pem"
+    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ groups['etcd']|first }}-key.pem"
+    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
+  delegate_to: "{{ groups['etcd']|first }}"
+  when: inventory_hostname in groups['etcd']
+
+- name: Remove etcd member from cluster
+  shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member remove {{ etcd_member_id.stdout }}"
+  register: etcd_member_in_cluster
+  ignore_errors: true
+  changed_when: false
+  check_mode: no
+  tags:
+    - facts
+  environment:
+    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ groups['etcd']|first }}.pem"
+    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ groups['etcd']|first }}-key.pem"
+    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
+  delegate_to: "{{ groups['etcd']|first }}"
+  when:
+    - inventory_hostname in groups['etcd']
+    - etcd_member_id.stdout
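A note on the etcd tasks above: `node_ip` falls back from `ip` to `access_ip` to the InternalIP that kubectl reports, and `--no-sync` plus the `ETCDCTL_*_FILE` environment variables are etcdctl v2 conventions. In the v2 `member list` output, each line starts with the member id followed by a colon, which is what `grep {{ node_ip }} | cut -d: -f1` relies on. An illustrative session (member ids and addresses hypothetical):

```
$ etcdctl --no-sync --endpoints=https://10.0.0.11:2379 member list
16f93b30cd3a9f54: name=node1 peerURLs=https://10.0.0.11:2380 clientURLs=https://10.0.0.11:2379 isLeader=true
9c5f3b8a2e7d4c21: name=node5 peerURLs=https://10.0.0.15:2380 clientURLs=https://10.0.0.15:2379 isLeader=false

# grep the removed node's IP, keep everything before the first colon: the member id
$ etcdctl --no-sync --endpoints=https://10.0.0.11:2379 member list | grep 10.0.0.15 | cut -d: -f1
9c5f3b8a2e7d4c21

$ etcdctl --no-sync --endpoints=https://10.0.0.11:2379 member remove 9c5f3b8a2e7d4c21
Removed member 9c5f3b8a2e7d4c21 from cluster
```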