From a35b6dc1af2ad62e8f48160f1e106cb691a80923 Mon Sep 17 00:00:00 2001
From: Etienne Champetier
Date: Wed, 8 Apr 2020 04:27:43 -0400
Subject: [PATCH] Fix scaling (#5889)

* etcd: etcd-events doesn't depend on etcd_cluster_setup

Signed-off-by: Etienne Champetier

* etcd: remove condition already present on include_tasks

Signed-off-by: Etienne Champetier

* etcd: fix scaling up

Signed-off-by: Etienne Champetier

* etcd: use *access_addresses, do not delegate to etcd[0]

We want to wait for the full cluster to be healthy,
so use all the cluster addresses
Also we should be able to run the playbook when etcd[0] is down
(not tested), so do not delegate to etcd[0]

Signed-off-by: Etienne Champetier

* etcd: use failed_when for health check

unhealthy cluster is expected on first run, so use failed_when
instead of ignore_errors to remove scary red messages

Also use run_once

Signed-off-by: Etienne Champetier

* kubernetes/preinstall: ensure ansible_fqdn is up to date after changing /etc/hosts

Signed-off-by: Etienne Champetier

* kubernetes/master: regenerate apiserver cert if needed

Signed-off-by: Etienne Champetier
---
 roles/etcd/tasks/configure.yml                | 25 +++++++--------
 roles/etcd/tasks/join_etcd-events_member.yml  |  9 ++++--
 roles/etcd/tasks/join_etcd_member.yml         |  9 ++++--
 .../kubernetes/master/tasks/kubeadm-setup.yml | 31 +++++++++++++++++++
 .../preinstall/tasks/0090-etchosts.yml        |  5 +++
 5 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml
index d87917176..69fb272e5 100644
--- a/roles/etcd/tasks/configure.yml
+++ b/roles/etcd/tasks/configure.yml
@@ -2,9 +2,10 @@
 - name: Configure | Check if etcd cluster is healthy
   shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_cluster_is_healthy
-  ignore_errors: true
+  failed_when: false
   changed_when: false
   check_mode: no
+  run_once: yes
   when: is_etcd_master and etcd_cluster_setup
   tags:
     - facts
@@ -16,9 +17,10 @@
 - name: Configure | Check if etcd-events cluster is healthy
   shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_events_cluster_is_healthy
-  ignore_errors: true
+  failed_when: false
   changed_when: false
   check_mode: no
+  run_once: yes
   when: is_etcd_master and etcd_events_cluster_setup
   tags:
     - facts
@@ -49,22 +51,26 @@
     daemon_reload: true
   when: is_etcd_master
 
+# when scaling new etcd will fail to start
 - name: Configure | Ensure etcd is running
   service:
     name: etcd
     state: started
     enabled: yes
+  ignore_errors: "{{ etcd_cluster_is_healthy.rc == 0 }}"
   when: is_etcd_master and etcd_cluster_setup
 
+# when scaling new etcd will fail to start
 - name: Configure | Ensure etcd-events is running
   service:
     name: etcd-events
     state: started
     enabled: yes
+  ignore_errors: "{{ etcd_events_cluster_is_healthy.rc == 0 }}"
   when: is_etcd_master and etcd_events_cluster_setup
 
-- name: Configure | Check if etcd cluster is healthy
-  shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_client_url }} cluster-health | grep -q 'cluster is healthy'"
+- name: Configure | Wait for etcd cluster to be healthy
+  shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_cluster_is_healthy
   until: etcd_cluster_is_healthy.rc == 0
   retries: "{{ etcd_retries }}"
@@ -72,7 +78,6 @@
   ignore_errors: false
   changed_when: false
   check_mode: no
-  delegate_to: "{{ groups['etcd'][0] }}"
   run_once: yes
   when:
     - is_etcd_master
@@ -84,8 +89,8 @@
     ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
     ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
 
-- name: Configure | Check if etcd-events cluster is healthy
-  shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_client_url }} cluster-health | grep -q 'cluster is healthy'"
+- name: Configure | Wait for etcd-events cluster to be healthy
+  shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_events_cluster_is_healthy
   until: etcd_events_cluster_is_healthy.rc == 0
   retries: "{{ etcd_retries }}"
@@ -93,12 +98,10 @@
   ignore_errors: false
   changed_when: false
   check_mode: no
-  delegate_to: "{{ groups['etcd'][0] }}"
   run_once: yes
   when:
     - is_etcd_master
     - etcd_events_cluster_setup
-    - etcd_cluster_setup
   tags:
     - facts
   environment:
@@ -136,14 +139,10 @@
 
 - name: Configure | Join member(s) to etcd cluster one at a time
   include_tasks: join_etcd_member.yml
-  vars:
-    target_node: "{{ item }}"
   with_items: "{{ groups['etcd'] }}"
   when: inventory_hostname == item and etcd_cluster_setup and etcd_member_in_cluster.rc != 0 and etcd_cluster_is_healthy.rc == 0
 
 - name: Configure | Join member(s) to etcd-events cluster one at a time
   include_tasks: join_etcd-events_member.yml
-  vars:
-    target_node: "{{ item }}"
   with_items: "{{ groups['etcd'] }}"
   when: inventory_hostname == item and etcd_events_cluster_setup and etcd_events_member_in_cluster.rc != 0 and etcd_events_cluster_is_healthy.rc == 0
diff --git a/roles/etcd/tasks/join_etcd-events_member.yml b/roles/etcd/tasks/join_etcd-events_member.yml
index 0f214302e..21396a57a 100644
--- a/roles/etcd/tasks/join_etcd-events_member.yml
+++ b/roles/etcd/tasks/join_etcd-events_member.yml
@@ -5,7 +5,6 @@
   until: member_add_result.rc == 0
   retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
-  when: target_node == inventory_hostname
   environment:
     ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
     ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
@@ -21,7 +20,6 @@
           {{ etcd_member_name }}={{ etcd_events_peer_url }}
         {%- endif -%}
       {%- endfor -%}
-  when: target_node == inventory_hostname
 
 - name: Join Member | Ensure member is in etcd-events cluster
   shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_access_addresses }} member list | grep -q {{ etcd_events_access_address }}"
@@ -30,7 +28,12 @@
   check_mode: no
   tags:
     - facts
-  when: target_node == inventory_hostname
   environment:
     ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
     ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
+
+- name: Configure | Ensure etcd-events is running
+  service:
+    name: etcd-events
+    state: started
+    enabled: yes
diff --git a/roles/etcd/tasks/join_etcd_member.yml b/roles/etcd/tasks/join_etcd_member.yml
index 928d22642..5c3c7aef0 100644
--- a/roles/etcd/tasks/join_etcd_member.yml
+++ b/roles/etcd/tasks/join_etcd_member.yml
@@ -5,7 +5,6 @@
   until: member_add_result.rc == 0
   retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
-  when: target_node == inventory_hostname
   environment:
     ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
     ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
@@ -22,7 +21,6 @@
           {{ etcd_member_name }}={{ etcd_peer_url }}
         {%- endif -%}
       {%- endfor -%}
-  when: target_node == inventory_hostname
 
 - name: Join Member | Ensure member is in etcd cluster
   shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}"
@@ -31,8 +29,13 @@
   check_mode: no
   tags:
     - facts
-  when: target_node == inventory_hostname
   environment:
     ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
     ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
     ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
+
+- name: Configure | Ensure etcd is running
+  service:
+    name: etcd
+    state: started
+    enabled: yes
diff --git a/roles/kubernetes/master/tasks/kubeadm-setup.yml b/roles/kubernetes/master/tasks/kubeadm-setup.yml
index 549a34a1f..d34128553 100644
--- a/roles/kubernetes/master/tasks/kubeadm-setup.yml
+++ b/roles/kubernetes/master/tasks/kubeadm-setup.yml
@@ -103,6 +103,37 @@
     - not upgrade_cluster_setup
     - kubeadm_already_run.stat.exists
 
+- name: kubeadm | Check if apiserver.crt contains all needed SANs
+  command: openssl x509 -noout -in "{{ kube_cert_dir }}/apiserver.crt" -checkip "{{ item }}"
+  with_items: "{{ apiserver_sans }}"
+  register: apiserver_sans_check
+  changed_when: "'does match certificate' not in apiserver_sans_check.stdout"
+  when:
+    - inventory_hostname == groups['kube-master']|first
+    - kubeadm_already_run.stat.exists
+
+- name: kubeadm | regenerate apiserver cert 1/2
+  file:
+    state: absent
+    path: "{{ kube_cert_dir }}/{{ item }}"
+  with_items:
+    - apiserver.crt
+    - apiserver.key
+  when:
+    - inventory_hostname == groups['kube-master']|first
+    - kubeadm_already_run.stat.exists
+    - apiserver_sans_check.changed
+
+- name: kubeadm | regenerate apiserver cert 2/2
+  command: >-
+    {{ bin_dir }}/kubeadm
+    init phase certs apiserver
+    --config={{ kube_config_dir }}/kubeadm-config.yaml
+  when:
+    - inventory_hostname == groups['kube-master']|first
+    - kubeadm_already_run.stat.exists
+    - apiserver_sans_check.changed
+
 - name: kubeadm | Initialize first master
   command: >-
     timeout -k 300s 300s
diff --git a/roles/kubernetes/preinstall/tasks/0090-etchosts.yml b/roles/kubernetes/preinstall/tasks/0090-etchosts.yml
index 0cf2fa2fe..d7b18012f 100644
--- a/roles/kubernetes/preinstall/tasks/0090-etchosts.yml
+++ b/roles/kubernetes/preinstall/tasks/0090-etchosts.yml
@@ -59,3 +59,8 @@
     backup: yes
     unsafe_writes: yes
   with_dict: "{{ etc_hosts_localhosts_dict_target }}"
+
+# gather facts to update ansible_fqdn
+- name: Update facts
+  setup:
+    gather_subset: min