From 86e3506ae637e2050f953cd85cda5ca16a12ce6f Mon Sep 17 00:00:00 2001 From: woopstar Date: Sun, 1 Apr 2018 18:58:08 +0200 Subject: [PATCH] Etcd cluster setup makeover The current way to set up the etcd cluster is messy and buggy. - It checks whether the cluster is healthy before the cluster is even created. - The unit files are started on handlers, not in the task, so you mess with "flush handlers". - The join_member.yml is not used. - etcd events cluster is not configured for kubeadm - remove duplicate runs between running the role on etcd nodes and k8s nodes --- cluster.yml | 4 +- roles/etcd/defaults/main.yml | 1 + roles/etcd/handlers/main.yml | 5 +- roles/etcd/tasks/configure.yml | 127 +++++++++++++----- roles/etcd/tasks/gen_certs_script.yml | 17 ++- roles/etcd/tasks/install_docker.yml | 8 +- roles/etcd/tasks/install_host.yml | 1 + roles/etcd/tasks/install_rkt.yml | 2 + roles/etcd/tasks/join_etcd-events_member.yml | 13 +- roles/etcd/tasks/join_etcd_member.yml | 15 +-- roles/etcd/tasks/join_member.yml | 47 ------- roles/etcd/tasks/main.yml | 36 ++--- roles/etcd/tasks/refresh_config.yml | 2 +- roles/etcd/tasks/set_cluster_health.yml | 26 ---- .../master/templates/kubeadm-config.yaml.j2 | 3 + 15 files changed, 135 insertions(+), 172 deletions(-) delete mode 100644 roles/etcd/tasks/join_member.yml delete mode 100644 roles/etcd/tasks/set_cluster_health.yml diff --git a/cluster.yml b/cluster.yml index fb7dec4cb..1ca4c23bc 100644 --- a/cluster.yml +++ b/cluster.yml @@ -51,13 +51,13 @@ any_errors_fatal: "{{ any_errors_fatal | default(true) }}" roles: - { role: kubespray-defaults} - - { role: etcd, tags: etcd, etcd_cluster_setup: true } + - { role: etcd, tags: etcd } - hosts: k8s-cluster:calico-rr any_errors_fatal: "{{ any_errors_fatal | default(true) }}" roles: - { role: kubespray-defaults} - - { role: etcd, tags: etcd, etcd_cluster_setup: false } + - { role: etcd, tags: etcd } - hosts: etcd:k8s-cluster:vault:calico-rr any_errors_fatal: "{{ any_errors_fatal | default(true) }}" 
diff --git a/roles/etcd/defaults/main.yml b/roles/etcd/defaults/main.yml index 6c13810c5..209b401fb 100644 --- a/roles/etcd/defaults/main.yml +++ b/roles/etcd/defaults/main.yml @@ -1,6 +1,7 @@ --- # Set to false to only do certificate management etcd_cluster_setup: true +etcd_events_cluster_setup: false etcd_backup_prefix: "/var/backups" etcd_data_dir: "/var/lib/etcd" diff --git a/roles/etcd/handlers/main.yml b/roles/etcd/handlers/main.yml index a72cbd515..3a46978a6 100644 --- a/roles/etcd/handlers/main.yml +++ b/roles/etcd/handlers/main.yml @@ -10,7 +10,7 @@ - name: restart etcd-events command: /bin/true notify: - - etcd-events | reload systemd + - etcd | reload systemd - reload etcd-events - wait for etcd-events up @@ -19,9 +19,6 @@ - name: etcd | reload systemd command: systemctl daemon-reload -- name: etcd-events | reload systemd - command: systemctl daemon-reload - - name: reload etcd service: name: etcd diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml index d39ba62d4..674d202e0 100644 --- a/roles/etcd/tasks/configure.yml +++ b/roles/etcd/tasks/configure.yml @@ -1,11 +1,104 @@ --- +- name: Configure | Check if etcd cluster is healthy + shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'" + register: etcd_cluster_is_healthy + ignore_errors: true + changed_when: false + check_mode: no + when: is_etcd_master and etcd_cluster_setup + tags: + - facts + environment: + ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" + ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" + +- name: Configure | Check if etcd-events cluster is healthy + shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'" + register: etcd_events_cluster_is_healthy + ignore_errors: true + changed_when: false + check_mode: no + when: is_etcd_master and etcd_events_cluster_setup + tags: + - 
facts + environment: + ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" + ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" + +- include_tasks: refresh_config.yml + when: is_etcd_master + +- name: Configure | Copy etcd.service systemd file + template: + src: "etcd-{{ etcd_deployment_type }}.service.j2" + dest: /etc/systemd/system/etcd.service + backup: yes + when: is_etcd_master and etcd_cluster_setup + +- name: Configure | Copy etcd-events.service systemd file + template: + src: "etcd-events-{{ etcd_deployment_type }}.service.j2" + dest: /etc/systemd/system/etcd-events.service + backup: yes + when: is_etcd_master and etcd_events_cluster_setup + +- name: Configure | reload systemd + command: systemctl daemon-reload + when: is_etcd_master + +- name: Configure | Ensure etcd is running + service: + name: etcd + state: started + enabled: yes + when: is_etcd_master and etcd_cluster_setup + +- name: Configure | Ensure etcd-events is running + service: + name: etcd-events + state: started + enabled: yes + when: is_etcd_master and etcd_events_cluster_setup + +- name: Configure | Check if etcd cluster is healthy + shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'" + register: etcd_cluster_is_healthy + until: etcd_cluster_is_healthy.rc == 0 + retries: 4 + delay: "{{ retry_stagger | random + 3 }}" + ignore_errors: false + changed_when: false + check_mode: no + when: is_etcd_master and etcd_cluster_setup + tags: + - facts + environment: + ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" + ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" + +- name: Configure | Check if etcd-events cluster is healthy + shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'" + register: etcd_events_cluster_is_healthy + until: 
etcd_events_cluster_is_healthy.rc == 0 + retries: 4 + delay: "{{ retry_stagger | random + 3 }}" + ignore_errors: false + changed_when: false + check_mode: no + when: is_etcd_master and etcd_events_cluster_setup + tags: + - facts + environment: + ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" + ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" + - name: Configure | Check if member is in etcd cluster shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}" register: etcd_member_in_cluster ignore_errors: true changed_when: false check_mode: no - when: is_etcd_master + when: is_etcd_master and etcd_cluster_setup tags: - facts environment: @@ -25,44 +118,16 @@ ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" -- name: Configure | Copy etcd.service systemd file - template: - src: "etcd-{{ etcd_deployment_type }}.service.j2" - dest: /etc/systemd/system/etcd.service - backup: yes - when: is_etcd_master - notify: restart etcd - -- name: Configure | Copy etcd-events.service systemd file - template: - src: "etcd-events-host.service.j2" - dest: /etc/systemd/system/etcd-events.service - backup: yes - when: is_etcd_master and etcd_deployment_type == "host" and etcd_events_cluster_setup - notify: restart etcd-events - -- name: Configure | Copy etcd-events.service systemd file - template: - src: "etcd-events-docker.service.j2" - dest: /etc/systemd/system/etcd-events.service - backup: yes - when: is_etcd_master and etcd_deployment_type == "docker" and etcd_events_cluster_setup - notify: restart etcd-events - - name: Configure | Join member(s) to etcd cluster one at a time include_tasks: join_etcd_member.yml vars: target_node: "{{ item }}" - loop_control: - pause: 10 with_items: "{{ groups['etcd'] }}" - when: inventory_hostname == item and 
etcd_member_in_cluster.rc != 0 and etcd_cluster_is_healthy.rc == 0 + when: inventory_hostname == item and etcd_cluster_setup and etcd_member_in_cluster.rc != 0 and etcd_cluster_is_healthy.rc == 0 - name: Configure | Join member(s) to etcd-events cluster one at a time - include_tasks: join_etcd-evetns_member.yml + include_tasks: join_etcd-events_member.yml vars: target_node: "{{ item }}" - loop_control: - pause: 10 with_items: "{{ groups['etcd'] }}" when: inventory_hostname == item and etcd_events_cluster_setup and etcd_events_member_in_cluster.rc != 0 and etcd_events_cluster_is_healthy.rc == 0 diff --git a/roles/etcd/tasks/gen_certs_script.yml b/roles/etcd/tasks/gen_certs_script.yml index 3fbafc52a..8ef9a3dcc 100644 --- a/roles/etcd/tasks/gen_certs_script.yml +++ b/roles/etcd/tasks/gen_certs_script.yml @@ -15,6 +15,7 @@ owner: root mode: 0700 run_once: yes + when: inventory_hostname == groups['etcd'][0] delegate_to: "{{groups['etcd'][0]}}" - name: "Gen_certs | create etcd cert dir (on {{groups['etcd'][0]}})" @@ -26,6 +27,7 @@ recurse: yes mode: 0700 run_once: yes + when: inventory_hostname == groups['etcd'][0] delegate_to: "{{groups['etcd'][0]}}" - name: Gen_certs | write openssl config @@ -34,7 +36,9 @@ dest: "{{ etcd_config_dir }}/openssl.conf" run_once: yes delegate_to: "{{groups['etcd'][0]}}" - when: gen_certs|default(false) + when: + - gen_certs|default(false) + - inventory_hostname == groups['etcd'][0] - name: Gen_certs | copy certs generation script copy: @@ -43,8 +47,9 @@ mode: 0700 run_once: yes delegate_to: "{{groups['etcd'][0]}}" - when: gen_certs|default(false) - + when: + - gen_certs|default(false) + - inventory_hostname == groups['etcd'][0] - name: Gen_certs | run cert generation script command: "bash -x {{ etcd_script_dir }}/make-ssl-etcd.sh -f {{ etcd_config_dir }}/openssl.conf -d {{ etcd_cert_dir }}" @@ -61,7 +66,9 @@ {% endfor %}" run_once: yes delegate_to: "{{groups['etcd'][0]}}" - when: gen_certs|default(false) + when: + - 
gen_certs|default(false) + - inventory_hostname == groups['etcd'][0] notify: set etcd_secret_changed - set_fact: @@ -160,5 +167,5 @@ group: "{{ etcd_cert_group }}" state: directory owner: kube - mode: "u=rwX,g-rwx,o-rwx" + mode: "640" recurse: yes diff --git a/roles/etcd/tasks/install_docker.yml b/roles/etcd/tasks/install_docker.yml index 58e1485a5..7859134b0 100644 --- a/roles/etcd/tasks/install_docker.yml +++ b/roles/etcd/tasks/install_docker.yml @@ -9,22 +9,22 @@ retries: 4 delay: "{{ retry_stagger | random + 3 }}" changed_when: false + when: etcd_cluster_setup - name: Install etcd launch script template: src: etcd.j2 dest: "{{ bin_dir }}/etcd" owner: 'root' - mode: 0755 + mode: 0750 backup: yes - notify: restart etcd + when: etcd_cluster_setup - name: Install etcd-events launch script template: src: etcd-events.j2 dest: "{{ bin_dir }}/etcd-events" owner: 'root' - mode: 0755 + mode: 0750 backup: yes when: etcd_events_cluster_setup - notify: restart etcd-events diff --git a/roles/etcd/tasks/install_host.yml b/roles/etcd/tasks/install_host.yml index 9e83905bc..1d06a7d5a 100644 --- a/roles/etcd/tasks/install_host.yml +++ b/roles/etcd/tasks/install_host.yml @@ -10,3 +10,4 @@ retries: 4 delay: "{{ retry_stagger | random + 3 }}" changed_when: false + when: etcd_cluster_setup diff --git a/roles/etcd/tasks/install_rkt.yml b/roles/etcd/tasks/install_rkt.yml index 5df623c8b..2f693b371 100644 --- a/roles/etcd/tasks/install_rkt.yml +++ b/roles/etcd/tasks/install_rkt.yml @@ -11,6 +11,7 @@ delay: "{{ retry_stagger | random + 3 }}" changed_when: false environment: "{{proxy_env}}" + when: etcd_cluster_setup - name: Install | Copy etcdctl binary from rkt container command: >- @@ -26,3 +27,4 @@ delay: "{{ retry_stagger | random + 3 }}" changed_when: false environment: "{{proxy_env}}" + when: etcd_cluster_setup diff --git a/roles/etcd/tasks/join_etcd-events_member.yml b/roles/etcd/tasks/join_etcd-events_member.yml index 5a7061880..d5df065f9 100644 --- 
a/roles/etcd/tasks/join_etcd-events_member.yml +++ b/roles/etcd/tasks/join_etcd-events_member.yml @@ -1,5 +1,5 @@ --- -- name: Join Member | Add member to cluster +- name: Join Member | Add member to etcd-events cluster shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} member add {{ etcd_member_name }} {{ etcd_events_peer_url }}" register: member_add_result until: member_add_result.rc == 0 @@ -23,17 +23,6 @@ {%- endfor -%} when: target_node == inventory_hostname -- name: Join Member | reload systemd - command: systemctl daemon-reload - when: target_node == inventory_hostname - -- name: Join Member | Ensure etcd-events is running - service: - name: etcd-events - state: started - enabled: yes - when: target_node == inventory_hostname - - name: Join Member | Ensure member is in etcd-events cluster shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_access_addresses }} member list | grep -q {{ etcd_events_access_address }}" register: etcd_events_member_in_cluster diff --git a/roles/etcd/tasks/join_etcd_member.yml b/roles/etcd/tasks/join_etcd_member.yml index d11037151..60cfd16cd 100644 --- a/roles/etcd/tasks/join_etcd_member.yml +++ b/roles/etcd/tasks/join_etcd_member.yml @@ -1,5 +1,5 @@ --- -- name: Join Member | Add member to cluster +- name: Join Member | Add member to etcd cluster shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}" register: member_add_result until: member_add_result.rc == 0 @@ -23,18 +23,7 @@ {%- endfor -%} when: target_node == inventory_hostname -- name: Join Member | reload systemd - command: systemctl daemon-reload - when: target_node == inventory_hostname - -- name: Join Member | Ensure etcd is running - service: - name: etcd - state: started - enabled: yes - when: target_node == inventory_hostname - -- name: Join Member | Ensure member is in cluster +- name: Join Member | Ensure member is in etcd cluster shell: "{{ bin_dir 
}}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}" register: etcd_member_in_cluster changed_when: false diff --git a/roles/etcd/tasks/join_member.yml b/roles/etcd/tasks/join_member.yml deleted file mode 100644 index d11037151..000000000 --- a/roles/etcd/tasks/join_member.yml +++ /dev/null @@ -1,47 +0,0 @@ ---- -- name: Join Member | Add member to cluster - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}" - register: member_add_result - until: member_add_result.rc == 0 - retries: 4 - delay: "{{ retry_stagger | random + 3 }}" - when: target_node == inventory_hostname - environment: - ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" - ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" - -- include_tasks: refresh_config.yml - vars: - etcd_peer_addresses: >- - {% for host in groups['etcd'] -%} - {%- if hostvars[host]['etcd_member_in_cluster'].rc == 0 -%} - {{ "etcd"+loop.index|string }}=https://{{ hostvars[host].access_ip | default(hostvars[host].ip | default(hostvars[host].ansible_default_ipv4['address'])) }}:2380, - {%- endif -%} - {%- if loop.last -%} - {{ etcd_member_name }}={{ etcd_peer_url }} - {%- endif -%} - {%- endfor -%} - when: target_node == inventory_hostname - -- name: Join Member | reload systemd - command: systemctl daemon-reload - when: target_node == inventory_hostname - -- name: Join Member | Ensure etcd is running - service: - name: etcd - state: started - enabled: yes - when: target_node == inventory_hostname - -- name: Join Member | Ensure member is in cluster - shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}" - register: etcd_member_in_cluster - changed_when: false - check_mode: no - tags: - - facts - when: target_node == inventory_hostname - environment: - ETCDCTL_CERT_FILE: "{{ 
etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" - ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" diff --git a/roles/etcd/tasks/main.yml b/roles/etcd/tasks/main.yml index a64d9b097..c35a9cab6 100644 --- a/roles/etcd/tasks/main.yml +++ b/roles/etcd/tasks/main.yml @@ -6,6 +6,7 @@ - facts - include_tasks: "gen_certs_{{ cert_management }}.yml" + when: tags: - etcd-secrets @@ -29,47 +30,28 @@ tags: - upgrade -- include_tasks: set_cluster_health.yml - when: is_etcd_master and etcd_cluster_setup - - include_tasks: configure.yml - when: is_etcd_master and etcd_cluster_setup + when: is_etcd_master - include_tasks: refresh_config.yml - when: is_etcd_master and etcd_cluster_setup + when: is_etcd_master - name: Restart etcd if certs changed - command: /bin/true - notify: restart etcd - when: is_etcd_master and etcd_secret_changed|default(false) - -- name: Restart etcd-events if certs changed - command: /bin/true - notify: restart etcd - when: is_etcd_master and etcd_events_cluster_setup and etcd_secret_changed|default(false) - -# reload-systemd -- meta: flush_handlers - -- name: Ensure etcd is running service: name: etcd - state: started + state: restarted enabled: yes - when: is_etcd_master and etcd_cluster_setup + when: is_etcd_master and etcd_cluster_setup and etcd_secret_changed|default(false) -- name: Ensure etcd-events is running +- name: Restart etcd-events if certs changed service: name: etcd-events - state: started + state: restarted enabled: yes - when: is_etcd_master and etcd_events_cluster_setup + when: is_etcd_master and etcd_events_cluster_setup and etcd_secret_changed|default(false) # After etcd cluster is assembled, make sure that # initial state of the cluster is in `existing` # state insted of `new`. 
-- include_tasks: set_cluster_health.yml - when: is_etcd_master and etcd_cluster_setup - - include_tasks: refresh_config.yml - when: is_etcd_master and etcd_cluster_setup + when: is_etcd_master diff --git a/roles/etcd/tasks/refresh_config.yml b/roles/etcd/tasks/refresh_config.yml index 927663301..21c308fb0 100644 --- a/roles/etcd/tasks/refresh_config.yml +++ b/roles/etcd/tasks/refresh_config.yml @@ -4,7 +4,7 @@ src: etcd.env.j2 dest: /etc/etcd.env notify: restart etcd - when: is_etcd_master + when: is_etcd_master and etcd_cluster_setup - name: Refresh config | Create etcd-events config file template: diff --git a/roles/etcd/tasks/set_cluster_health.yml b/roles/etcd/tasks/set_cluster_health.yml deleted file mode 100644 index d0202943c..000000000 --- a/roles/etcd/tasks/set_cluster_health.yml +++ /dev/null @@ -1,26 +0,0 @@ ---- -- name: Configure | Check if etcd cluster is healthy - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'" - register: etcd_cluster_is_healthy - ignore_errors: true - changed_when: false - check_mode: no - when: is_etcd_master - tags: - - facts - environment: - ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" - ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" - -- name: Configure | Check if etcd-events cluster is healthy - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'" - register: etcd_events_cluster_is_healthy - ignore_errors: true - changed_when: false - check_mode: no - when: is_etcd_master and etcd_events_cluster_setup - tags: - - facts - environment: - ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" - ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" diff --git a/roles/kubernetes/master/templates/kubeadm-config.yaml.j2 b/roles/kubernetes/master/templates/kubeadm-config.yaml.j2 index 
0eccb4918..849a697f7 100644 --- a/roles/kubernetes/master/templates/kubeadm-config.yaml.j2 +++ b/roles/kubernetes/master/templates/kubeadm-config.yaml.j2 @@ -38,6 +38,9 @@ apiServerExtraArgs: apiserver-count: "{{ kube_apiserver_count }}" {% if kube_version | version_compare('v1.9', '>=') %} endpoint-reconciler-type: lease +{% endif %} +{% if etcd_events_cluster_setup %} + etcd-servers-overrides: "/events#{{ etcd_events_access_addresses }}" {% endif %} service-node-port-range: {{ kube_apiserver_node_port_range }} kubelet-preferred-address-types: "{{ kubelet_preferred_address_types }}"