Merge pull request #1029 from mattymo/graceful

Add graceful upgrade process
2017-02-17 21:24:32 +01:00 · 2017-02-17 21:24:32 +01:00 · b84cc14694
commit b84cc14694
parent e16ebcad6e a510e7b8f3
7 changed files with 158 additions and 34 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -101,8 +101,8 @@ before_script:

    # Check out latest tag if testing upgrade
    # Uncomment when gitlab kargo repo has tags
-    #- test "${UPGRADE_TEST}" = "true" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
-    - test "${UPGRADE_TEST}" = "true" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0
+    #- test "${UPGRADE_TEST}" != "false" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+    - test "${UPGRADE_TEST}" != "false" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0


    # Create cluster
@ -127,9 +127,10 @@ before_script:
      cluster.yml

    # Repeat deployment if testing upgrade
-    #FIXME(mattymo): repeat "Create cluster" above without duplicating code 
    - >
-      if [ "${UPGRADE_TEST}" = "true" ]; then 
+      if [ "${UPGRADE_TEST}" != "false" ]; then 
+      test "${UPGRADE_TEST}" = "basic" && PLAYBOOK="cluster.yml";
+      test "${UPGRADE_TEST}" = "graceful" && PLAYBOOK="upgrade-cluster.yml";
      pip install ansible==2.2.1.0; 
      git checkout "${CI_BUILD_REF}"; 
      ansible-playbook -i inventory/inventory.ini -b --become-user=root --private-key=${HOME}/.ssh/id_rsa -u $SSH_USER 
@ -149,7 +150,7 @@ before_script:
      -e resolvconf_mode=${RESOLVCONF_MODE} 
      -e weave_cpu_requests=${WEAVE_CPU_LIMIT} 
      -e weave_cpu_limit=${WEAVE_CPU_LIMIT} 
-      cluster.yml; 
+      $PLAYBOOK; 
      fi

    # Tests Cases
@ -254,7 +255,7 @@ before_script:
  KUBE_NETWORK_PLUGIN: canal
  CLOUD_IMAGE: debian-8-kubespray
  CLOUD_REGION: us-east1-b
-  UPGRADE_TEST: "true"
+  UPGRADE_TEST: "basic"
  CLUSTER_MODE: ha

 .rhel7_weave_variables: &rhel7_weave_variables
@ -262,7 +263,6 @@ before_script:
  KUBE_NETWORK_PLUGIN: weave
  CLOUD_IMAGE: rhel-7
  CLOUD_REGION: europe-west1-b
-  UPGRADE_TEST: "true"
  CLUSTER_MODE: default

 .centos7_flannel_variables: &centos7_flannel_variables
@ -278,6 +278,7 @@ before_script:
  CLOUD_IMAGE: debian-8-kubespray
  CLOUD_REGION: us-central1-b
  CLUSTER_MODE: default
+  UPGRADE_TEST: "graceful"

 .coreos_canal_variables: &coreos_canal_variables
 # stage: deploy-gce-part2
@ -444,7 +445,7 @@ rhel7-weave-triggers:
  when: on_success
  only: ['triggers']

-debian8-calico:
+debian8-calico-upgrade:
  stage: deploy-gce-part2
  <<: *job
  <<: *gce
@ -540,7 +541,7 @@ coreos-alpha-weave-ha:
  except: ['triggers']
  only: ['master', /^pr-.*$/]

-ubuntu-rkt-sep:
+ubuntu-rkt-sep-upgrade:
  stage: deploy-gce-part1
  <<: *job
  <<: *gce
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@ -18,7 +18,7 @@ versions. Here are all version vars for each component:
 * flannel_version
 * kubedns_version

-#### Example
+#### Unsafe upgrade example

 If you wanted to upgrade just kube_version from v1.4.3 to v1.4.6, you could
 deploy the following way:
@ -33,6 +33,20 @@ And then repeat with v1.4.6 as kube_version:
 ansible-playbook cluster.yml -i inventory/inventory.cfg -e kube_version=v1.4.6
 ```

+#### Graceful upgrade
+
+Kargo also supports cordoning, draining and uncordoning of nodes when performing
+a cluster upgrade. There is a separate playbook used for this purpose. It is
+important to note that upgrade-cluster.yml can only be used for upgrading an
+existing cluster. That means there must be at least 1 kube-master already
+deployed.
+
+```
+git fetch origin
+git checkout origin/master
+ansible-playbook upgrade-cluster.yml -i inventory/inventory.cfg
+```
+
 #### Upgrade order

 As mentioned above, components are upgraded in the order in which they were
--- a/roles/kubernetes-apps/network_plugin/weave/tasks/main.yml
+++ b/roles/kubernetes-apps/network_plugin/weave/tasks/main.yml
@ -1,6 +1,5 @@
 #FIXME: remove if kubernetes/features#124 is implemented
 - name: Weave | Purge old weave daemonset
-  run_once: true
  kube:
    name: "weave-net"
    kubectl: "{{ bin_dir }}/kubectl"
@ -12,7 +11,6 @@


 - name: Weave | Start Resources
-  run_once: true
  kube:
    name: "weave-net"
    kubectl: "{{ bin_dir }}/kubectl"
@ -21,17 +19,16 @@
    namespace: "{{system_namespace}}"
    state: "{{ item | ternary('latest','present') }}"
  with_items: "{{ weave_manifest.changed }}"
-  delegate_to: "{{groups['kube-master'][0]}}"
+  when: inventory_hostname == groups['kube-master'][0]


 - name: "Weave | wait for weave to become available"
  uri:
    url: http://127.0.0.1:6784/status
    return_content: yes
-  run_once: true
  register: weave_status
-  retries: 12
+  retries: 180
  delay: 10
  until: "{{ weave_status.status == 200 and
    'Status: ready' in weave_status.content }}"
-  delegate_to: "{{groups['kube-master'][0]}}"
+  when: inventory_hostname == groups['kube-master'][0]
--- a/roles/upgrade/post-upgrade/tasks/main.yml
+++ b/roles/upgrade/post-upgrade/tasks/main.yml
@ -0,0 +1,5 @@
+---
+
+- name: Uncordon node
+  command: "{{ bin_dir }}/kubectl uncordon {{ ansible_hostname }}"
+  delegate_to: "{{ groups['kube-master'][0] }}"
--- a/roles/upgrade/pre-upgrade/tasks/main.yml
+++ b/roles/upgrade/pre-upgrade/tasks/main.yml
@ -0,0 +1,12 @@
+---
+
+- name: Cordon node
+  command: "{{ bin_dir }}/kubectl cordon {{ ansible_hostname }}"
+  delegate_to: "{{ groups['kube-master'][0] }}"
+
+- name: Drain node
+  command: "{{ bin_dir }}/kubectl drain --force --ignore-daemonsets --grace-period 30 --delete-local-data {{ ansible_hostname }}"
+  delegate_to: "{{ groups['kube-master'][0] }}"
+
+- name: Sleep for grace period for draining
+  pause: seconds=30
--- a/tests/templates/inventory-gce.j2
+++ b/tests/templates/inventory-gce.j2
@ -1,48 +1,51 @@
-node1 ansible_ssh_host={{gce.instance_data[0].public_ip}}
-node2 ansible_ssh_host={{gce.instance_data[1].public_ip}}
+{% set node1 = gce.instance_data[0].name %}
+{% set node2 = gce.instance_data[1].name %}
+{{node1}} ansible_ssh_host={{gce.instance_data[0].public_ip}}
+{{node2}} ansible_ssh_host={{gce.instance_data[1].public_ip}}
 {% if mode is defined and mode in ["separate", "ha"] %}
-node3 ansible_ssh_host={{gce.instance_data[2].public_ip}}
+{% set node3 = gce.instance_data[2].name %}
+{{node3}} ansible_ssh_host={{gce.instance_data[2].public_ip}}
 {% endif %}

 {% if mode is defined and mode == "separate" %}
 [kube-master]
-node1
+{{node1}}

 [kube-node]
-node2
+{{node2}}

 [etcd]
-node3
+{{node3}}

 [vault]
-node3
+{{node3}}
 {% elif mode is defined and mode == "ha" %}
 [kube-master]
-node1
-node2
+{{node1}}
+{{node2}}

 [kube-node]
-node3
+{{node3}}

 [etcd]
-node2
-node3
+{{node2}}
+{{node3}}

 [vault]
-node2
-node3
+{{node2}}
+{{node3}}
 {% else %}
 [kube-master]
-node1
+{{node1}}

 [kube-node]
-node2
+{{node2}}

 [etcd]
-node1
+{{node1}}

 [vault]
-node1
+{{node1}}
 {% endif %}

 [k8s-cluster:children]
--- a/upgrade-cluster.yml
+++ b/upgrade-cluster.yml
@ -0,0 +1,92 @@
+---
+- hosts: localhost
+  gather_facts: False
+  roles:
+    - bastion-ssh-config
+  tags: [localhost, bastion]
+
+- hosts: k8s-cluster:etcd:calico-rr
+  any_errors_fatal: true
+  gather_facts: false
+  vars:
+    # Need to disable pipelining for bootstrap-os as some systems have requiretty in sudoers set, which makes pipelining
+    # fail. bootstrap-os fixes this on these systems, so in later plays it can be enabled.
+    ansible_ssh_pipelining: false
+  roles:
+    - bootstrap-os
+  tags:
+    - bootstrap-os
+
+- hosts: k8s-cluster:etcd:calico-rr
+  any_errors_fatal: true
+  vars:
+    ansible_ssh_pipelining: true
+  gather_facts: true
+
+- hosts: k8s-cluster:etcd:calico-rr
+  any_errors_fatal: true
+  roles:
+    - { role: kernel-upgrade, tags: kernel-upgrade, when: kernel_upgrade is defined and kernel_upgrade }
+    - { role: kubernetes/preinstall, tags: preinstall }
+    - { role: docker, tags: docker }
+    - role: rkt
+      tags: rkt
+      when: "'rkt' in [etcd_deployment_type, kubelet_deployment_type, vault_deployment_type]"
+
+- hosts: etcd:k8s-cluster:vault
+  any_errors_fatal: true
+  roles:
+    - { role: vault, tags: vault, vault_bootstrap: true, when: "cert_management == 'vault'" }
+
+- hosts: etcd:!k8s-cluster
+  any_errors_fatal: true
+  roles:
+    - { role: etcd, tags: etcd }
+
+- hosts: k8s-cluster
+  any_errors_fatal: true
+  roles:
+    - { role: etcd, tags: etcd }
+
+- hosts: etcd:k8s-cluster:vault
+  any_errors_fatal: true
+  roles:
+    - { role: vault, tags: vault, when: "cert_management == 'vault'"}
+
+# Handle upgrades to master components first to maintain backwards compatibility.
+- hosts: kube-master
+  any_errors_fatal: true
+  serial: 1
+  roles:
+    - { role: upgrade/pre-upgrade, tags: pre-upgrade }
+    - { role: kubernetes/node, tags: node }
+    - { role: kubernetes/master, tags: master }
+    - { role: network_plugin, tags: network }
+    - { role: upgrade/post-upgrade, tags: post-upgrade }
+
+# Finally, handle worker upgrades in batches; the batch size comes from `serial` (default 20%).
+- hosts: kube-node:!kube-master
+  any_errors_fatal: true
+  serial: "{{ serial | default('20%') }}"
+  roles:
+    - { role: upgrade/pre-upgrade, tags: pre-upgrade }
+    - { role: kubernetes/node, tags: node }
+    - { role: network_plugin, tags: network }
+    - { role: upgrade/post-upgrade, tags: post-upgrade }
+    - { role: kubernetes-apps/network_plugin, tags: network }
+
+- hosts: calico-rr
+  any_errors_fatal: true
+  roles:
+    - { role: network_plugin/calico/rr, tags: network }
+
+- hosts: k8s-cluster
+  any_errors_fatal: true
+  roles:
+    - { role: dnsmasq, when: "dns_mode == 'dnsmasq_kubedns'", tags: dnsmasq }
+    - { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf }
+
+- hosts: kube-master[0]
+  any_errors_fatal: true
+  roles:
+    - { role: kubernetes-apps, tags: apps }