Merge pull request #1029 from mattymo/graceful

Add graceful upgrade process
Antoine Legrand · 2017-02-17 21:24:32 +01:00 · committed by GitHub
commit b84cc14694
7 changed files with 158 additions and 34 deletions

.gitlab-ci.yml

@@ -101,8 +101,8 @@ before_script:
   # Check out latest tag if testing upgrade
   # Uncomment when gitlab kargo repo has tags
-  #- test "${UPGRADE_TEST}" = "true" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
-  - test "${UPGRADE_TEST}" = "true" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0
+  #- test "${UPGRADE_TEST}" != "false" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+  - test "${UPGRADE_TEST}" != "false" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0

   # Create cluster

@@ -127,9 +127,10 @@ before_script:
     cluster.yml

   # Repeat deployment if testing upgrade
+  #FIXME(mattymo): repeat "Create cluster" above without duplicating code
   - >
-    if [ "${UPGRADE_TEST}" = "true" ]; then
+    if [ "${UPGRADE_TEST}" != "false" ]; then
+      test "${UPGRADE_TEST}" == "basic" && PLAYBOOK="cluster.yml";
+      test "${UPGRADE_TEST}" == "graceful" && PLAYBOOK="upgrade-cluster.yml";
       pip install ansible==2.2.1.0;
       git checkout "${CI_BUILD_REF}";
       ansible-playbook -i inventory/inventory.ini -b --become-user=root --private-key=${HOME}/.ssh/id_rsa -u $SSH_USER

@@ -149,7 +150,7 @@ before_script:
     -e resolvconf_mode=${RESOLVCONF_MODE}
     -e weave_cpu_requests=${WEAVE_CPU_LIMIT}
     -e weave_cpu_limit=${WEAVE_CPU_LIMIT}
-    cluster.yml;
+    $PLAYBOOK;
     fi

  # Tests Cases

@@ -254,7 +255,7 @@ before_script:
   KUBE_NETWORK_PLUGIN: canal
   CLOUD_IMAGE: debian-8-kubespray
   CLOUD_REGION: us-east1-b
-  UPGRADE_TEST: "true"
+  UPGRADE_TEST: "basic"
   CLUSTER_MODE: ha

 .rhel7_weave_variables: &rhel7_weave_variables

@@ -262,7 +263,6 @@ before_script:
   KUBE_NETWORK_PLUGIN: weave
   CLOUD_IMAGE: rhel-7
   CLOUD_REGION: europe-west1-b
-  UPGRADE_TEST: "true"
   CLUSTER_MODE: default

 .centos7_flannel_variables: &centos7_flannel_variables

@@ -278,6 +278,7 @@ before_script:
   CLOUD_IMAGE: debian-8-kubespray
   CLOUD_REGION: us-central1-b
   CLUSTER_MODE: default
+  UPGRADE_TEST: "graceful"

 .coreos_canal_variables: &coreos_canal_variables
   # stage: deploy-gce-part2

@@ -444,7 +445,7 @@ rhel7-weave-triggers:
   when: on_success
   only: ['triggers']

-debian8-calico:
+debian8-calico-upgrade:
   stage: deploy-gce-part2
   <<: *job
   <<: *gce

@@ -540,7 +541,7 @@ coreos-alpha-weave-ha:
   except: ['triggers']
   only: ['master', /^pr-.*$/]

-ubuntu-rkt-sep:
+ubuntu-rkt-sep-upgrade:
   stage: deploy-gce-part1
   <<: *job
   <<: *gce

docs/upgrades.md

@@ -18,7 +18,7 @@ versions. Here are all version vars for each component:
 * flannel_version
 * kubedns_version

-#### Example
+#### Unsafe upgrade example

 If you wanted to upgrade just kube_version from v1.4.3 to v1.4.6, you could
 deploy the following way:

@@ -33,6 +33,20 @@ And then repeat with v1.4.6 as kube_version:
 ansible-playbook cluster.yml -i inventory/inventory.cfg -e kube_version=v1.4.6
 ```

+#### Graceful upgrade
+
+Kargo also supports cordoning, draining and uncordoning of nodes when performing
+a cluster upgrade. There is a separate playbook used for this purpose. It is
+important to note that upgrade-cluster.yml can only be used for upgrading an
+existing cluster. That means there must be at least 1 kube-master already
+deployed.
+
+```
+git fetch origin
+git checkout origin/master
+ansible-playbook upgrade-cluster.yml -i inventory/inventory.cfg
+```
+
 #### Upgrade order

 As mentioned above, components are upgraded in the order in which they were
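A note on batch sizing: the worker play in the new upgrade-cluster.yml (shown at the bottom of this diff) batches nodes with `serial: "{{ serial | default('20%') }}"`, so the number of nodes drained and upgraded at once can be overridden at run time. A minimal sketch, reusing the inventory path from the docs above:

```
# Cordon, drain and upgrade one worker at a time
# instead of the default 20% batch.
ansible-playbook upgrade-cluster.yml -i inventory/inventory.cfg -e serial=1
```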

roles/kubernetes-apps/network_plugin/weave/tasks/main.yml

@@ -1,6 +1,5 @@
 #FIXME: remove if kubernetes/features#124 is implemented
 - name: Weave | Purge old weave daemonset
-  run_once: true
   kube:
     name: "weave-net"
     kubectl: "{{ bin_dir }}/kubectl"

@@ -12,7 +11,6 @@
 - name: Weave | Start Resources
-  run_once: true
   kube:
     name: "weave-net"
     kubectl: "{{ bin_dir }}/kubectl"

@@ -21,17 +19,16 @@
     namespace: "{{system_namespace}}"
     state: "{{ item | ternary('latest','present') }}"
   with_items: "{{ weave_manifest.changed }}"
-  delegate_to: "{{groups['kube-master'][0]}}"
+  when: inventory_hostname == groups['kube-master'][0]

 - name: "Weave | wait for weave to become available"
   uri:
     url: http://127.0.0.1:6784/status
     return_content: yes
-  run_once: true
   register: weave_status
-  retries: 12
+  retries: 180
   delay: 10
   until: "{{ weave_status.status == 200 and
          'Status: ready' in weave_status.content }}"
-  delegate_to: "{{groups['kube-master'][0]}}"
+  when: inventory_hostname == groups['kube-master'][0]

roles/upgrade/post-upgrade/tasks/main.yml (new file)

@@ -0,0 +1,5 @@
+---
+- name: Uncordon node
+  command: "{{ bin_dir }}/kubectl uncordon {{ ansible_hostname }}"
+  delegate_to: "{{ groups['kube-master'][0] }}"

roles/upgrade/pre-upgrade/tasks/main.yml (new file)

@@ -0,0 +1,12 @@
+---
+- name: Cordon node
+  command: "{{ bin_dir }}/kubectl cordon {{ ansible_hostname }}"
+  delegate_to: "{{ groups['kube-master'][0] }}"
+
+- name: Drain node
+  command: "{{ bin_dir }}/kubectl drain --force --ignore-daemonsets --grace-period 30 --delete-local-data {{ ansible_hostname }}"
+  delegate_to: "{{ groups['kube-master'][0] }}"
+
+- name: Sleep for grace period for draining
+  pause: seconds=30
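For anyone verifying these tasks by hand: a cordoned and drained node reports `SchedulingDisabled` in its status, and only DaemonSet-managed pods should remain on it. A quick check from the first master (a sketch; `<node-name>` is a placeholder for the node being upgraded):

```
kubectl get nodes                  # drained node shows STATUS "Ready,SchedulingDisabled"
kubectl get pods --all-namespaces -o wide | grep <node-name>   # only DaemonSet pods should remain
```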

tests/templates/inventory-gce.j2

@@ -1,48 +1,51 @@
-node1 ansible_ssh_host={{gce.instance_data[0].public_ip}}
-node2 ansible_ssh_host={{gce.instance_data[1].public_ip}}
+{% set node1 = gce.instance_data[0].name %}
+{% set node2 = gce.instance_data[1].name %}
+{{node1}} ansible_ssh_host={{gce.instance_data[0].public_ip}}
+{{node2}} ansible_ssh_host={{gce.instance_data[1].public_ip}}
 {% if mode is defined and mode in ["separate", "ha"] %}
-node3 ansible_ssh_host={{gce.instance_data[2].public_ip}}
+{% set node3 = gce.instance_data[2].name %}
+{{node3}} ansible_ssh_host={{gce.instance_data[2].public_ip}}
 {% endif %}
 {% if mode is defined and mode == "separate" %}
 [kube-master]
-node1
+{{node1}}
 [kube-node]
-node2
+{{node2}}
 [etcd]
-node3
+{{node3}}
 [vault]
-node3
+{{node3}}
 {% elif mode is defined and mode == "ha" %}
 [kube-master]
-node1
-node2
+{{node1}}
+{{node2}}
 [kube-node]
-node3
+{{node3}}
 [etcd]
-node2
-node3
+{{node2}}
+{{node3}}
 [vault]
-node2
-node3
+{{node2}}
+{{node3}}
 {% else %}
 [kube-master]
-node1
+{{node1}}
 [kube-node]
-node2
+{{node2}}
 [etcd]
-node1
+{{node1}}
 [vault]
-node1
+{{node1}}
 {% endif %}
 [k8s-cluster:children]

upgrade-cluster.yml (new file, 92 lines)

@@ -0,0 +1,92 @@
+---
+- hosts: localhost
+  gather_facts: False
+  roles:
+    - bastion-ssh-config
+  tags: [localhost, bastion]
+
+- hosts: k8s-cluster:etcd:calico-rr
+  any_errors_fatal: true
+  gather_facts: false
+  vars:
+    # Need to disable pipelining for bootstrap-os as some systems have requiretty in sudoers set, which makes pipelining
+    # fail. bootstrap-os fixes this on these systems, so in later plays it can be enabled.
+    ansible_ssh_pipelining: false
+  roles:
+    - bootstrap-os
+  tags:
+    - bootstrap-os
+
+- hosts: k8s-cluster:etcd:calico-rr
+  any_errors_fatal: true
+  vars:
+    ansible_ssh_pipelining: true
+  gather_facts: true
+
+- hosts: k8s-cluster:etcd:calico-rr
+  any_errors_fatal: true
+  roles:
+    - { role: kernel-upgrade, tags: kernel-upgrade, when: kernel_upgrade is defined and kernel_upgrade }
+    - { role: kubernetes/preinstall, tags: preinstall }
+    - { role: docker, tags: docker }
+    - role: rkt
+      tags: rkt
+      when: "'rkt' in [etcd_deployment_type, kubelet_deployment_type, vault_deployment_type]"
+
+- hosts: etcd:k8s-cluster:vault
+  any_errors_fatal: true
+  roles:
+    - { role: vault, tags: vault, vault_bootstrap: true, when: "cert_management == 'vault'" }
+
+- hosts: etcd:!k8s-cluster
+  any_errors_fatal: true
+  roles:
+    - { role: etcd, tags: etcd }
+
+- hosts: k8s-cluster
+  any_errors_fatal: true
+  roles:
+    - { role: etcd, tags: etcd }
+
+- hosts: etcd:k8s-cluster:vault
+  any_errors_fatal: true
+  roles:
+    - { role: vault, tags: vault, when: "cert_management == 'vault'" }
+
+#Handle upgrades to master components first to maintain backwards compat.
+- hosts: kube-master
+  any_errors_fatal: true
+  serial: 1
+  roles:
+    - { role: upgrade/pre-upgrade, tags: pre-upgrade }
+    - { role: kubernetes/node, tags: node }
+    - { role: kubernetes/master, tags: master }
+    - { role: network_plugin, tags: network }
+    - { role: upgrade/post-upgrade, tags: post-upgrade }
+
+#Finally handle worker upgrades, based on given batch size
+- hosts: kube-node:!kube-master
+  any_errors_fatal: true
+  serial: "{{ serial | default('20%') }}"
+  roles:
+    - { role: upgrade/pre-upgrade, tags: pre-upgrade }
+    - { role: kubernetes/node, tags: node }
+    - { role: network_plugin, tags: network }
+    - { role: upgrade/post-upgrade, tags: post-upgrade }
+    - { role: kubernetes-apps/network_plugin, tags: network }
+
+- hosts: calico-rr
+  any_errors_fatal: true
+  roles:
+    - { role: network_plugin/calico/rr, tags: network }
+
+- hosts: k8s-cluster
+  any_errors_fatal: true
+  roles:
+    - { role: dnsmasq, when: "dns_mode == 'dnsmasq_kubedns'", tags: dnsmasq }
+    - { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf }
+
+- hosts: kube-master[0]
+  any_errors_fatal: true
+  roles:
+    - { role: kubernetes-apps, tags: apps }
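Putting the pieces together, the graceful CI path above (UPGRADE_TEST="graceful") ends up invoking this playbook roughly as follows; a sketch assembled from the CI job's flags, with the job's `-e` extra-vars omitted for brevity ($SSH_USER and the key path are values the CI environment provides):

```
ansible-playbook -i inventory/inventory.ini -b --become-user=root \
  --private-key=${HOME}/.ssh/id_rsa -u $SSH_USER \
  upgrade-cluster.yml
```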