From 13f5217d887c2c01ebbdd11ee42e532ef7025922 Mon Sep 17 00:00:00 2001 From: Bogdan Dobrelya Date: Tue, 18 Oct 2016 12:15:33 +0200 Subject: [PATCH] Control delayed startup/restart for kubelet/etcd In order to mitigate sporadic data races in etcd ("publish error: etcdserver: request timed out"): - Add etcd_start_delay and kubelet_start_delay (both default to 5 seconds). - Increase the default start sleep times from 1 second to the corresponding etcd_start_delay/kubelet_start_delay value. - Add a sleep on restart as well. - Add missing start sleep commands as appropriate. Closes: https://github.com/kubespray/kargo/issues/342 Signed-off-by: Bogdan Dobrelya --- roles/dnsmasq/handlers/main.yml | 1 + roles/etcd/handlers/main.yml | 2 ++ roles/etcd/templates/deb-etcd-docker.initd.j2 | 2 +- roles/etcd/templates/deb-etcd-host.initd.j2 | 1 + roles/etcd/templates/deb-etcd-proxy-docker.initd.j2 | 2 +- roles/etcd/templates/deb-etcd-proxy-host.initd.j2 | 1 + roles/etcd/templates/etcd-docker.service.j2 | 2 ++ roles/etcd/templates/etcd-host.service.j2 | 2 ++ roles/etcd/templates/etcd-proxy-docker.service.j2 | 2 ++ roles/etcd/templates/etcd-proxy-host.service.j2 | 2 ++ roles/kubernetes/master/handlers/main.yml | 1 + roles/kubernetes/node/handlers/main.yml | 1 + roles/kubernetes/node/templates/deb-kubelet.initd.j2 | 2 +- roles/kubernetes/node/templates/kubelet-container.j2 | 1 + roles/kubernetes/node/templates/kubelet.service.j2 | 2 ++ roles/kubernetes/node/templates/rh-kubelet.initd.j2 | 3 ++- roles/kubernetes/preinstall/defaults/main.yml | 4 ++++ roles/network_plugin/flannel/handlers/main.yml | 1 + 18 files changed, 28 insertions(+), 4 deletions(-) diff --git a/roles/dnsmasq/handlers/main.yml b/roles/dnsmasq/handlers/main.yml index 4bdfd10f6..599be2e85 100644 --- a/roles/dnsmasq/handlers/main.yml +++ b/roles/dnsmasq/handlers/main.yml @@ -30,5 +30,6 @@ service: name: kubelet state: restarted + sleep: "{{ kubelet_start_delay|int }}" when: "{{ inventory_hostname in groups['kube-master'] }}" ignore_errors: true diff --git 
a/roles/etcd/handlers/main.yml b/roles/etcd/handlers/main.yml index 693754a06..f9155ca09 100644 --- a/roles/etcd/handlers/main.yml +++ b/roles/etcd/handlers/main.yml @@ -28,10 +28,12 @@ service: name: etcd state: restarted + sleep: "{{ etcd_start_delay|int }}" when: is_etcd_master - name: reload etcd-proxy service: name: etcd-proxy state: restarted + sleep: "{{ etcd_start_delay|int }}" when: is_etcd_proxy diff --git a/roles/etcd/templates/deb-etcd-docker.initd.j2 b/roles/etcd/templates/deb-etcd-docker.initd.j2 index a83aae184..3acfcd4b2 100644 --- a/roles/etcd/templates/deb-etcd-docker.initd.j2 +++ b/roles/etcd/templates/deb-etcd-docker.initd.j2 @@ -50,7 +50,7 @@ do_status() do_start() { {{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_member_name | default("etcd-proxy") }} &>/dev/null || true - sleep 1 + sleep {{ etcd_start_delay }} start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \ $DAEMON_ARGS \ || return 2 diff --git a/roles/etcd/templates/deb-etcd-host.initd.j2 b/roles/etcd/templates/deb-etcd-host.initd.j2 index b27c0f49a..dc244a994 100644 --- a/roles/etcd/templates/deb-etcd-host.initd.j2 +++ b/roles/etcd/templates/deb-etcd-host.initd.j2 @@ -41,6 +41,7 @@ do_status() # do_start() { + sleep {{ etcd_start_delay }} start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \ $DAEMON_ARGS \ || return 2 diff --git a/roles/etcd/templates/deb-etcd-proxy-docker.initd.j2 b/roles/etcd/templates/deb-etcd-proxy-docker.initd.j2 index ad0338a09..e434060de 100644 --- a/roles/etcd/templates/deb-etcd-proxy-docker.initd.j2 +++ b/roles/etcd/templates/deb-etcd-proxy-docker.initd.j2 @@ -51,7 +51,7 @@ do_status() do_start() { {{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_proxy_member_name | default("etcd-proxy") }} &>/dev/null || true - sleep 1 + sleep {{ etcd_start_delay }} start-stop-daemon --background --start --quiet 
--make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \ $DAEMON_ARGS \ || return 2 diff --git a/roles/etcd/templates/deb-etcd-proxy-host.initd.j2 b/roles/etcd/templates/deb-etcd-proxy-host.initd.j2 index d0858bb2f..40455a18a 100644 --- a/roles/etcd/templates/deb-etcd-proxy-host.initd.j2 +++ b/roles/etcd/templates/deb-etcd-proxy-host.initd.j2 @@ -42,6 +42,7 @@ do_status() # do_start() { + sleep {{ etcd_start_delay }} start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \ $DAEMON_ARGS \ || return 2 diff --git a/roles/etcd/templates/etcd-docker.service.j2 b/roles/etcd/templates/etcd-docker.service.j2 index a37759fec..c66f2ed4e 100644 --- a/roles/etcd/templates/etcd-docker.service.j2 +++ b/roles/etcd/templates/etcd-docker.service.j2 @@ -23,6 +23,8 @@ ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_mem ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_member_name | default("etcd-proxy") }} Restart=always RestartSec=15s +OnStartupSec={{ etcd_start_delay }} +OnBootSec={{ etcd_start_delay }} [Install] WantedBy=multi-user.target diff --git a/roles/etcd/templates/etcd-host.service.j2 b/roles/etcd/templates/etcd-host.service.j2 index 8a91fab92..38b35dd7b 100644 --- a/roles/etcd/templates/etcd-host.service.j2 +++ b/roles/etcd/templates/etcd-host.service.j2 @@ -11,6 +11,8 @@ NotifyAccess=all Restart=always RestartSec=10s LimitNOFILE=40000 +OnStartupSec={{ etcd_start_delay }} +OnBootSec={{ etcd_start_delay }} [Install] WantedBy=multi-user.target diff --git a/roles/etcd/templates/etcd-proxy-docker.service.j2 b/roles/etcd/templates/etcd-proxy-docker.service.j2 index bf70f0e7f..0828c95ca 100644 --- a/roles/etcd/templates/etcd-proxy-docker.service.j2 +++ b/roles/etcd/templates/etcd-proxy-docker.service.j2 @@ -23,6 +23,8 @@ ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_pro ExecStop={{ docker_bin_dir | default("/usr/bin") 
}}/docker stop {{ etcd_proxy_member_name | default("etcd-proxy") }} Restart=always RestartSec=15s +OnStartupSec={{ etcd_start_delay }} +OnBootSec={{ etcd_start_delay }} [Install] WantedBy=multi-user.target diff --git a/roles/etcd/templates/etcd-proxy-host.service.j2 b/roles/etcd/templates/etcd-proxy-host.service.j2 index 4ea5f7bc9..c6c9cb057 100644 --- a/roles/etcd/templates/etcd-proxy-host.service.j2 +++ b/roles/etcd/templates/etcd-proxy-host.service.j2 @@ -14,6 +14,8 @@ NotifyAccess=all Restart=always RestartSec=10s LimitNOFILE=40000 +OnStartupSec={{ etcd_start_delay }} +OnBootSec={{ etcd_start_delay }} [Install] WantedBy=multi-user.target diff --git a/roles/kubernetes/master/handlers/main.yml b/roles/kubernetes/master/handlers/main.yml index 3d69cba7d..1132d713b 100644 --- a/roles/kubernetes/master/handlers/main.yml +++ b/roles/kubernetes/master/handlers/main.yml @@ -21,6 +21,7 @@ service: name: kubelet state: restarted + sleep: "{{ kubelet_start_delay|int }}" - name: Master | wait for kube-scheduler uri: url=http://localhost:10251/healthz diff --git a/roles/kubernetes/node/handlers/main.yml b/roles/kubernetes/node/handlers/main.yml index 5991bebf3..508cbbc54 100644 --- a/roles/kubernetes/node/handlers/main.yml +++ b/roles/kubernetes/node/handlers/main.yml @@ -13,3 +13,4 @@ service: name: kubelet state: restarted + sleep: "{{ kubelet_start_delay|int }}" diff --git a/roles/kubernetes/node/templates/deb-kubelet.initd.j2 b/roles/kubernetes/node/templates/deb-kubelet.initd.j2 index 5d5184efe..0cdedd522 100644 --- a/roles/kubernetes/node/templates/deb-kubelet.initd.j2 +++ b/roles/kubernetes/node/templates/deb-kubelet.initd.j2 @@ -40,7 +40,7 @@ DAEMON_USER=root do_start() { /usr/bin/docker rm -f kubelet &>/dev/null || true - sleep 1 + sleep {{ kubelet_start_delay }} # Return # 0 if daemon has been started # 1 if daemon was already running diff --git a/roles/kubernetes/node/templates/kubelet-container.j2 b/roles/kubernetes/node/templates/kubelet-container.j2 index 
2fcc7307f..4dd7c5c2c 100644 --- a/roles/kubernetes/node/templates/kubelet-container.j2 +++ b/roles/kubernetes/node/templates/kubelet-container.j2 @@ -1,4 +1,5 @@ #!/bin/bash +sleep {{ kubelet_start_delay }} /usr/bin/docker run --privileged --rm \ --net=host --pid=host --name=kubelet \ -v /etc/cni:/etc/cni:ro \ diff --git a/roles/kubernetes/node/templates/kubelet.service.j2 b/roles/kubernetes/node/templates/kubelet.service.j2 index ad62d8562..ae60867d3 100644 --- a/roles/kubernetes/node/templates/kubelet.service.j2 +++ b/roles/kubernetes/node/templates/kubelet.service.j2 @@ -28,6 +28,8 @@ ExecStartPre=-/usr/bin/docker rm -f kubelet ExecReload=/usr/bin/docker restart kubelet Restart=always RestartSec=10s +OnStartupSec={{ kubelet_start_delay }} +OnBootSec={{ kubelet_start_delay }} [Install] WantedBy=multi-user.target diff --git a/roles/kubernetes/node/templates/rh-kubelet.initd.j2 b/roles/kubernetes/node/templates/rh-kubelet.initd.j2 index 5a709e118..001495ccc 100644 --- a/roles/kubernetes/node/templates/rh-kubelet.initd.j2 +++ b/roles/kubernetes/node/templates/rh-kubelet.initd.j2 @@ -35,7 +35,7 @@ start() { echo "Docker executable $exec not found" else echo "You do not have permission to execute the Docker executable $exec" - fi + fi exit 5 fi @@ -44,6 +44,7 @@ start() { if ! [ -f $pidfile ]; then printf "Starting $prog:\t" echo "\n$(date)\n" >> $logfile + sleep {{ kubelet_start_delay }} $exec $DAEMON_ARGS &>> $logfile & pid=$! echo $pid >> $pidfile diff --git a/roles/kubernetes/preinstall/defaults/main.yml b/roles/kubernetes/preinstall/defaults/main.yml index 5d1b2cd2e..c1acd9842 100644 --- a/roles/kubernetes/preinstall/defaults/main.yml +++ b/roles/kubernetes/preinstall/defaults/main.yml @@ -1,6 +1,10 @@ --- run_gitinfos: false +# Control delayed startup/restart for services, in seconds +etcd_start_delay: 5 +kubelet_start_delay: 5 + # This directory is where all the additional scripts go # that Kubernetes normally puts in /srv/kubernetes. 
# This puts them in a sane location diff --git a/roles/network_plugin/flannel/handlers/main.yml b/roles/network_plugin/flannel/handlers/main.yml index a503569f6..9ca2f2de9 100644 --- a/roles/network_plugin/flannel/handlers/main.yml +++ b/roles/network_plugin/flannel/handlers/main.yml @@ -8,3 +8,4 @@ service: name: kubelet state: restarted + sleep: "{{ kubelet_start_delay|int }}"