Control delayed startup/restart for kubelet/etcd
In order to mitigate sporadic data races in etcd ("publish error: etcdserver: request timed out"): - Add etcd_start_delay and kubelet_start_delay (both default to 5 sec.) - Increase the default start sleep times from 1 sec. to foo_start_delay - Add sleeping on restart as well. - Add missing start sleep commands as appropriate. Closes: https://github.com/kubespray/kargo/issues/342 Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
This commit is contained in:
parent
1de127470f
commit
13f5217d88
18 changed files with 28 additions and 4 deletions
|
@ -30,5 +30,6 @@
|
|||
service:
|
||||
name: kubelet
|
||||
state: restarted
|
||||
sleep: "{{ kubelet_start_delay|int }}"
|
||||
when: "{{ inventory_hostname in groups['kube-master'] }}"
|
||||
ignore_errors: true
|
||||
|
|
|
@ -28,10 +28,12 @@
|
|||
service:
|
||||
name: etcd
|
||||
state: restarted
|
||||
sleep: "{{ etcd_start_delay|int }}"
|
||||
when: is_etcd_master
|
||||
|
||||
- name: reload etcd-proxy
|
||||
service:
|
||||
name: etcd-proxy
|
||||
state: restarted
|
||||
sleep: "{{ etcd_start_delay|int }}"
|
||||
when: is_etcd_proxy
|
||||
|
|
|
@ -50,7 +50,7 @@ do_status()
|
|||
do_start()
|
||||
{
|
||||
{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_member_name | default("etcd-proxy") }} &>/dev/null || true
|
||||
sleep 1
|
||||
sleep {{ etcd_start_delay }}
|
||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||
$DAEMON_ARGS \
|
||||
|| return 2
|
||||
|
|
|
@ -41,6 +41,7 @@ do_status()
|
|||
#
|
||||
do_start()
|
||||
{
|
||||
sleep {{ etcd_start_delay }}
|
||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||
$DAEMON_ARGS \
|
||||
|| return 2
|
||||
|
|
|
@ -51,7 +51,7 @@ do_status()
|
|||
do_start()
|
||||
{
|
||||
{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_proxy_member_name | default("etcd-proxy") }} &>/dev/null || true
|
||||
sleep 1
|
||||
sleep {{ etcd_start_delay }}
|
||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||
$DAEMON_ARGS \
|
||||
|| return 2
|
||||
|
|
|
@ -42,6 +42,7 @@ do_status()
|
|||
#
|
||||
do_start()
|
||||
{
|
||||
sleep {{ etcd_start_delay }}
|
||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||
$DAEMON_ARGS \
|
||||
|| return 2
|
||||
|
|
|
@ -23,6 +23,8 @@ ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_mem
|
|||
ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_member_name | default("etcd-proxy") }}
|
||||
Restart=always
|
||||
RestartSec=15s
|
||||
OnStartupSec={{ etcd_start_delay }}
|
||||
OnBootSec={{ etcd_start_delay }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -11,6 +11,8 @@ NotifyAccess=all
|
|||
Restart=always
|
||||
RestartSec=10s
|
||||
LimitNOFILE=40000
|
||||
OnStartupSec={{ etcd_start_delay }}
|
||||
OnBootSec={{ etcd_start_delay }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -23,6 +23,8 @@ ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_pro
|
|||
ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_proxy_member_name | default("etcd-proxy") }}
|
||||
Restart=always
|
||||
RestartSec=15s
|
||||
OnStartupSec={{ etcd_start_delay }}
|
||||
OnBootSec={{ etcd_start_delay }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -14,6 +14,8 @@ NotifyAccess=all
|
|||
Restart=always
|
||||
RestartSec=10s
|
||||
LimitNOFILE=40000
|
||||
OnStartupSec={{ etcd_start_delay }}
|
||||
OnBootSec={{ etcd_start_delay }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
service:
|
||||
name: kubelet
|
||||
state: restarted
|
||||
sleep: "{{ kubelet_start_delay|int }}"
|
||||
|
||||
- name: Master | wait for kube-scheduler
|
||||
uri: url=http://localhost:10251/healthz
|
||||
|
|
|
@ -13,3 +13,4 @@
|
|||
service:
|
||||
name: kubelet
|
||||
state: restarted
|
||||
sleep: "{{ kubelet_start_delay|int }}"
|
||||
|
|
|
@ -40,7 +40,7 @@ DAEMON_USER=root
|
|||
do_start()
|
||||
{
|
||||
/usr/bin/docker rm -f kubelet &>/dev/null || true
|
||||
sleep 1
|
||||
sleep {{ kubelet_start_delay }}
|
||||
# Return
|
||||
# 0 if daemon has been started
|
||||
# 1 if daemon was already running
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/bin/bash
|
||||
sleep {{ kubelet_start_delay }}
|
||||
/usr/bin/docker run --privileged --rm \
|
||||
--net=host --pid=host --name=kubelet \
|
||||
-v /etc/cni:/etc/cni:ro \
|
||||
|
|
|
@ -28,6 +28,8 @@ ExecStartPre=-/usr/bin/docker rm -f kubelet
|
|||
ExecReload=/usr/bin/docker restart kubelet
|
||||
Restart=always
|
||||
RestartSec=10s
|
||||
OnStartupSec={{ kubelet_start_delay }}
|
||||
OnBootSec={{ kubelet_start_delay }}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -35,7 +35,7 @@ start() {
|
|||
echo "Docker executable $exec not found"
|
||||
else
|
||||
echo "You do not have permission to execute the Docker executable $exec"
|
||||
fi
|
||||
fi
|
||||
exit 5
|
||||
fi
|
||||
|
||||
|
@ -44,6 +44,7 @@ start() {
|
|||
if ! [ -f $pidfile ]; then
|
||||
printf "Starting $prog:\t"
|
||||
echo "\n$(date)\n" >> $logfile
|
||||
sleep {{ kubelet_start_delay }}
|
||||
$exec $DAEMON_ARGS &>> $logfile &
|
||||
pid=$!
|
||||
echo $pid >> $pidfile
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
---
|
||||
run_gitinfos: false
|
||||
|
||||
# Control delayed startup/restart for services, in seconds
|
||||
etcd_start_delay: 5
|
||||
kubelet_start_delay: 5
|
||||
|
||||
# This directory is where all the additional scripts go
|
||||
# that Kubernetes normally puts in /srv/kubernetes.
|
||||
# This puts them in a sane location
|
||||
|
|
|
@ -8,3 +8,4 @@
|
|||
service:
|
||||
name: kubelet
|
||||
state: restarted
|
||||
sleep: "{{ kubelet_start_delay|int }}"
|
||||
|
|
Loading…
Reference in a new issue