Control delayed startup/restart for kubelet/etcd
In order to mitigate sporadic data races in etcd ("publish error: etcdserver: request timed out"): - Add etcd_start_delay and kubelet_start_delay (both default to 5 seconds). - Increase the default start sleep time from 1 second to the value of foo_start_delay. - Add a sleep on restart as well. - Add missing start sleep commands as appropriate. Closes: https://github.com/kubespray/kargo/issues/342 Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
This commit is contained in:
parent
1de127470f
commit
13f5217d88
18 changed files with 28 additions and 4 deletions
|
@ -30,5 +30,6 @@
|
||||||
service:
|
service:
|
||||||
name: kubelet
|
name: kubelet
|
||||||
state: restarted
|
state: restarted
|
||||||
|
sleep: "{{ kubelet_start_delay|int }}"
|
||||||
when: "{{ inventory_hostname in groups['kube-master'] }}"
|
when: "{{ inventory_hostname in groups['kube-master'] }}"
|
||||||
ignore_errors: true
|
ignore_errors: true
|
||||||
|
|
|
@ -28,10 +28,12 @@
|
||||||
service:
|
service:
|
||||||
name: etcd
|
name: etcd
|
||||||
state: restarted
|
state: restarted
|
||||||
|
sleep: "{{ etcd_start_delay|int }}"
|
||||||
when: is_etcd_master
|
when: is_etcd_master
|
||||||
|
|
||||||
- name: reload etcd-proxy
|
- name: reload etcd-proxy
|
||||||
service:
|
service:
|
||||||
name: etcd-proxy
|
name: etcd-proxy
|
||||||
state: restarted
|
state: restarted
|
||||||
|
sleep: "{{ etcd_start_delay|int }}"
|
||||||
when: is_etcd_proxy
|
when: is_etcd_proxy
|
||||||
|
|
|
@ -50,7 +50,7 @@ do_status()
|
||||||
do_start()
|
do_start()
|
||||||
{
|
{
|
||||||
{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_member_name | default("etcd-proxy") }} &>/dev/null || true
|
{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_member_name | default("etcd-proxy") }} &>/dev/null || true
|
||||||
sleep 1
|
sleep {{ etcd_start_delay }}
|
||||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||||
$DAEMON_ARGS \
|
$DAEMON_ARGS \
|
||||||
|| return 2
|
|| return 2
|
||||||
|
|
|
@ -41,6 +41,7 @@ do_status()
|
||||||
#
|
#
|
||||||
do_start()
|
do_start()
|
||||||
{
|
{
|
||||||
|
sleep {{ etcd_start_delay }}
|
||||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||||
$DAEMON_ARGS \
|
$DAEMON_ARGS \
|
||||||
|| return 2
|
|| return 2
|
||||||
|
|
|
@ -51,7 +51,7 @@ do_status()
|
||||||
do_start()
|
do_start()
|
||||||
{
|
{
|
||||||
{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_proxy_member_name | default("etcd-proxy") }} &>/dev/null || true
|
{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_proxy_member_name | default("etcd-proxy") }} &>/dev/null || true
|
||||||
sleep 1
|
sleep {{ etcd_start_delay }}
|
||||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||||
$DAEMON_ARGS \
|
$DAEMON_ARGS \
|
||||||
|| return 2
|
|| return 2
|
||||||
|
|
|
@ -42,6 +42,7 @@ do_status()
|
||||||
#
|
#
|
||||||
do_start()
|
do_start()
|
||||||
{
|
{
|
||||||
|
sleep {{ etcd_start_delay }}
|
||||||
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \
|
||||||
$DAEMON_ARGS \
|
$DAEMON_ARGS \
|
||||||
|| return 2
|
|| return 2
|
||||||
|
|
|
@ -23,6 +23,8 @@ ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_mem
|
||||||
ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_member_name | default("etcd-proxy") }}
|
ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_member_name | default("etcd-proxy") }}
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=15s
|
RestartSec=15s
|
||||||
|
OnStartupSec={{ etcd_start_delay }}
|
||||||
|
OnBootSec={{ etcd_start_delay }}
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
|
@ -11,6 +11,8 @@ NotifyAccess=all
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=10s
|
RestartSec=10s
|
||||||
LimitNOFILE=40000
|
LimitNOFILE=40000
|
||||||
|
OnStartupSec={{ etcd_start_delay }}
|
||||||
|
OnBootSec={{ etcd_start_delay }}
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
|
@ -23,6 +23,8 @@ ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_pro
|
||||||
ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_proxy_member_name | default("etcd-proxy") }}
|
ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_proxy_member_name | default("etcd-proxy") }}
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=15s
|
RestartSec=15s
|
||||||
|
OnStartupSec={{ etcd_start_delay }}
|
||||||
|
OnBootSec={{ etcd_start_delay }}
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
|
@ -14,6 +14,8 @@ NotifyAccess=all
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=10s
|
RestartSec=10s
|
||||||
LimitNOFILE=40000
|
LimitNOFILE=40000
|
||||||
|
OnStartupSec={{ etcd_start_delay }}
|
||||||
|
OnBootSec={{ etcd_start_delay }}
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
service:
|
service:
|
||||||
name: kubelet
|
name: kubelet
|
||||||
state: restarted
|
state: restarted
|
||||||
|
sleep: "{{ kubelet_start_delay|int }}"
|
||||||
|
|
||||||
- name: Master | wait for kube-scheduler
|
- name: Master | wait for kube-scheduler
|
||||||
uri: url=http://localhost:10251/healthz
|
uri: url=http://localhost:10251/healthz
|
||||||
|
|
|
@ -13,3 +13,4 @@
|
||||||
service:
|
service:
|
||||||
name: kubelet
|
name: kubelet
|
||||||
state: restarted
|
state: restarted
|
||||||
|
sleep: "{{ kubelet_start_delay|int }}"
|
||||||
|
|
|
@ -40,7 +40,7 @@ DAEMON_USER=root
|
||||||
do_start()
|
do_start()
|
||||||
{
|
{
|
||||||
/usr/bin/docker rm -f kubelet &>/dev/null || true
|
/usr/bin/docker rm -f kubelet &>/dev/null || true
|
||||||
sleep 1
|
sleep {{ kubelet_start_delay }}
|
||||||
# Return
|
# Return
|
||||||
# 0 if daemon has been started
|
# 0 if daemon has been started
|
||||||
# 1 if daemon was already running
|
# 1 if daemon was already running
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
sleep {{ kubelet_start_delay }}
|
||||||
/usr/bin/docker run --privileged --rm \
|
/usr/bin/docker run --privileged --rm \
|
||||||
--net=host --pid=host --name=kubelet \
|
--net=host --pid=host --name=kubelet \
|
||||||
-v /etc/cni:/etc/cni:ro \
|
-v /etc/cni:/etc/cni:ro \
|
||||||
|
|
|
@ -28,6 +28,8 @@ ExecStartPre=-/usr/bin/docker rm -f kubelet
|
||||||
ExecReload=/usr/bin/docker restart kubelet
|
ExecReload=/usr/bin/docker restart kubelet
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=10s
|
RestartSec=10s
|
||||||
|
OnStartupSec={{ kubelet_start_delay }}
|
||||||
|
OnBootSec={{ kubelet_start_delay }}
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
|
@ -35,7 +35,7 @@ start() {
|
||||||
echo "Docker executable $exec not found"
|
echo "Docker executable $exec not found"
|
||||||
else
|
else
|
||||||
echo "You do not have permission to execute the Docker executable $exec"
|
echo "You do not have permission to execute the Docker executable $exec"
|
||||||
fi
|
fi
|
||||||
exit 5
|
exit 5
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -44,6 +44,7 @@ start() {
|
||||||
if ! [ -f $pidfile ]; then
|
if ! [ -f $pidfile ]; then
|
||||||
printf "Starting $prog:\t"
|
printf "Starting $prog:\t"
|
||||||
echo "\n$(date)\n" >> $logfile
|
echo "\n$(date)\n" >> $logfile
|
||||||
|
sleep {{ kubelet_start_delay }}
|
||||||
$exec $DAEMON_ARGS &>> $logfile &
|
$exec $DAEMON_ARGS &>> $logfile &
|
||||||
pid=$!
|
pid=$!
|
||||||
echo $pid >> $pidfile
|
echo $pid >> $pidfile
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
---
|
---
|
||||||
run_gitinfos: false
|
run_gitinfos: false
|
||||||
|
|
||||||
|
# Control delayed startup/restart for services, in seconds
|
||||||
|
etcd_start_delay: 5
|
||||||
|
kubelet_start_delay: 5
|
||||||
|
|
||||||
# This directory is where all the additional scripts go
|
# This directory is where all the additional scripts go
|
||||||
# that Kubernetes normally puts in /srv/kubernetes.
|
# that Kubernetes normally puts in /srv/kubernetes.
|
||||||
# This puts them in a sane location
|
# This puts them in a sane location
|
||||||
|
|
|
@ -8,3 +8,4 @@
|
||||||
service:
|
service:
|
||||||
name: kubelet
|
name: kubelet
|
||||||
state: restarted
|
state: restarted
|
||||||
|
sleep: "{{ kubelet_start_delay|int }}"
|
||||||
|
|
Loading…
Reference in a new issue