diff --git a/docs/large-deployments.md b/docs/large-deployments.md index 473f83954..20bc7fefd 100644 --- a/docs/large-deployments.md +++ b/docs/large-deployments.md @@ -20,5 +20,12 @@ For a large scaled deployments, consider the following configuration changes: ``dns_cpu_requests``, ``dns_memory_limit``, ``dns_memory_requests``. Please note that limits must always be greater than or equal to requests. +* Tune CPU/memory limits and requests. Those are located in roles' defaults + and named like ``foo_memory_limit``, ``foo_memory_requests`` and + ``foo_cpu_limit``, ``foo_cpu_requests``. Note that 'Mi' memory units for K8s + will be submitted as 'M', if applied for ``docker run``, and cpu K8s units will + end up with the 'm' skipped for docker as well. This is required as docker does not + understand k8s units well. + For example, when deploying 200 nodes, you may want to run ansible with ``--forks=50``, ``--timeout=600`` and define the ``retry_stagger: 60``. diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml index cc9a8b874..a73612b0d 100644 --- a/inventory/group_vars/all.yml +++ b/inventory/group_vars/all.yml @@ -188,6 +188,7 @@ docker_daemon_graph: "/var/lib/docker" ## An obvious use case is allowing insecure-registry access ## to self hosted registries like so: docker_options: "--insecure-registry={{ kube_service_addresses }} --graph={{ docker_daemon_graph }}" +docker_bin_dir: "/usr/bin" ## Uncomment this if you want to force overlay/overlay2 as docker storage driver ## Please note that overlay2 is only supported on newer kernels diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml index fd866a109..4f4c0c8c0 100644 --- a/roles/docker/handlers/main.yml +++ b/roles/docker/handlers/main.yml @@ -27,7 +27,7 @@ pause: seconds=10 prompt="Waiting for docker restart" - name: Docker | wait for docker - command: /usr/bin/docker images + command: "{{ docker_bin_dir }}/docker images" register: docker_ready retries: 10 delay: 5 diff 
--git a/roles/docker/templates/docker.service.j2 b/roles/docker/templates/docker.service.j2 index 39e460e96..6838868cd 100644 --- a/roles/docker/templates/docker.service.j2 +++ b/roles/docker/templates/docker.service.j2 @@ -18,7 +18,7 @@ Environment=GOTRACEBACK=crash ExecReload=/bin/kill -s HUP $MAINPID Delegate=yes KillMode=process -ExecStart=/usr/bin/docker daemon \ +ExecStart={{ docker_bin_dir }}/docker daemon \ $DOCKER_OPTS \ $DOCKER_STORAGE_OPTIONS \ $DOCKER_NETWORK_OPTIONS \ diff --git a/roles/download/tasks/main.yml b/roles/download/tasks/main.yml index e1859a1e6..7ac38449d 100644 --- a/roles/download/tasks/main.yml +++ b/roles/download/tasks/main.yml @@ -79,7 +79,7 @@ #NOTE(bogdando) this brings no docker-py deps for nodes - name: Download containers if pull is required or told to always pull - command: "/usr/bin/docker pull {{ pull_args }}" + command: "{{ docker_bin_dir }}/docker pull {{ pull_args }}" register: pull_task_result until: pull_task_result|success retries: 4 @@ -115,7 +115,7 @@ tags: facts - name: Download | save container images - shell: docker save "{{ pull_args }}" | gzip -{{ download_compress }} > "{{ fname }}" + shell: "{{ docker_bin_dir }}/docker save {{ pull_args }} | gzip -{{ download_compress }} > {{ fname }}" delegate_to: "{{ download_delegate }}" register: saved run_once: true @@ -145,6 +145,6 @@ tags: [upload, upgrade] - name: Download | load container images - shell: docker load < "{{ fname }}" + shell: "{{ docker_bin_dir }}/docker load < {{ fname }}" when: (ansible_os_family != "CoreOS" and inventory_hostname != groups['kube-master'][0] or download_delegate == "localhost") and download_run_once|bool and download.enabled|bool and download.container|bool tags: [upload, upgrade] diff --git a/roles/download/tasks/set_docker_image_facts.yml b/roles/download/tasks/set_docker_image_facts.yml index 69048c513..451e0de2b 100644 --- a/roles/download/tasks/set_docker_image_facts.yml +++ b/roles/download/tasks/set_docker_image_facts.yml @@ 
-8,7 +8,7 @@ {%- if pull_by_digest|bool %}{{download.repo}}@sha256:{{download.sha256}}{%- else -%}{{download.repo}}:{{download.tag}}{%- endif -%} - name: Register docker images info - shell: "{% raw %}/usr/bin/docker images -q | xargs /usr/bin/docker inspect -f '{{.RepoTags}},{{.RepoDigests}}'{% endraw %}" + shell: "{{ docker_bin_dir }}/docker images -q | xargs {{ docker_bin_dir }}/docker inspect -f {% raw %}'{{.RepoTags}},{{.RepoDigests}}'{% endraw %}" register: docker_images_raw failed_when: false when: not download_always_pull|bool diff --git a/roles/etcd/defaults/main.yml b/roles/etcd/defaults/main.yml index 2df4ba165..e733fe56d 100644 --- a/roles/etcd/defaults/main.yml +++ b/roles/etcd/defaults/main.yml @@ -6,3 +6,7 @@ etcd_cert_dir: "{{ etcd_config_dir }}/ssl" etcd_cert_group: root etcd_script_dir: "{{ bin_dir }}/etcd-scripts" + +# Limits +etcd_memory_limit: 512M +etcd_cpu_limit: 300m diff --git a/roles/etcd/tasks/install.yml b/roles/etcd/tasks/install.yml index aa7f32ca3..0ed3f4154 100644 --- a/roles/etcd/tasks/install.yml +++ b/roles/etcd/tasks/install.yml @@ -12,10 +12,10 @@ #Plan A: no docker-py deps - name: Install | Copy etcdctl binary from container - command: sh -c "/usr/bin/docker rm -f etcdctl-binarycopy; - /usr/bin/docker create --name etcdctl-binarycopy {{ etcd_image_repo }}:{{ etcd_image_tag }} && - /usr/bin/docker cp etcdctl-binarycopy:{{ etcd_container_bin_dir }}etcdctl {{ bin_dir }}/etcdctl && - /usr/bin/docker rm -f etcdctl-binarycopy" + command: sh -c "{{ docker_bin_dir }}/docker rm -f etcdctl-binarycopy; + {{ docker_bin_dir }}/docker create --name etcdctl-binarycopy {{ etcd_image_repo }}:{{ etcd_image_tag }} && + {{ docker_bin_dir }}/docker cp etcdctl-binarycopy:{{ etcd_container_bin_dir }}etcdctl {{ bin_dir }}/etcdctl && + {{ docker_bin_dir }}/docker rm -f etcdctl-binarycopy" when: etcd_deployment_type == "docker" register: etcd_task_result until: etcd_task_result.rc == 0 diff --git a/roles/etcd/tasks/pre_upgrade.yml 
b/roles/etcd/tasks/pre_upgrade.yml index 081702c4a..eb17e9871 100644 --- a/roles/etcd/tasks/pre_upgrade.yml +++ b/roles/etcd/tasks/pre_upgrade.yml @@ -26,12 +26,12 @@ - /etc/init.d/etcd-proxy - name: "Pre-upgrade | find etcd-proxy container" - command: docker ps -aq --filter "name=etcd-proxy*" + command: "{{ docker_bin_dir }}/docker ps -aq --filter 'name=etcd-proxy*'" register: etcd_proxy_container failed_when: false - name: "Pre-upgrade | remove etcd-proxy if it exists" - command: "docker rm -f {{item}}" + command: "{{ docker_bin_dir }}/docker rm -f {{item}}" with_items: "{{etcd_proxy_container.stdout_lines}}" - name: "Pre-upgrade | check if member list is non-SSL" diff --git a/roles/etcd/templates/deb-etcd-docker.initd.j2 b/roles/etcd/templates/deb-etcd-docker.initd.j2 index 0489cd2f5..b8ae568e9 100644 --- a/roles/etcd/templates/deb-etcd-docker.initd.j2 +++ b/roles/etcd/templates/deb-etcd-docker.initd.j2 @@ -15,7 +15,7 @@ set -a PATH=/sbin:/usr/sbin:/bin/:/usr/bin DESC="etcd k/v store" NAME=etcd -DAEMON={{ docker_bin_dir | default("/usr/bin") }}/docker +DAEMON={{ docker_bin_dir }}/docker DAEMON_EXEC=`basename $DAEMON` DAEMON_ARGS="run --restart=on-failure:5 --env-file=/etc/etcd.env \ --net=host \ @@ -50,7 +50,7 @@ do_status() # do_start() { - {{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_member_name | default("etcd-proxy") }} &>/dev/null || true + {{ docker_bin_dir }}/docker rm -f {{ etcd_member_name | default("etcd") }} &>/dev/null || true sleep 1 start-stop-daemon --background --start --quiet --make-pidfile --pidfile $PID --user $DAEMON_USER --exec $DAEMON -- \ $DAEMON_ARGS \ diff --git a/roles/etcd/templates/etcd-docker.service.j2 b/roles/etcd/templates/etcd-docker.service.j2 index d18a91f42..223d2d842 100644 --- a/roles/etcd/templates/etcd-docker.service.j2 +++ b/roles/etcd/templates/etcd-docker.service.j2 @@ -6,7 +6,7 @@ After=docker.service [Service] User=root PermissionsStartOnly=true -ExecStart={{ docker_bin_dir | default("/usr/bin") 
}}/docker run --restart=on-failure:5 \ +ExecStart={{ docker_bin_dir }}/docker run --restart=on-failure:5 \ --env-file=/etc/etcd.env \ {# TODO(mattymo): Allow docker IP binding and disable in envfile -p 2380:2380 -p 2379:2379 #} @@ -14,14 +14,15 @@ ExecStart={{ docker_bin_dir | default("/usr/bin") }}/docker run --restart=on-fai -v /etc/ssl/certs:/etc/ssl/certs:ro \ -v {{ etcd_cert_dir }}:{{ etcd_cert_dir }}:ro \ -v /var/lib/etcd:/var/lib/etcd:rw \ +--memory={{ etcd_memory_limit|regex_replace('Mi', 'M') }} --cpu-shares={{ etcd_cpu_limit|regex_replace('m', '') }} \ --name={{ etcd_member_name | default("etcd") }} \ {{ etcd_image_repo }}:{{ etcd_image_tag }} \ {% if etcd_after_v3 %} {{ etcd_container_bin_dir }}etcd {% endif %} -ExecStartPre=-{{ docker_bin_dir | default("/usr/bin") }}/docker rm -f {{ etcd_member_name | default("etcd-proxy") }} -ExecReload={{ docker_bin_dir | default("/usr/bin") }}/docker restart {{ etcd_member_name | default("etcd-proxy") }} -ExecStop={{ docker_bin_dir | default("/usr/bin") }}/docker stop {{ etcd_member_name | default("etcd-proxy") }} +ExecStartPre=-{{ docker_bin_dir }}/docker rm -f {{ etcd_member_name | default("etcd") }} +ExecReload={{ docker_bin_dir }}/docker restart {{ etcd_member_name | default("etcd") }} +ExecStop={{ docker_bin_dir }}/docker stop {{ etcd_member_name | default("etcd") }} Restart=always RestartSec=15s diff --git a/roles/kubernetes-apps/ansible/defaults/main.yml b/roles/kubernetes-apps/ansible/defaults/main.yml index dd2bd2d8a..90a5702bb 100644 --- a/roles/kubernetes-apps/ansible/defaults/main.yml +++ b/roles/kubernetes-apps/ansible/defaults/main.yml @@ -20,6 +20,12 @@ exechealthz_image_tag: "{{ exechealthz_version }}" calico_policy_image_repo: "calico/kube-policy-controller" calico_policy_image_tag: latest +# Limits for calico apps +calico_policy_controller_cpu_limit: 100m +calico_policy_controller_memory_limit: 256M +calico_policy_controller_cpu_requests: 30m +calico_policy_controller_memory_requests: 128M + # 
Netchecker deploy_netchecker: false netchecker_port: 31081 @@ -29,5 +35,19 @@ agent_img: "quay.io/l23network/mcp-netchecker-agent:v0.1" server_img: "quay.io/l23network/mcp-netchecker-server:v0.1" kubectl_image: "gcr.io/google_containers/kubectl:v0.18.0-120-gaeb4ac55ad12b1-dirty" +# Limits for netchecker apps +netchecker_agent_cpu_limit: 30m +netchecker_agent_memory_limit: 100M +netchecker_agent_cpu_requests: 15m +netchecker_agent_memory_requests: 64M +netchecker_server_cpu_limit: 100m +netchecker_server_memory_limit: 256M +netchecker_server_cpu_requests: 50m +netchecker_server_memory_requests: 128M +netchecker_kubectl_cpu_limit: 30m +netchecker_kubectl_memory_limit: 128M +netchecker_kubectl_cpu_requests: 15m +netchecker_kubectl_memory_requests: 64M + # SSL etcd_cert_dir: "/etc/ssl/etcd/ssl" diff --git a/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 b/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 index eb98267f3..c92328f15 100644 --- a/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 +++ b/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 @@ -25,6 +25,13 @@ spec: - name: calico-policy-controller image: {{ calico_policy_image_repo }}:{{ calico_policy_image_tag }} imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ calico_policy_controller_cpu_limit }} + memory: {{ calico_policy_controller_memory_limit }} + requests: + cpu: {{ calico_policy_controller_cpu_requests }} + memory: {{ calico_policy_controller_memory_requests }} env: - name: ETCD_ENDPOINTS value: "{{ etcd_access_endpoint }}" diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml b/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml index a52329e50..41900ab33 100644 --- a/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml +++ b/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml @@ -23,3 +23,10 @@ spec: - name: REPORT_INTERVAL 
value: '{{ agent_report_interval }}' imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ netchecker_agent_cpu_limit }} + memory: {{ netchecker_agent_memory_limit }} + requests: + cpu: {{ netchecker_agent_cpu_requests }} + memory: {{ netchecker_agent_memory_requests }} diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml b/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml index 4fd03e80a..5a6a63f36 100644 --- a/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml +++ b/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml @@ -24,3 +24,10 @@ spec: - name: REPORT_INTERVAL value: '{{ agent_report_interval }}' imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ netchecker_agent_cpu_limit }} + memory: {{ netchecker_agent_memory_limit }} + requests: + cpu: {{ netchecker_agent_cpu_requests }} + memory: {{ netchecker_agent_memory_requests }} diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml b/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml index 6f242bc51..c1d8ddb9f 100644 --- a/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml +++ b/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml @@ -11,11 +11,25 @@ spec: image: "{{ server_img }}" env: imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ netchecker_server_cpu_limit }} + memory: {{ netchecker_server_memory_limit }} + requests: + cpu: {{ netchecker_server_cpu_requests }} + memory: {{ netchecker_server_memory_requests }} ports: - containerPort: 8081 hostPort: 8081 - name: kubectl-proxy image: "{{ kubectl_image }}" imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ netchecker_kubectl_cpu_limit }} + memory: {{ netchecker_kubectl_memory_limit }} + requests: + cpu: {{ netchecker_kubectl_cpu_requests }} + memory: {{ netchecker_kubectl_memory_requests }} args: 
- proxy diff --git a/roles/kubernetes/master/defaults/main.yml b/roles/kubernetes/master/defaults/main.yml index c1fbbb583..874925adf 100644 --- a/roles/kubernetes/master/defaults/main.yml +++ b/roles/kubernetes/master/defaults/main.yml @@ -13,4 +13,16 @@ kube_apiserver_node_port_range: "30000-32767" etcd_config_dir: /etc/ssl/etcd etcd_cert_dir: "{{ etcd_config_dir }}/ssl" - +# Limits for kube components +kube_controller_memory_limit: 512M +kube_controller_cpu_limit: 250m +kube_controller_memory_requests: 170M +kube_controller_cpu_requests: 100m +kube_scheduler_memory_limit: 512M +kube_scheduler_cpu_limit: 250m +kube_scheduler_memory_requests: 170M +kube_scheduler_cpu_requests: 100m +kube_apiserver_memory_limit: 2000M +kube_apiserver_cpu_limit: 800m +kube_apiserver_memory_requests: 256M +kube_apiserver_cpu_requests: 300m diff --git a/roles/kubernetes/master/tasks/main.yml b/roles/kubernetes/master/tasks/main.yml index e1b5cc5d2..f7b561578 100644 --- a/roles/kubernetes/master/tasks/main.yml +++ b/roles/kubernetes/master/tasks/main.yml @@ -3,7 +3,7 @@ tags: k8s-pre-upgrade - name: Copy kubectl from hyperkube container - command: "/usr/bin/docker run --rm -v {{ bin_dir }}:/systembindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /bin/cp /hyperkube /systembindir/kubectl" + command: "{{ docker_bin_dir }}/docker run --rm -v {{ bin_dir }}:/systembindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /bin/cp /hyperkube /systembindir/kubectl" register: kube_task_result until: kube_task_result.rc == 0 retries: 4 diff --git a/roles/kubernetes/master/templates/manifests/kube-apiserver.manifest.j2 b/roles/kubernetes/master/templates/manifests/kube-apiserver.manifest.j2 index 65505526d..c05030697 100644 --- a/roles/kubernetes/master/templates/manifests/kube-apiserver.manifest.j2 +++ b/roles/kubernetes/master/templates/manifests/kube-apiserver.manifest.j2 @@ -12,6 +12,13 @@ spec: - name: kube-apiserver image: {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} 
imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ kube_apiserver_cpu_limit }} + memory: {{ kube_apiserver_memory_limit }} + requests: + cpu: {{ kube_apiserver_cpu_requests }} + memory: {{ kube_apiserver_memory_requests }} command: - /hyperkube - apiserver diff --git a/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 b/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 index 8f7580eb5..49dd05ba8 100644 --- a/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 +++ b/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 @@ -11,6 +11,13 @@ spec: - name: kube-controller-manager image: {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ kube_controller_cpu_limit }} + memory: {{ kube_controller_memory_limit }} + requests: + cpu: {{ kube_controller_cpu_requests }} + memory: {{ kube_controller_memory_requests }} command: - /hyperkube - controller-manager diff --git a/roles/kubernetes/master/templates/manifests/kube-scheduler.manifest.j2 b/roles/kubernetes/master/templates/manifests/kube-scheduler.manifest.j2 index a2c4c134a..781e38d7b 100644 --- a/roles/kubernetes/master/templates/manifests/kube-scheduler.manifest.j2 +++ b/roles/kubernetes/master/templates/manifests/kube-scheduler.manifest.j2 @@ -11,6 +11,13 @@ spec: - name: kube-scheduler image: {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ kube_scheduler_cpu_limit }} + memory: {{ kube_scheduler_memory_limit }} + requests: + cpu: {{ kube_scheduler_cpu_requests }} + memory: {{ kube_scheduler_memory_requests }} command: - /hyperkube - scheduler diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml index b0f73e50d..99ed2bdae 100644 --- a/roles/kubernetes/node/defaults/main.yml +++ 
b/roles/kubernetes/node/defaults/main.yml @@ -9,6 +9,18 @@ kube_proxy_mode: iptables # If using the pure iptables proxy, SNAT everything kube_proxy_masquerade_all: true +# Limits for kube components and nginx load balancer app +kubelet_memory_limit: 512M +kubelet_cpu_limit: 100m +kube_proxy_memory_limit: 2000M +kube_proxy_cpu_limit: 500m +kube_proxy_memory_requests: 256M +kube_proxy_cpu_requests: 150m +nginx_memory_limit: 512M +nginx_cpu_limit: 300m +nginx_memory_requests: 64M +nginx_cpu_requests: 50m + # kube_api_runtime_config: # - extensions/v1beta1/daemonsets=true # - extensions/v1beta1/deployments=true diff --git a/roles/kubernetes/node/templates/deb-kubelet.initd.j2 b/roles/kubernetes/node/templates/deb-kubelet.initd.j2 index 6f349b8f2..194506e89 100644 --- a/roles/kubernetes/node/templates/deb-kubelet.initd.j2 +++ b/roles/kubernetes/node/templates/deb-kubelet.initd.j2 @@ -39,7 +39,7 @@ DAEMON_USER=root # do_start() { - /usr/bin/docker rm -f kubelet &>/dev/null || true + {{ docker_bin_dir }}/docker rm -f kubelet &>/dev/null || true sleep 1 # Return # 0 if daemon has been started diff --git a/roles/kubernetes/node/templates/kubelet-container.j2 b/roles/kubernetes/node/templates/kubelet-container.j2 index 7d4f536ab..c97c6f0de 100644 --- a/roles/kubernetes/node/templates/kubelet-container.j2 +++ b/roles/kubernetes/node/templates/kubelet-container.j2 @@ -1,5 +1,5 @@ #!/bin/bash -/usr/bin/docker run --privileged \ +{{ docker_bin_dir }}/docker run --privileged \ --net=host --pid=host --name=kubelet --restart=on-failure:5 \ -v /etc/cni:/etc/cni:ro \ -v /opt/cni:/opt/cni:ro \ @@ -9,6 +9,7 @@ -v {{ docker_daemon_graph }}:/var/lib/docker \ -v /var/run:/var/run \ -v /var/lib/kubelet:/var/lib/kubelet \ +--memory={{ kubelet_memory_limit|regex_replace('Mi', 'M') }} --cpu-shares={{ kubelet_cpu_limit|regex_replace('m', '') }} \ {{ hyperkube_image_repo }}:{{ hyperkube_image_tag}} \ nsenter --target=1 --mount --wd=. 
-- \ ./hyperkube kubelet \ diff --git a/roles/kubernetes/node/templates/kubelet.service.j2 b/roles/kubernetes/node/templates/kubelet.service.j2 index d8d5ec8a8..e3bf40878 100644 --- a/roles/kubernetes/node/templates/kubelet.service.j2 +++ b/roles/kubernetes/node/templates/kubelet.service.j2 @@ -23,8 +23,8 @@ ExecStart={{ bin_dir }}/kubelet \ $DOCKER_SOCKET \ $KUBELET_NETWORK_PLUGIN \ $KUBELET_CLOUDPROVIDER -ExecStartPre=-/usr/bin/docker rm -f kubelet -ExecReload=/usr/bin/docker restart kubelet +ExecStartPre=-{{ docker_bin_dir }}/docker rm -f kubelet +ExecReload={{ docker_bin_dir }}/docker restart kubelet Restart=always RestartSec=10s diff --git a/roles/kubernetes/node/templates/manifests/kube-proxy.manifest.j2 b/roles/kubernetes/node/templates/manifests/kube-proxy.manifest.j2 index 694ee1e36..a965ef792 100644 --- a/roles/kubernetes/node/templates/manifests/kube-proxy.manifest.j2 +++ b/roles/kubernetes/node/templates/manifests/kube-proxy.manifest.j2 @@ -11,6 +11,13 @@ spec: - name: kube-proxy image: {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ kube_proxy_cpu_limit }} + memory: {{ kube_proxy_memory_limit }} + requests: + cpu: {{ kube_proxy_cpu_requests }} + memory: {{ kube_proxy_memory_requests }} command: - /hyperkube - proxy diff --git a/roles/kubernetes/node/templates/manifests/nginx-proxy.manifest.j2 b/roles/kubernetes/node/templates/manifests/nginx-proxy.manifest.j2 index db15bd2b9..2d566cad1 100644 --- a/roles/kubernetes/node/templates/manifests/nginx-proxy.manifest.j2 +++ b/roles/kubernetes/node/templates/manifests/nginx-proxy.manifest.j2 @@ -11,6 +11,13 @@ spec: - name: nginx-proxy image: {{ nginx_image_repo }}:{{ nginx_image_tag }} imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ nginx_cpu_limit }} + memory: {{ nginx_memory_limit }} + requests: + cpu: {{ nginx_cpu_requests }} + memory: {{ nginx_memory_requests }} securityContext: privileged: 
true volumeMounts: diff --git a/roles/network_plugin/calico/defaults/main.yml b/roles/network_plugin/calico/defaults/main.yml index 391e7c53e..6718fdbc5 100644 --- a/roles/network_plugin/calico/defaults/main.yml +++ b/roles/network_plugin/calico/defaults/main.yml @@ -19,3 +19,17 @@ global_as_num: "64512" # not be specified in calico CNI config, so Calico will use built-in # defaults. The value should be a number, not a string. # calico_mtu: 1500 + +# Limits for apps +calico_rr_memory_limit: 1000M +calico_rr_cpu_limit: 300m +calico_rr_memory_requests: 500M +calico_rr_cpu_requests: 150m +calico_node_memory_limit: 500M +calico_node_cpu_limit: 300m +calico_node_memory_requests: 256M +calico_node_cpu_requests: 150m +calicoctl_memory_limit: 170M +calicoctl_cpu_limit: 100m +calicoctl_memory_requests: 70M +calicoctl_cpu_requests: 50m diff --git a/roles/network_plugin/calico/rr/templates/calico-rr.service.j2 b/roles/network_plugin/calico/rr/templates/calico-rr.service.j2 index 1a4b3e977..f6da04a4d 100644 --- a/roles/network_plugin/calico/rr/templates/calico-rr.service.j2 +++ b/roles/network_plugin/calico/rr/templates/calico-rr.service.j2 @@ -5,8 +5,8 @@ Requires=docker.service [Service] EnvironmentFile=/etc/calico/calico-rr.env -ExecStartPre=-/usr/bin/docker rm -f calico-rr -ExecStart=/usr/bin/docker run --net=host --privileged \ +ExecStartPre=-{{ docker_bin_dir }}/docker rm -f calico-rr +ExecStart={{ docker_bin_dir }}/docker run --net=host --privileged \ --name=calico-rr \ -e IP=${IP} \ -e IP6=${IP6} \ @@ -16,12 +16,13 @@ ExecStart=/usr/bin/docker run --net=host --privileged \ -e ETCD_KEY_FILE=${ETCD_KEY_FILE} \ -v /var/log/calico-rr:/var/log/calico \ -v {{ calico_cert_dir }}:{{ calico_cert_dir }}:ro \ + --memory={{ calico_rr_memory_limit|regex_replace('Mi', 'M') }} --cpu-shares={{ calico_rr_cpu_limit|regex_replace('m', '') }} \ {{ calico_rr_image_repo }}:{{ calico_rr_image_tag }} Restart=always RestartSec=10s -ExecStop=-/usr/bin/docker stop calico-rr +ExecStop=-{{ 
docker_bin_dir }}/docker stop calico-rr [Install] WantedBy=multi-user.target diff --git a/roles/network_plugin/calico/tasks/main.yml b/roles/network_plugin/calico/tasks/main.yml index 81979a910..ae6e0e4d4 100644 --- a/roles/network_plugin/calico/tasks/main.yml +++ b/roles/network_plugin/calico/tasks/main.yml @@ -41,7 +41,7 @@ notify: restart calico-node - name: Calico | Copy cni plugins from hyperkube - command: "/usr/bin/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /usr/bin/rsync -a /opt/cni/bin/ /cnibindir/" + command: "{{ docker_bin_dir }}/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /usr/bin/rsync -a /opt/cni/bin/ /cnibindir/" register: cni_task_result until: cni_task_result.rc == 0 retries: 4 @@ -50,7 +50,7 @@ tags: [hyperkube, upgrade] - name: Calico | Copy cni plugins from calico/cni container - command: "/usr/bin/docker run --rm -v /opt/cni/bin:/cnibindir {{ calico_cni_image_repo }}:{{ calico_cni_image_tag }} sh -c 'cp -a /opt/cni/bin/* /cnibindir/'" + command: "{{ docker_bin_dir }}/docker run --rm -v /opt/cni/bin:/cnibindir {{ calico_cni_image_repo }}:{{ calico_cni_image_tag }} sh -c 'cp -a /opt/cni/bin/* /cnibindir/'" register: cni_task_result until: cni_task_result.rc == 0 retries: 4 diff --git a/roles/network_plugin/calico/templates/calico-node.service.j2 b/roles/network_plugin/calico/templates/calico-node.service.j2 index dc041b054..dfe8c4a3d 100644 --- a/roles/network_plugin/calico/templates/calico-node.service.j2 +++ b/roles/network_plugin/calico/templates/calico-node.service.j2 @@ -5,8 +5,8 @@ Requires=docker.service [Service] EnvironmentFile=/etc/calico/calico.env -ExecStartPre=-/usr/bin/docker rm -f calico-node -ExecStart=/usr/bin/docker run --net=host --privileged \ +ExecStartPre=-{{ docker_bin_dir }}/docker rm -f calico-node +ExecStart={{ docker_bin_dir }}/docker run --net=host --privileged \ --name=calico-node \ -e HOSTNAME=${CALICO_HOSTNAME} \ 
-e IP=${CALICO_IP} \ @@ -24,12 +24,13 @@ ExecStart=/usr/bin/docker run --net=host --privileged \ -v /lib/modules:/lib/modules \ -v /var/run/calico:/var/run/calico \ -v {{ calico_cert_dir }}:{{ calico_cert_dir }}:ro \ + --memory={{ calico_node_memory_limit|regex_replace('Mi', 'M') }} --cpu-shares={{ calico_node_cpu_limit|regex_replace('m', '') }} \ {{ calico_node_image_repo }}:{{ calico_node_image_tag }} Restart=always RestartSec=10s -ExecStop=-/usr/bin/docker stop calico-node +ExecStop=-{{ docker_bin_dir }}/docker stop calico-node [Install] WantedBy=multi-user.target diff --git a/roles/network_plugin/calico/templates/calicoctl-container.j2 b/roles/network_plugin/calico/templates/calicoctl-container.j2 index 7be30928a..0ecfba0c1 100644 --- a/roles/network_plugin/calico/templates/calicoctl-container.j2 +++ b/roles/network_plugin/calico/templates/calicoctl-container.j2 @@ -1,13 +1,14 @@ #!/bin/bash -/usr/bin/docker run -i --privileged --rm \ +{{ docker_bin_dir }}/docker run -i --privileged --rm \ --net=host --pid=host \ -e ETCD_ENDPOINTS={{ etcd_access_endpoint }} \ -e ETCD_CA_CERT_FILE=/etc/calico/certs/ca_cert.crt \ -e ETCD_CERT_FILE=/etc/calico/certs/cert.crt \ -e ETCD_KEY_FILE=/etc/calico/certs/key.pem \ --v /usr/bin/docker:/usr/bin/docker \ +-v {{ docker_bin_dir }}/docker:{{ docker_bin_dir }}/docker \ -v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/calico:/var/run/calico \ -v /etc/calico/certs:/etc/calico/certs:ro \ +--memory={{ calicoctl_memory_limit|regex_replace('Mi', 'M') }} --cpu-shares={{ calicoctl_cpu_limit|regex_replace('m', '') }} \ {{ calicoctl_image_repo }}:{{ calicoctl_image_tag}} \ $@ diff --git a/roles/network_plugin/canal/defaults/main.yml b/roles/network_plugin/canal/defaults/main.yml index d67d593f5..f8482fb1a 100644 --- a/roles/network_plugin/canal/defaults/main.yml +++ b/roles/network_plugin/canal/defaults/main.yml @@ -13,3 +13,13 @@ canal_log_level: "info" # Etcd SSL dirs canal_cert_dir: /etc/canal/certs etcd_cert_dir: 
/etc/ssl/etcd/ssl + +# Limits for apps +calico_node_memory_limit: 500M +calico_node_cpu_limit: 200m +calico_node_memory_requests: 256M +calico_node_cpu_requests: 100m +flannel_memory_limit: 500M +flannel_cpu_limit: 200m +flannel_memory_requests: 256M +flannel_cpu_requests: 100m diff --git a/roles/network_plugin/canal/tasks/main.yml b/roles/network_plugin/canal/tasks/main.yml index fec09cb48..3d3b19bdc 100644 --- a/roles/network_plugin/canal/tasks/main.yml +++ b/roles/network_plugin/canal/tasks/main.yml @@ -43,7 +43,7 @@ dest: "{{kube_config_dir}}/canal-node.yaml" - name: Canal | Copy cni plugins from hyperkube - command: "/usr/bin/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /usr/bin/rsync -a /opt/cni/bin/ /cnibindir/" + command: "{{ docker_bin_dir }}/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /usr/bin/rsync -a /opt/cni/bin/ /cnibindir/" register: cni_task_result until: cni_task_result.rc == 0 retries: 4 @@ -52,7 +52,7 @@ tags: [hyperkube, upgrade] - name: Canal | Copy cni plugins from calico/cni - command: "/usr/bin/docker run --rm -v /opt/cni/bin:/cnibindir {{ calico_cni_image_repo }}:{{ calico_cni_image_tag }} sh -c 'cp -a /opt/cni/bin/* /cnibindir/'" + command: "{{ docker_bin_dir }}/docker run --rm -v /opt/cni/bin:/cnibindir {{ calico_cni_image_repo }}:{{ calico_cni_image_tag }} sh -c 'cp -a /opt/cni/bin/* /cnibindir/'" register: cni_task_result until: cni_task_result.rc == 0 retries: 4 diff --git a/roles/network_plugin/canal/templates/canal-node.yml.j2 b/roles/network_plugin/canal/templates/canal-node.yml.j2 index f73fae9bd..37baf06e0 100644 --- a/roles/network_plugin/canal/templates/canal-node.yml.j2 +++ b/roles/network_plugin/canal/templates/canal-node.yml.j2 @@ -49,6 +49,13 @@ spec: - name: flannel image: "{{ flannel_image_repo }}:{{ flannel_image_tag }}" imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ flannel_cpu_limit }} + 
memory: {{ flannel_memory_limit }} + requests: + cpu: {{ flannel_cpu_requests }} + memory: {{ flannel_memory_requests }} env: # Cluster name - name: CLUSTER_NAME @@ -119,6 +126,13 @@ spec: - name: calico-node image: "{{ calico_node_image_repo }}:{{ calico_node_image_tag }}" imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ calico_node_cpu_limit }} + memory: {{ calico_node_memory_limit }} + requests: + cpu: {{ calico_node_cpu_requests }} + memory: {{ calico_node_memory_requests }} env: # The location of the etcd cluster. - name: ETCD_ENDPOINTS diff --git a/roles/network_plugin/cloud/tasks/main.yml b/roles/network_plugin/cloud/tasks/main.yml index c8ae77830..346a57969 100644 --- a/roles/network_plugin/cloud/tasks/main.yml +++ b/roles/network_plugin/cloud/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Cloud | Copy cni plugins from hyperkube - command: "/usr/bin/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /bin/cp -r /opt/cni/bin/. /cnibindir/" + command: "{{ docker_bin_dir }}/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /bin/cp -r /opt/cni/bin/. 
/cnibindir/" register: cni_task_result until: cni_task_result.rc == 0 retries: 4 diff --git a/roles/network_plugin/flannel/defaults/main.yml b/roles/network_plugin/flannel/defaults/main.yml index ce00090ec..b6768f1bd 100644 --- a/roles/network_plugin/flannel/defaults/main.yml +++ b/roles/network_plugin/flannel/defaults/main.yml @@ -10,3 +10,9 @@ flannel_public_ip: "{{ access_ip|default(ip|default(ansible_default_ipv4.address # You can choose what type of flannel backend to use # please refer to flannel's docs : https://github.com/coreos/flannel/blob/master/README.md flannel_backend_type: "vxlan" + +# Limits for apps +flannel_memory_limit: 500M +flannel_cpu_limit: 300m +flannel_memory_requests: 256M +flannel_cpu_requests: 150m diff --git a/roles/network_plugin/flannel/handlers/main.yml b/roles/network_plugin/flannel/handlers/main.yml index 0f2734264..e393b6163 100644 --- a/roles/network_plugin/flannel/handlers/main.yml +++ b/roles/network_plugin/flannel/handlers/main.yml @@ -32,7 +32,7 @@ pause: seconds=10 prompt="Waiting for docker restart" - name: Flannel | wait for docker - command: /usr/bin/docker images + command: "{{ docker_bin_dir }}/docker images" register: docker_ready retries: 10 delay: 5 diff --git a/roles/network_plugin/flannel/templates/flannel-pod.yml b/roles/network_plugin/flannel/templates/flannel-pod.yml index 2edd9ada1..f9b76ce5f 100644 --- a/roles/network_plugin/flannel/templates/flannel-pod.yml +++ b/roles/network_plugin/flannel/templates/flannel-pod.yml @@ -19,6 +19,13 @@ - name: "flannel-container" image: "{{ flannel_image_repo }}:{{ flannel_image_tag }}" imagePullPolicy: {{ k8s_image_pull_policy }} + resources: + limits: + cpu: {{ flannel_cpu_limit }} + memory: {{ flannel_memory_limit }} + requests: + cpu: {{ flannel_cpu_requests }} + memory: {{ flannel_memory_requests }} command: - "/bin/sh" - "-c" @@ -26,9 +33,6 @@ ports: - hostPort: 10253 containerPort: 10253 - resources: - limits: - cpu: "100m" volumeMounts: - name: "subnetenv" mountPath: 
"/run/flannel" diff --git a/roles/network_plugin/weave/defaults/main.yml b/roles/network_plugin/weave/defaults/main.yml new file mode 100644 index 000000000..4aabcac6f --- /dev/null +++ b/roles/network_plugin/weave/defaults/main.yml @@ -0,0 +1,4 @@ +--- +# Limits +weave_memory_limit: 500M +weave_cpu_limit: 300m diff --git a/roles/network_plugin/weave/tasks/main.yml b/roles/network_plugin/weave/tasks/main.yml index e74c1c334..9609ea141 100644 --- a/roles/network_plugin/weave/tasks/main.yml +++ b/roles/network_plugin/weave/tasks/main.yml @@ -1,6 +1,6 @@ --- - name: Weave | Copy cni plugins from hyperkube - command: "/usr/bin/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /bin/cp -r /opt/cni/bin/. /cnibindir/" + command: "{{ docker_bin_dir }}/docker run --rm -v /opt/cni/bin:/cnibindir {{ hyperkube_image_repo }}:{{ hyperkube_image_tag }} /bin/cp -r /opt/cni/bin/. /cnibindir/" register: cni_task_result until: cni_task_result.rc == 0 retries: 4 diff --git a/roles/network_plugin/weave/templates/weave.j2 b/roles/network_plugin/weave/templates/weave.j2 index 865eb96a7..f1e92c25c 100644 --- a/roles/network_plugin/weave/templates/weave.j2 +++ b/roles/network_plugin/weave/templates/weave.j2 @@ -1,3 +1,4 @@ +WEAVE_DOCKER_ARGS="--memory={{ weave_memory_limit|regex_replace('Mi$', 'M') }} --cpu-shares={{ weave_cpu_limit|regex_replace('m$', '') }}" WEAVE_PEERS="{% for host in groups['k8s-cluster'] %}{{ hostvars[host]['access_ip'] | default(hostvars[host]['ip'] | default(hostvars[host]['ansible_default_ipv4']['address'])) }}{% if not loop.last %} {% endif %}{% endfor %}" WEAVEPROXY_ARGS="--rewrite-inspect --without-dns" WEAVE_SUBNET="--ipalloc-range {{ kube_pods_subnet }}" diff --git a/roles/network_plugin/weave/templates/weave.service.j2 b/roles/network_plugin/weave/templates/weave.service.j2 index 2df0cb989..6c2aad249 100644 --- a/roles/network_plugin/weave/templates/weave.service.j2 +++ 
b/roles/network_plugin/weave/templates/weave.service.j2 @@ -6,12 +6,13 @@ After=docker.service docker.socket [Service] EnvironmentFile=-/etc/weave.env -ExecStartPre=-/usr/bin/docker rm -f weave +ExecStartPre=-{{ docker_bin_dir }}/docker rm -f weave ExecStartPre={{ bin_dir }}/weave launch-router \ $WEAVE_SUBNET \ $WEAVE_PEERS -ExecStart=/usr/bin/docker attach weave +ExecStart={{ docker_bin_dir }}/docker attach weave ExecStop={{ bin_dir }}/weave stop +Restart=on-failure [Install] WantedBy=multi-user.target diff --git a/roles/network_plugin/weave/templates/weaveproxy.service.j2 b/roles/network_plugin/weave/templates/weaveproxy.service.j2 index 9b2a522ba..5b3f4f86d 100644 --- a/roles/network_plugin/weave/templates/weaveproxy.service.j2 +++ b/roles/network_plugin/weave/templates/weaveproxy.service.j2 @@ -7,11 +7,11 @@ After=docker.service docker.socket [Service] EnvironmentFile=-/etc/weave.%H.env EnvironmentFile=-/etc/weave.env -ExecStartPre=-/usr/bin/docker rm -f weaveproxy +ExecStartPre=-{{ docker_bin_dir }}/docker rm -f weaveproxy ExecStartPre={{ bin_dir }}/weave launch-proxy $WEAVEPROXY_ARGS -ExecStart=/usr/bin/docker attach weaveproxy +ExecStart={{ docker_bin_dir }}/docker attach weaveproxy Restart=on-failure -ExecStop=/opt/bin/weave stop-proxy +ExecStop={{ bin_dir }}/weave stop-proxy [Install] WantedBy=weave-network.target diff --git a/roles/reset/tasks/main.yml b/roles/reset/tasks/main.yml index 8678a8e44..217ce6729 100644 --- a/roles/reset/tasks/main.yml +++ b/roles/reset/tasks/main.yml @@ -21,7 +21,7 @@ when: ansible_service_mgr == "systemd" and services_removed.changed - name: reset | remove all containers - shell: docker ps -aq | xargs -r docker rm -fv + shell: "{{ docker_bin_dir }}/docker ps -aq | xargs -r {{ docker_bin_dir }}/docker rm -fv" - name: reset | gather mounted kubelet dirs shell: mount | grep /var/lib/kubelet | awk '{print $3}' | tac diff --git a/scripts/collect-info.yaml b/scripts/collect-info.yaml index 570c358f3..b258284ee 100644 --- a/scripts/collect-info.yaml 
+++ b/scripts/collect-info.yaml @@ -10,7 +10,7 @@ - name: kernel_info cmd: uname -r - name: docker_info - cmd: docker info + cmd: "{{ docker_bin_dir }}/docker info" - name: ip_info cmd: ip -4 -o a - name: route_info @@ -34,9 +34,11 @@ - name: weave_info cmd: weave report - name: weave_logs - cmd: docker logs weave + cmd: "{{ docker_bin_dir }}/docker logs weave" - name: kube_describe_all cmd: kubectl describe all --all-namespaces + - name: kube_describe_nodes + cmd: kubectl describe nodes - name: kubelet_logs cmd: journalctl -u kubelet --no-pager - name: kubedns_logs