From 91b02c057e2ee9025891c998ac133a60bd3236ba Mon Sep 17 00:00:00 2001
From: AtzeDeVries
Date: Thu, 13 Sep 2018 11:52:09 +0200
Subject: [PATCH] Add support for GPU accelerator

Add an optional "container engine accelerator" app role for NVIDIA GPUs.

* roles/kubernetes-apps/container_engine_accelerator is pulled in by
  kubernetes-apps when `nvidia_accelerator_enabled` is true and in turn
  depends on the new nvidia_gpu sub-role.
* The nvidia_gpu role loads OS specific vars (centos-7, ubuntu-16,
  ubuntu-18) that select the driver installer image, builds the driver
  download URL from `nvidia_gpu_flavor` (tesla or gtx) and
  `nvidia_driver_version`, and, on the first kube-master, renders and
  applies two DaemonSets in kube-system: a driver installer (privileged
  init container) and the NVIDIA device plugin. Both are restricted to
  nodes carrying the `nvidia.com/gpu` label.
* kubelet.standard.env.j2 adds the `nvidia.com/gpu=true` node label and
  registers the `nvidia.com/gpu=:NoSchedule` taint for every host listed
  in `nvidia_gpu_nodes`.
* The sample inventory documents the new variables (all commented out).

Both DaemonSets use apps/v1, so each declares an explicit spec.selector.
---
 .../group_vars/k8s-cluster/k8s-cluster.yml    | 14 +++-
 .../meta/main.yml                             |  8 ++
 .../nvidia_gpu/defaults/main.yml              | 10 +++
 .../nvidia_gpu/tasks/main.yml                 | 54 ++++++++++++
 .../k8s-device-plugin-nvidia-daemonset.yml.j2 | 61 ++++++++++++++
 .../nvidia-driver-install-daemonset.yml.j2    | 83 +++++++++++++++++++
 .../nvidia_gpu/vars/centos-7.yml              |  3 +
 .../nvidia_gpu/vars/ubuntu-16.yml             |  3 +
 .../nvidia_gpu/vars/ubuntu-18.yml             |  3 +
 roles/kubernetes-apps/meta/main.yml           |  6 ++
 .../node/templates/kubelet.standard.env.j2    | 13 +++
 11 files changed, 257 insertions(+), 1 deletion(-)
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/meta/main.yml
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/defaults/main.yml
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/tasks/main.yml
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/k8s-device-plugin-nvidia-daemonset.yml.j2
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/nvidia-driver-install-daemonset.yml.j2
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/centos-7.yml
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-16.yml
 create mode 100644 roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-18.yml

diff --git a/inventory/sample/group_vars/k8s-cluster/k8s-cluster.yml b/inventory/sample/group_vars/k8s-cluster/k8s-cluster.yml
index 6096f52a3..3746f3951 100644
--- a/inventory/sample/group_vars/k8s-cluster/k8s-cluster.yml
+++ b/inventory/sample/group_vars/k8s-cluster/k8s-cluster.yml
@@ -182,6 +182,18 @@ podsecuritypolicy_enabled: false
 ## See https://github.com/kubernetes-incubator/kubespray/issues/2141
 ## Set this variable to true to get rid of this issue
 volume_cross_zone_attachment: false
-
 # Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
 persistent_volumes_enabled: false
+
+## Container Engine Acceleration
+## Enable container acceleration feature, for example use gpu acceleration in containers
+# nvidia_accelerator_enabled: true
+## Nvidia GPU driver install. Install will be done by a (init) pod running as a daemonset.
+## Important: if you use Ubuntu then you should set in all.yml 'docker_storage_options: -s overlay2'
+## Array with nvidia_gpu_nodes, leave empty or comment if you don't want to install drivers.
+## Labels and taints won't be set to nodes if they are not in the array.
+# nvidia_gpu_nodes:
+#   - kube-gpu-001
+# nvidia_driver_version: "384.111"
+## flavor can be tesla or gtx
+# nvidia_gpu_flavor: gtx
diff --git a/roles/kubernetes-apps/container_engine_accelerator/meta/main.yml b/roles/kubernetes-apps/container_engine_accelerator/meta/main.yml
new file mode 100644
index 000000000..c82c5d86b
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/meta/main.yml
@@ -0,0 +1,8 @@
+---
+dependencies:
+  - role: kubernetes-apps/container_engine_accelerator/nvidia_gpu
+    when: nvidia_accelerator_enabled
+    tags:
+      - apps
+      - nvidia_gpu
+      - container_engine_accelerator
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/defaults/main.yml b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/defaults/main.yml
new file mode 100644
index 000000000..34aea1c47
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/defaults/main.yml
@@ -0,0 +1,10 @@
+---
+nvidia_accelerator_enabled: false
+nvidia_driver_version: "390.87"
+nvidia_gpu_tesla_base_url: https://us.download.nvidia.com/tesla/
+nvidia_gpu_gtx_base_url: https://us.download.nvidia.com/XFree86/Linux-x86_64/
+nvidia_gpu_flavor: tesla
+nvidia_url_end: "{{nvidia_driver_version}}/NVIDIA-Linux-x86_64-{{nvidia_driver_version}}.run"
+nvidia_driver_install_container: false
+nvidia_driver_install_supported: false
+nvidia_gpu_nodes: []
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/tasks/main.yml b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/tasks/main.yml
new file mode 100644
index 000000000..50822be7d
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/tasks/main.yml
@@ -0,0 +1,54 @@
+---
+
+- name: Container Engine Acceleration Nvidia GPU | gather os specific variables
+  include_vars: "{{ item }}"
+  with_first_found:
+    - files:
+        - "{{ ansible_distribution|lower }}-{{ ansible_distribution_version|lower|replace('/', '_') }}.yml"
+        - "{{ ansible_distribution|lower }}-{{ ansible_distribution_release }}.yml"
+        - "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version|lower|replace('/', '_') }}.yml"
+        - "{{ ansible_distribution|lower }}.yml"
+        - "{{ ansible_os_family|lower }}.yml"
+      skip: true
+
+- name: Container Engine Acceleration Nvidia GPU | Set fact of download url Tesla
+  set_fact:
+    nvidia_driver_download_url_default: "{{nvidia_gpu_tesla_base_url}}{{nvidia_url_end}}"
+  when: nvidia_gpu_flavor|lower == "tesla"
+
+- name: Container Engine Acceleration Nvidia GPU | Set fact of download url GTX
+  set_fact:
+    nvidia_driver_download_url_default: "{{nvidia_gpu_gtx_base_url}}{{nvidia_url_end}}"
+  when: nvidia_gpu_flavor|lower == "gtx"
+
+- name: Container Engine Acceleration Nvidia GPU | Create addon dir
+  file:
+    path: "{{ kube_config_dir }}/addons/container_engine_accelerator"
+    owner: root
+    group: root
+    mode: "0755"
+    recurse: true
+
+- name: Container Engine Acceleration Nvidia GPU | Create manifests for nvidia accelerators
+  template:
+    src: "{{ item.file }}.j2"
+    dest: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.file }}"
+  with_items:
+    - { name: nvidia-driver-install-daemonset, file: nvidia-driver-install-daemonset.yml, type: daemonset }
+    - { name: k8s-device-plugin-nvidia-daemonset, file: k8s-device-plugin-nvidia-daemonset.yml, type: daemonset }
+  register: container_engine_accelerator_manifests
+  when:
+    - inventory_hostname == groups['kube-master'][0] and nvidia_driver_install_container
+
+- name: Container Engine Acceleration Nvidia GPU | Apply manifests for nvidia accelerators
+  kube:
+    name: "{{ item.item.name }}"
+    namespace: "kube-system"
+    kubectl: "{{ bin_dir }}/kubectl"
+    resource: "{{ item.item.type }}"
+    filename: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.item.file }}"
+    state: "latest"
+  with_items:
+    - "{{container_engine_accelerator_manifests.results}}"
+  when:
+    - inventory_hostname == groups['kube-master'][0] and nvidia_driver_install_container and nvidia_driver_install_supported
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/k8s-device-plugin-nvidia-daemonset.yml.j2 b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/k8s-device-plugin-nvidia-daemonset.yml.j2
new file mode 100644
index 000000000..84f440442
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/k8s-device-plugin-nvidia-daemonset.yml.j2
@@ -0,0 +1,61 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-gpu-device-plugin
+  namespace: kube-system
+  labels:
+    k8s-app: nvidia-gpu-device-plugin
+    addonmanager.kubernetes.io/mode: Reconcile
+spec:
+  selector:
+    matchLabels:
+      k8s-app: nvidia-gpu-device-plugin
+  template:
+    metadata:
+      labels:
+        k8s-app: nvidia-gpu-device-plugin
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
+    spec:
+      priorityClassName: system-node-critical
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "nvidia.com/gpu"
+                operator: Exists
+      tolerations:
+      - operator: "Exists"
+        effect: "NoExecute"
+      - operator: "Exists"
+        effect: "NoSchedule"
+      hostNetwork: true
+      hostPID: true
+      volumes:
+      - name: device-plugin
+        hostPath:
+          path: /var/lib/kubelet/device-plugins
+      - name: dev
+        hostPath:
+          path: /dev
+      containers:
+      - image: "k8s.gcr.io/nvidia-gpu-device-plugin@sha256:0842734032018be107fa2490c98156992911e3e1f2a21e059ff0105b07dd8e9e"
+        command: ["/usr/bin/nvidia-gpu-device-plugin", "-logtostderr"]
+        name: nvidia-gpu-device-plugin
+        resources:
+          requests:
+            cpu: 50m
+            memory: 10Mi
+          limits:
+            cpu: 50m
+            memory: 10Mi
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: device-plugin
+          mountPath: /device-plugin
+        - name: dev
+          mountPath: /dev
+  updateStrategy:
+    type: RollingUpdate
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/nvidia-driver-install-daemonset.yml.j2 b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/nvidia-driver-install-daemonset.yml.j2
new file mode 100644
index 000000000..a1adede5a
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/templates/nvidia-driver-install-daemonset.yml.j2
@@ -0,0 +1,83 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-driver-installer
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-driver-installer
+  template:
+    metadata:
+      labels:
+        name: nvidia-driver-installer
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
+    spec:
+      priorityClassName: system-node-critical
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "nvidia.com/gpu"
+                operator: Exists
+      tolerations:
+      - key: "nvidia.com/gpu"
+        effect: "NoSchedule"
+        operator: "Exists"
+      hostNetwork: true
+      hostPID: true
+      volumes:
+      - name: dev
+        hostPath:
+          path: /dev
+      - name: nvidia-install-dir-host
+        hostPath:
+          path: /home/kubernetes/bin/nvidia
+      - name: root-mount
+        hostPath:
+          path: /
+      initContainers:
+      - image: "{{nvidia_driver_install_container}}"
+        name: nvidia-driver-installer
+        resources:
+          requests:
+            cpu: 0.15
+        securityContext:
+          privileged: true
+        env:
+          - name: NVIDIA_INSTALL_DIR_HOST
+            value: /home/kubernetes/bin/nvidia
+          - name: NVIDIA_INSTALL_DIR_CONTAINER
+            value: /usr/local/nvidia
+          - name: ROOT_MOUNT_DIR
+            value: /root
+          - name: NVIDIA_DRIVER_VERSION
+            value: "{{nvidia_driver_version}}"
+          - name: NVIDIA_DRIVER_DOWNLOAD_URL
+            value: "{{nvidia_driver_download_url_default}}"
+        volumeMounts:
+        - name: nvidia-install-dir-host
+          mountPath: /usr/local/nvidia
+        - name: dev
+          mountPath: /dev
+        - name: root-mount
+          mountPath: /root
+      containers:
+      - image: "gcr.io/google-containers/pause:2.0"
+        name: pause
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/centos-7.yml b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/centos-7.yml
new file mode 100644
index 000000000..5f6adfde7
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/centos-7.yml
@@ -0,0 +1,3 @@
+---
+nvidia_driver_install_container: atzedevries/nvidia-centos-driver-installer:2
+nvidia_driver_install_supported: true
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-16.yml b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-16.yml
new file mode 100644
index 000000000..04b9e0ac9
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-16.yml
@@ -0,0 +1,3 @@
+---
+nvidia_driver_install_container: gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc
+nvidia_driver_install_supported: true
diff --git a/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-18.yml b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-18.yml
new file mode 100644
index 000000000..04b9e0ac9
--- /dev/null
+++ b/roles/kubernetes-apps/container_engine_accelerator/nvidia_gpu/vars/ubuntu-18.yml
@@ -0,0 +1,3 @@
+---
+nvidia_driver_install_container: gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc
+nvidia_driver_install_supported: true
diff --git a/roles/kubernetes-apps/meta/main.yml b/roles/kubernetes-apps/meta/main.yml
index 4a2982f2a..b6f94d1f1 100644
--- a/roles/kubernetes-apps/meta/main.yml
+++ b/roles/kubernetes-apps/meta/main.yml
@@ -37,6 +37,12 @@ dependencies:
       - apps
       - persistent_volumes
 
+  - role: kubernetes-apps/container_engine_accelerator
+    when: nvidia_accelerator_enabled
+    tags:
+      - apps
+      - container_engine_accelerator
+
   - role: kubernetes-apps/cloud_controller/oci
     when:
       - cloud_provider is defined
diff --git a/roles/kubernetes/node/templates/kubelet.standard.env.j2 b/roles/kubernetes/node/templates/kubelet.standard.env.j2
index a1c514c8c..960492006 100644
--- a/roles/kubernetes/node/templates/kubelet.standard.env.j2
+++ b/roles/kubernetes/node/templates/kubelet.standard.env.j2
@@ -100,6 +100,11 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
 {% else %}
 {% set dummy = role_node_labels.append('node-role.kubernetes.io/node=true') %}
 {% endif %}
+{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool %}
+{% if inventory_hostname in nvidia_gpu_nodes %}
+{% set dummy = role_node_labels.append('nvidia.com/gpu=true') %}
+{% endif %}
+{% endif %}
 {% set inventory_node_labels = [] %}
 {% if node_labels is defined %}
 {% for labelname, labelvalue in node_labels.iteritems() %}
@@ -108,7 +113,15 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
 {% endif %}
 {% set all_node_labels = role_node_labels + inventory_node_labels %}
 
+{# Kubelet node taints for gpu #}
+{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool %}
+{% if inventory_hostname in nvidia_gpu_nodes %}
+{% set kubelet_args_kubeconfig %}{{ kubelet_args_kubeconfig }} --register-with-taints=nvidia.com/gpu=:NoSchedule{% endset %}
+{% endif %}
+{% endif %}
+
 KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_args_dns }} {{ kubelet_args_kubeconfig }} {{ kube_reserved }} --node-labels={{ all_node_labels | join(',') }} {% if kube_feature_gates %} --feature-gates={{ kube_feature_gates|join(',') }} {% endif %} {% if kubelet_custom_flags is string %} {{kubelet_custom_flags}} {% else %}{% for flag in kubelet_custom_flags %} {{flag}} {% endfor %}{% endif %}"
+
 {% if kube_network_plugin is defined and kube_network_plugin in ["calico", "canal", "flannel", "weave", "contiv", "cilium"] %}
 KUBELET_NETWORK_PLUGIN="--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
 {% elif kube_network_plugin is defined and kube_network_plugin == "weave" %}