Add support for GPU accelerator

This commit is contained in:
AtzeDeVries 2018-09-13 11:52:09 +02:00 committed by Antoine Legrand
parent 53a685dbf8
commit 91b02c057e
11 changed files with 256 additions and 1 deletions

View file

@ -182,6 +182,20 @@ podsecuritypolicy_enabled: false
## See https://github.com/kubernetes-incubator/kubespray/issues/2141
## Set this variable to true to get rid of this issue
volume_cross_zone_attachment: false
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
persistent_volumes_enabled: false
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
persistent_volumes_enabled: false
## Container Engine Acceleration
## Enable container acceleration feature, for example use gpu acceleration in containers
# nvidia_accelerator_enabled: true
## Nvidia GPU driver install. Install will be done by an (init) pod running as a daemonset.
## Important: if you use Ubuntu then you should set in all.yml 'docker_storage_options: -s overlay2'
## Array with nvidia_gpu_nodes, leave empty or comment out if you don't want to install drivers.
## Labels and taints won't be set to nodes if they are not in the array.
# nvidia_gpu_nodes:
# - kube-gpu-001
# nvidia_driver_version: "384.111"
## flavor can be tesla or gtx
# nvidia_gpu_flavor: gtx

View file

@ -0,0 +1,8 @@
---
# Role dependencies: pull in the Nvidia GPU sub-role, and only when GPU
# acceleration has been enabled.
dependencies:
  - role: kubernetes-apps/container_engine_accelerator/nvidia_gpu
    # `| bool` matches how the kubelet template tests this flag and keeps a
    # string value such as "false" from being evaluated as truthy.
    when: nvidia_accelerator_enabled | bool
    tags:
      - apps
      - nvidia_gpu
      - container_engine_accelerator

View file

@ -0,0 +1,10 @@
---
# Baseline values for the Nvidia GPU accelerator role.

# Gate used by the role dependencies and by the kubelet label/taint logic.
nvidia_accelerator_enabled: false

# Driver release; interpolated into nvidia_url_end below and exported to the
# installer container as NVIDIA_DRIVER_VERSION.
nvidia_driver_version: "390.87"

# Download locations per GPU flavor. Both use HTTPS: the downloaded installer
# is run by a privileged container, so it must not be fetched over plain HTTP.
nvidia_gpu_tesla_base_url: https://us.download.nvidia.com/tesla/
nvidia_gpu_gtx_base_url: https://us.download.nvidia.com/XFree86/Linux-x86_64/

# Selects which base URL is used: "tesla" or "gtx" (compared case-insensitively
# by the tasks).
nvidia_gpu_flavor: tesla

# Path appended to the selected base URL to form the full installer URL.
nvidia_url_end: "{{nvidia_driver_version}}/NVIDIA-Linux-x86_64-{{nvidia_driver_version}}.run"

# Installer image and support flag. Both stay false here; the manifest tasks
# only render/apply when they are set to truthy values elsewhere.
nvidia_driver_install_container: false
nvidia_driver_install_supported: false

# Inventory hostnames that should receive the GPU node label and taint.
nvidia_gpu_nodes: []

View file

@ -0,0 +1,54 @@
---
# Load distribution-specific variables. Candidate file names are tried from
# most to least specific; `skip: true` lets the play continue when none of
# them exists.
- name: Container Engine Acceleration Nvidia GPU| gather os specific variables
  include_vars: "{{ item }}"
  with_first_found:
    - files:
        - "{{ ansible_distribution|lower }}-{{ ansible_distribution_version|lower|replace('/', '_') }}.yml"
        - "{{ ansible_distribution|lower }}-{{ ansible_distribution_release }}.yml"
        - "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version|lower|replace('/', '_') }}.yml"
        - "{{ ansible_distribution|lower }}.yml"
        - "{{ ansible_os_family|lower }}.yml"
      skip: true

# At most one of the next two tasks runs, depending on nvidia_gpu_flavor
# (compared case-insensitively). The resulting fact is consumed by the driver
# installer manifest.
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url Tesla
  set_fact:
    nvidia_driver_download_url_default: "{{nvidia_gpu_tesla_base_url}}{{nvidia_url_end}}"
  when: nvidia_gpu_flavor|lower == "tesla"

- name: Container Engine Acceleration Nvidia GPU | Set fact of download url GTX
  set_fact:
    nvidia_driver_download_url_default: "{{nvidia_gpu_gtx_base_url}}{{nvidia_url_end}}"
  when: nvidia_gpu_flavor|lower == "gtx"

# `state: directory` is spelled out so the intent does not depend on the
# module's implicit default when `recurse` is set; the mode is quoted so it is
# passed as an octal string instead of a YAML-interpreted number.
- name: Container Engine Acceleration Nvidia GPU | Create addon dir
  file:
    path: "{{ kube_config_dir }}/addons/container_engine_accelerator"
    state: directory
    owner: root
    group: root
    mode: "0755"
    recurse: true

# Render both DaemonSet manifests on the first kube-master only, and only when
# an installer image is configured. The registered results carry each loop
# item so the apply task below can reuse name/file/type.
- name: Container Engine Acceleration Nvidia GPU | Create manifests for nvidia accelerators
  template:
    src: "{{ item.file }}.j2"
    dest: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.file }}"
  with_items:
    - name: nvidia-driver-install-daemonset
      file: nvidia-driver-install-daemonset.yml
      type: daemonset
    - name: k8s-device-plugin-nvidia-daemonset
      file: k8s-device-plugin-nvidia-daemonset.yml
      type: daemonset
  register: container_engine_accelerator_manifests
  when:
    - inventory_hostname == groups['kube-master'][0] and nvidia_driver_install_container

# Apply the rendered manifests. In addition to the conditions above, this
# requires that driver installation is marked as supported.
- name: Container Engine Acceleration Nvidia GPU | Apply manifests for nvidia accelerators
  kube:
    name: "{{ item.item.name }}"
    namespace: "kube-system"
    kubectl: "{{ bin_dir }}/kubectl"
    resource: "{{ item.item.type }}"
    filename: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.item.file }}"
    state: "latest"
  with_items:
    - "{{container_engine_accelerator_manifests.results}}"
  when:
    - inventory_hostname == groups['kube-master'][0] and nvidia_driver_install_container and nvidia_driver_install_supported

View file

@ -0,0 +1,61 @@
# DaemonSet running the Nvidia GPU device plugin on every node that carries
# the "nvidia.com/gpu" label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-gpu-device-plugin
  namespace: kube-system
  labels:
    k8s-app: nvidia-gpu-device-plugin
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: nvidia-gpu-device-plugin
  template:
    metadata:
      labels:
        k8s-app: nvidia-gpu-device-plugin
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      priorityClassName: system-node-critical
      # Schedule only onto nodes that have the GPU label (any value).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu
                    operator: Exists
      # Tolerate every NoExecute and NoSchedule taint, whatever its key.
      tolerations:
        - operator: Exists
          effect: NoExecute
        - operator: Exists
          effect: NoSchedule
      hostNetwork: true
      hostPID: true
      volumes:
        # Host directory mounted into the container at /device-plugin.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        # Host /dev, mounted at the same path inside the container.
        - name: dev
          hostPath:
            path: /dev
      containers:
        - name: nvidia-gpu-device-plugin
          image: "k8s.gcr.io/nvidia-gpu-device-plugin@sha256:0842734032018be107fa2490c98156992911e3e1f2a21e059ff0105b07dd8e9e"
          command:
            - /usr/bin/nvidia-gpu-device-plugin
            - -logtostderr
          # Requests equal limits for both cpu and memory.
          resources:
            requests:
              cpu: 50m
              memory: 10Mi
            limits:
              cpu: 50m
              memory: 10Mi
          securityContext:
            privileged: true
          volumeMounts:
            - name: device-plugin
              mountPath: /device-plugin
            - name: dev
              mountPath: /dev
  updateStrategy:
    type: RollingUpdate

View file

@ -0,0 +1,80 @@
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DaemonSet that installs the Nvidia driver on GPU-labelled nodes. The work is
# done by a privileged init container; once it has finished, only a pause
# container keeps the pod alive.
#
# apps/v1 is used (as in the device-plugin DaemonSet) instead of the removed
# extensions/v1beta1 API; apps/v1 makes spec.selector mandatory, so it is
# declared explicitly and matches the pod template labels.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-driver-installer
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      priorityClassName: system-node-critical
      # Schedule only onto nodes that have the GPU label (any value).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nvidia.com/gpu"
                    operator: Exists
      # Tolerate the NoSchedule taint keyed "nvidia.com/gpu".
      tolerations:
        - key: "nvidia.com/gpu"
          effect: "NoSchedule"
          operator: "Exists"
      hostNetwork: true
      hostPID: true
      volumes:
        - name: dev
          hostPath:
            path: /dev
        - name: nvidia-install-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia
        # The host root filesystem, mounted at /root in the init container.
        - name: root-mount
          hostPath:
            path: /
      initContainers:
        - image: "{{nvidia_driver_install_container}}"
          name: nvidia-driver-installer
          resources:
            requests:
              cpu: 0.15
          securityContext:
            privileged: true
          # The directory values below mirror the volume and mount paths
          # declared in this manifest.
          env:
            - name: NVIDIA_INSTALL_DIR_HOST
              value: /home/kubernetes/bin/nvidia
            - name: NVIDIA_INSTALL_DIR_CONTAINER
              value: /usr/local/nvidia
            - name: ROOT_MOUNT_DIR
              value: /root
            - name: NVIDIA_DRIVER_VERSION
              value: "{{nvidia_driver_version}}"
            - name: NVIDIA_DRIVER_DOWNLOAD_URL
              value: "{{nvidia_driver_download_url_default}}"
          volumeMounts:
            - name: nvidia-install-dir-host
              mountPath: /usr/local/nvidia
            - name: dev
              mountPath: /dev
            - name: root-mount
              mountPath: /root
      containers:
        - image: "gcr.io/google-containers/pause:2.0"
          name: pause

View file

@ -0,0 +1,3 @@
---
# Platform-specific overrides: marks driver installation as supported and
# names the installer image (a CentOS installer, going by the image name).
nvidia_driver_install_supported: true
nvidia_driver_install_container: "atzedevries/nvidia-centos-driver-installer:2"

View file

@ -0,0 +1,3 @@
---
# Platform-specific overrides: marks driver installation as supported and
# pins the Ubuntu installer image by digest.
nvidia_driver_install_supported: true
nvidia_driver_install_container: "gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc"

View file

@ -0,0 +1,3 @@
---
# Platform-specific overrides: marks driver installation as supported and
# pins the Ubuntu installer image by digest.
nvidia_driver_install_supported: true
nvidia_driver_install_container: "gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc"

View file

@ -37,6 +37,12 @@ dependencies:
- apps
- persistent_volumes
- role: kubernetes-apps/container_engine_accelerator
  # `| bool` matches how the kubelet template tests this flag and keeps a
  # string value such as "false" from being evaluated as truthy.
  when: nvidia_accelerator_enabled | bool
  tags:
    - apps
    - container_engine_accelerator
- role: kubernetes-apps/cloud_controller/oci
when:
- cloud_provider is defined

View file

@ -100,6 +100,11 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
{% else %}
{% set dummy = role_node_labels.append('node-role.kubernetes.io/node=true') %}
{% endif %}
{# Add the GPU node label when acceleration is enabled and this host is listed in nvidia_gpu_nodes #}
{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool and inventory_hostname in nvidia_gpu_nodes %}
{% set dummy = role_node_labels.append('nvidia.com/gpu=true') %}
{% endif %}
{% set inventory_node_labels = [] %}
{% if node_labels is defined %}
{% for labelname, labelvalue in node_labels.iteritems() %}
@ -108,7 +113,15 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
{% endif %}
{% set all_node_labels = role_node_labels + inventory_node_labels %}
{# Kubelet node taints for gpu: appended to kubelet_args_kubeconfig only for hosts listed in nvidia_gpu_nodes #}
{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool and inventory_hostname in nvidia_gpu_nodes %}
{% set kubelet_args_kubeconfig %}{{ kubelet_args_kubeconfig }} --register-with-taints=nvidia.com/gpu=:NoSchedule{% endset %}
{% endif %}
KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_args_dns }} {{ kubelet_args_kubeconfig }} {{ kube_reserved }} --node-labels={{ all_node_labels | join(',') }} {% if kube_feature_gates %} --feature-gates={{ kube_feature_gates|join(',') }} {% endif %} {% if kubelet_custom_flags is string %} {{kubelet_custom_flags}} {% else %}{% for flag in kubelet_custom_flags %} {{flag}} {% endfor %}{% endif %}"
{% if kube_network_plugin is defined and kube_network_plugin in ["calico", "canal", "flannel", "weave", "contiv", "cilium"] %}
KUBELET_NETWORK_PLUGIN="--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
{% elif kube_network_plugin is defined and kube_network_plugin == "weave" %}