Merge pull request #3304 from kubernetes-incubator/gpu2
Add support for GPU accelerator
This commit is contained in:
commit
a8a62afd74
11 changed files with 256 additions and 1 deletions
|
@ -174,6 +174,20 @@ podsecuritypolicy_enabled: false
|
|||
## See https://github.com/kubernetes-incubator/kubespray/issues/2141
|
||||
## Set this variable to true to get rid of this issue
|
||||
volume_cross_zone_attachment: false
|
||||
|
||||
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
|
||||
persistent_volumes_enabled: false
|
||||
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
|
||||
persistent_volumes_enabled: false
|
||||
|
||||
## Container Engine Acceleration
|
||||
## Enable container accelertion feature, for example use gpu acceleration in containers
|
||||
# nvidia_accelerator_enabled: true
|
||||
## Nvidia GPU driver install. Install will by done by a (init) pod running as a daemonset.
|
||||
## Important: if you use Ubuntu then you should set in all.yml 'docker_storage_options: -s overlay2'
|
||||
## Array with nvida_gpu_nodes, leave empty or comment if you dont't want to install drivers.
|
||||
## Labels and taints won't be set to nodes if they are not in the array.
|
||||
# nvidia_gpu_nodes:
|
||||
# - kube-gpu-001
|
||||
# nvidia_driver_version: "384.111"
|
||||
## flavor can be tesla or gtx
|
||||
# nvidia_gpu_flavor: gtx
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
---
|
||||
dependencies:
|
||||
- role: kubernetes-apps/container_engine_accelerator/nvidia_gpu
|
||||
when: nvidia_accelerator_enabled
|
||||
tags:
|
||||
- apps
|
||||
- nvidia_gpu
|
||||
- container_engine_accelerator
|
|
@ -0,0 +1,10 @@
|
|||
---
|
||||
nvidia_accelerator_enabled: false
|
||||
nvidia_driver_version: "390.87"
|
||||
nvidia_gpu_tesla_base_url: https://us.download.nvidia.com/tesla/
|
||||
nvidia_gpu_gtx_base_url: http://us.download.nvidia.com/XFree86/Linux-x86_64/
|
||||
nvidia_gpu_flavor: tesla
|
||||
nvidia_url_end: "{{nvidia_driver_version}}/NVIDIA-Linux-x86_64-{{nvidia_driver_version}}.run"
|
||||
nvidia_driver_install_container: false
|
||||
nvidia_driver_install_supported: false
|
||||
nvidia_gpu_nodes: []
|
|
@ -0,0 +1,54 @@
|
|||
---
|
||||
|
||||
- name: Container Engine Acceleration Nvidia GPU| gather os specific variables
|
||||
include_vars: "{{ item }}"
|
||||
with_first_found:
|
||||
- files:
|
||||
- "{{ ansible_distribution|lower }}-{{ ansible_distribution_version|lower|replace('/', '_') }}.yml"
|
||||
- "{{ ansible_distribution|lower }}-{{ ansible_distribution_release }}.yml"
|
||||
- "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version|lower|replace('/', '_') }}.yml"
|
||||
- "{{ ansible_distribution|lower }}.yml"
|
||||
- "{{ ansible_os_family|lower }}.yml"
|
||||
skip: true
|
||||
|
||||
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url Tesla
|
||||
set_fact:
|
||||
nvidia_driver_download_url_default: "{{nvidia_gpu_tesla_base_url}}{{nvidia_url_end}}"
|
||||
when: nvidia_gpu_flavor|lower == "tesla"
|
||||
|
||||
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url GTX
|
||||
set_fact:
|
||||
nvidia_driver_download_url_default: "{{nvidia_gpu_gtx_base_url}}{{nvidia_url_end}}"
|
||||
when: nvidia_gpu_flavor|lower == "gtx"
|
||||
|
||||
- name: Container Engine Acceleration Nvidia GPU | Create addon dir
|
||||
file:
|
||||
path: "{{ kube_config_dir }}/addons/container_engine_accelerator"
|
||||
owner: root
|
||||
group: root
|
||||
mode: 0755
|
||||
recurse: true
|
||||
|
||||
- name: Container Engine Acceleration Nvidia GPU | Create manifests for nvidia accelerators
|
||||
template:
|
||||
src: "{{ item.file }}.j2"
|
||||
dest: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.file }}"
|
||||
with_items:
|
||||
- { name: nvidia-driver-install-daemonset, file: nvidia-driver-install-daemonset.yml, type: daemonset }
|
||||
- { name: k8s-device-plugin-nvidia-daemonset, file: k8s-device-plugin-nvidia-daemonset.yml, type: daemonset }
|
||||
register: container_engine_accelerator_manifests
|
||||
when:
|
||||
- inventory_hostname == groups['kube-master'][0] and nvidia_driver_install_container
|
||||
|
||||
- name: Container Engine Acceleration Nvidia GPU | Apply manifests for nvidia accelerators
|
||||
kube:
|
||||
name: "{{ item.item.name }}"
|
||||
namespace: "kube-system"
|
||||
kubectl: "{{ bin_dir }}/kubectl"
|
||||
resource: "{{ item.item.type }}"
|
||||
filename: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.item.file }}"
|
||||
state: "latest"
|
||||
with_items:
|
||||
- "{{container_engine_accelerator_manifests.results}}"
|
||||
when:
|
||||
- inventory_hostname == groups['kube-master'][0] and nvidia_driver_install_container and nvidia_driver_install_supported
|
|
@ -0,0 +1,61 @@
|
|||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: nvidia-gpu-device-plugin
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-app: nvidia-gpu-device-plugin
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: nvidia-gpu-device-plugin
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: nvidia-gpu-device-plugin
|
||||
annotations:
|
||||
scheduler.alpha.kubernetes.io/critical-pod: ''
|
||||
spec:
|
||||
priorityClassName: system-node-critical
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: "nvidia.com/gpu"
|
||||
operator: Exists
|
||||
tolerations:
|
||||
- operator: "Exists"
|
||||
effect: "NoExecute"
|
||||
- operator: "Exists"
|
||||
effect: "NoSchedule"
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: dev
|
||||
hostPath:
|
||||
path: /dev
|
||||
containers:
|
||||
- image: "k8s.gcr.io/nvidia-gpu-device-plugin@sha256:0842734032018be107fa2490c98156992911e3e1f2a21e059ff0105b07dd8e9e"
|
||||
command: ["/usr/bin/nvidia-gpu-device-plugin", "-logtostderr"]
|
||||
name: nvidia-gpu-device-plugin
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 10Mi
|
||||
limits:
|
||||
cpu: 50m
|
||||
memory: 10Mi
|
||||
securityContext:
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /device-plugin
|
||||
- name: dev
|
||||
mountPath: /dev
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
|
@ -0,0 +1,80 @@
|
|||
# Copyright 2017 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: nvidia-driver-installer
|
||||
namespace: kube-system
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: nvidia-driver-installer
|
||||
annotations:
|
||||
scheduler.alpha.kubernetes.io/critical-pod: ''
|
||||
spec:
|
||||
priorityClassName: system-node-critical
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: "nvidia.com/gpu"
|
||||
operator: Exists
|
||||
tolerations:
|
||||
- key: "nvidia.com/gpu"
|
||||
effect: "NoSchedule"
|
||||
operator: "Exists"
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
volumes:
|
||||
- name: dev
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: nvidia-install-dir-host
|
||||
hostPath:
|
||||
path: /home/kubernetes/bin/nvidia
|
||||
- name: root-mount
|
||||
hostPath:
|
||||
path: /
|
||||
initContainers:
|
||||
- image: "{{nvidia_driver_install_container}}"
|
||||
name: nvidia-driver-installer
|
||||
resources:
|
||||
requests:
|
||||
cpu: 0.15
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NVIDIA_INSTALL_DIR_HOST
|
||||
value: /home/kubernetes/bin/nvidia
|
||||
- name: NVIDIA_INSTALL_DIR_CONTAINER
|
||||
value: /usr/local/nvidia
|
||||
- name: ROOT_MOUNT_DIR
|
||||
value: /root
|
||||
- name: NVIDIA_DRIVER_VERSION
|
||||
value: "{{nvidia_driver_version}}"
|
||||
- name: NVIDIA_DRIVER_DOWNLOAD_URL
|
||||
value: "{{nvidia_driver_download_url_default}}"
|
||||
volumeMounts:
|
||||
- name: nvidia-install-dir-host
|
||||
mountPath: /usr/local/nvidia
|
||||
- name: dev
|
||||
mountPath: /dev
|
||||
- name: root-mount
|
||||
mountPath: /root
|
||||
containers:
|
||||
- image: "gcr.io/google-containers/pause:2.0"
|
||||
name: pause
|
|
@ -0,0 +1,3 @@
|
|||
---
|
||||
nvidia_driver_install_container: atzedevries/nvidia-centos-driver-installer:2
|
||||
nvidia_driver_install_supported: true
|
|
@ -0,0 +1,3 @@
|
|||
---
|
||||
nvidia_driver_install_container: gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc
|
||||
nvidia_driver_install_supported: true
|
|
@ -0,0 +1,3 @@
|
|||
---
|
||||
nvidia_driver_install_container: gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc
|
||||
nvidia_driver_install_supported: true
|
|
@ -37,6 +37,12 @@ dependencies:
|
|||
- apps
|
||||
- persistent_volumes
|
||||
|
||||
- role: kubernetes-apps/container_engine_accelerator
|
||||
when: nvidia_accelerator_enabled
|
||||
tags:
|
||||
- apps
|
||||
- container_engine_accelerator
|
||||
|
||||
- role: kubernetes-apps/cloud_controller/oci
|
||||
when:
|
||||
- cloud_provider is defined
|
||||
|
|
|
@ -100,6 +100,11 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
|
|||
{% else %}
|
||||
{% set dummy = role_node_labels.append('node-role.kubernetes.io/node=true') %}
|
||||
{% endif %}
|
||||
{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool %}
|
||||
{% if inventory_hostname in nvidia_gpu_nodes %}
|
||||
{% set dummy = role_node_labels.append('nvidia.com/gpu=true') %}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% set inventory_node_labels = [] %}
|
||||
{% if node_labels is defined %}
|
||||
{% for labelname, labelvalue in node_labels.iteritems() %}
|
||||
|
@ -108,7 +113,15 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
|
|||
{% endif %}
|
||||
{% set all_node_labels = role_node_labels + inventory_node_labels %}
|
||||
|
||||
{# Kubelet node taints for gpu #}
|
||||
{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool %}
|
||||
{% if inventory_hostname in nvidia_gpu_nodes %}
|
||||
{% set kubelet_args_kubeconfig %}{{ kubelet_args_kubeconfig }} --register-with-taints=nvidia.com/gpu=:NoSchedule{% endset %}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_args_dns }} {{ kubelet_args_kubeconfig }} {{ kube_reserved }} --node-labels={{ all_node_labels | join(',') }} {% if kube_feature_gates %} --feature-gates={{ kube_feature_gates|join(',') }} {% endif %} {% if kubelet_custom_flags is string %} {{kubelet_custom_flags}} {% else %}{% for flag in kubelet_custom_flags %} {{flag}} {% endfor %}{% endif %}{% if inventory_hostname in groups['kube-node'] %}{% if kubelet_node_custom_flags is string %} {{kubelet_node_custom_flags}} {% else %}{% for flag in kubelet_node_custom_flags %} {{flag}} {% endfor %}{% endif %}{% endif %}"
|
||||
|
||||
{% if kube_network_plugin is defined and kube_network_plugin in ["calico", "canal", "flannel", "weave", "contiv", "cilium"] %}
|
||||
KUBELET_NETWORK_PLUGIN="--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
|
||||
{% elif kube_network_plugin is defined and kube_network_plugin == "weave" %}
|
||||
|
|
Loading…
Reference in a new issue