Merge pull request #3304 from kubernetes-incubator/gpu2
Add support for GPU accelerator
This commit is contained in:
commit
a8a62afd74
11 changed files with 256 additions and 1 deletions
|
@ -174,6 +174,20 @@ podsecuritypolicy_enabled: false
|
||||||
## See https://github.com/kubernetes-incubator/kubespray/issues/2141
|
## See https://github.com/kubernetes-incubator/kubespray/issues/2141
|
||||||
## Set this variable to true to get rid of this issue
|
## Set this variable to true to get rid of this issue
|
||||||
volume_cross_zone_attachment: false
|
volume_cross_zone_attachment: false
|
||||||
|
|
||||||
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
|
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
|
||||||
persistent_volumes_enabled: false
|
persistent_volumes_enabled: false
|
||||||
|
# Add Persistent Volumes Storage Class for corresponding cloud provider ( OpenStack is only supported now )
|
||||||
|
persistent_volumes_enabled: false
|
||||||
|
|
||||||
|
## Container Engine Acceleration
|
||||||
|
## Enable the container acceleration feature, for example to use GPU acceleration in containers
|
||||||
|
# nvidia_accelerator_enabled: true
|
||||||
|
## Nvidia GPU driver install. The install will be done by an (init) pod running as a daemonset.
|
||||||
|
## Important: if you use Ubuntu then you should set in all.yml 'docker_storage_options: -s overlay2'
|
||||||
|
## Array with nvidia_gpu_nodes, leave empty or commented out if you don't want to install drivers.
|
||||||
|
## Labels and taints won't be set to nodes if they are not in the array.
|
||||||
|
# nvidia_gpu_nodes:
|
||||||
|
# - kube-gpu-001
|
||||||
|
# nvidia_driver_version: "384.111"
|
||||||
|
## flavor can be tesla or gtx
|
||||||
|
# nvidia_gpu_flavor: gtx
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
---
|
||||||
|
# Role dependencies: the Nvidia GPU sub-role is only pulled in when
# `nvidia_accelerator_enabled` evaluates to true.
dependencies:
  - role: kubernetes-apps/container_engine_accelerator/nvidia_gpu
    when: nvidia_accelerator_enabled
    tags: [apps, nvidia_gpu, container_engine_accelerator]
|
|
@ -0,0 +1,10 @@
|
||||||
|
---
|
||||||
|
# Master switch for this role; GPU support stays off unless enabled.
nvidia_accelerator_enabled: false

# Driver release. Interpolated into `nvidia_url_end` below.
nvidia_driver_version: "390.87"

# Base download locations, one per GPU flavor. Both are fetched over HTTPS:
# the downloaded .run installer is executed by a privileged init container,
# so it must not be retrievable over an unauthenticated channel.
nvidia_gpu_tesla_base_url: "https://us.download.nvidia.com/tesla/"
nvidia_gpu_gtx_base_url: "https://us.download.nvidia.com/XFree86/Linux-x86_64/"

# Selects which base URL is used: `tesla` or `gtx`.
nvidia_gpu_flavor: tesla

# Path suffix appended to the flavor-specific base URL.
nvidia_url_end: "{{ nvidia_driver_version }}/NVIDIA-Linux-x86_64-{{ nvidia_driver_version }}.run"

# Fallbacks used when no OS-specific vars file is loaded: no installer image
# and driver installation marked as unsupported.
nvidia_driver_install_container: false
nvidia_driver_install_supported: false

# Inventory hostnames of the GPU nodes; empty means no node is selected.
nvidia_gpu_nodes: []
|
|
@ -0,0 +1,54 @@
|
||||||
|
---
|
||||||
|
|
||||||
|
# Load the first vars file that matches the host, trying candidates from the
# most specific (distribution + version) to the least (OS family).
# `skip: true` makes the task a no-op when none of the files exists.
- name: Container Engine Acceleration Nvidia GPU| gather os specific variables
  include_vars: "{{ item }}"
  with_first_found:
    - skip: true
      files:
        - "{{ ansible_distribution | lower }}-{{ ansible_distribution_version | lower | replace('/', '_') }}.yml"
        - "{{ ansible_distribution | lower }}-{{ ansible_distribution_release }}.yml"
        - "{{ ansible_distribution | lower }}-{{ ansible_distribution_major_version | lower | replace('/', '_') }}.yml"
        - "{{ ansible_distribution | lower }}.yml"
        - "{{ ansible_os_family | lower }}.yml"
|
||||||
|
|
||||||
|
# Build the driver download URL from the Tesla base URL. Runs only when the
# configured flavor is "tesla" (compared case-insensitively).
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url Tesla
  when: nvidia_gpu_flavor | lower == "tesla"
  set_fact:
    nvidia_driver_download_url_default: "{{ nvidia_gpu_tesla_base_url }}{{ nvidia_url_end }}"
|
||||||
|
|
||||||
|
# Build the driver download URL from the GTX base URL. Runs only when the
# configured flavor is "gtx" (compared case-insensitively).
- name: Container Engine Acceleration Nvidia GPU | Set fact of download url GTX
  when: nvidia_gpu_flavor | lower == "gtx"
  set_fact:
    nvidia_driver_download_url_default: "{{ nvidia_gpu_gtx_base_url }}{{ nvidia_url_end }}"
|
||||||
|
|
||||||
|
# Ensure the directory that receives the rendered accelerator manifests exists.
# `state: directory` is spelled out because `recurse` is only valid for
# directories; relying on the module to infer the state hides that intent.
# The mode is quoted so it is passed as the literal octal string rather than
# being re-interpreted as a number by the YAML parser.
- name: Container Engine Acceleration Nvidia GPU | Create addon dir
  file:
    path: "{{ kube_config_dir }}/addons/container_engine_accelerator"
    state: directory
    owner: root
    group: root
    mode: "0755"
    recurse: true
|
||||||
|
|
||||||
|
# Render both daemonset templates into the addon directory. Only the first
# kube-master does this, and only when an installer image is configured.
# The registered results feed the apply task (name / file / type per item).
- name: Container Engine Acceleration Nvidia GPU | Create manifests for nvidia accelerators
  template:
    src: "{{ item.file }}.j2"
    dest: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.file }}"
  with_items:
    - name: nvidia-driver-install-daemonset
      file: nvidia-driver-install-daemonset.yml
      type: daemonset
    - name: k8s-device-plugin-nvidia-daemonset
      file: k8s-device-plugin-nvidia-daemonset.yml
      type: daemonset
  register: container_engine_accelerator_manifests
  when:
    - inventory_hostname == groups['kube-master'][0]
    - nvidia_driver_install_container
|
||||||
|
|
||||||
|
# Apply every manifest rendered by the previous task into kube-system.
# `item.item` is the original loop entry (name / file / type) carried inside
# each registered result. In addition to the render conditions, this step
# also requires that driver installation is marked as supported.
- name: Container Engine Acceleration Nvidia GPU | Apply manifests for nvidia accelerators
  kube:
    kubectl: "{{ bin_dir }}/kubectl"
    namespace: "kube-system"
    name: "{{ item.item.name }}"
    resource: "{{ item.item.type }}"
    filename: "{{ kube_config_dir }}/addons/container_engine_accelerator/{{ item.item.file }}"
    state: "latest"
  with_items: "{{ container_engine_accelerator_manifests.results }}"
  when:
    - inventory_hostname == groups['kube-master'][0]
    - nvidia_driver_install_container
    - nvidia_driver_install_supported
|
|
@ -0,0 +1,61 @@
|
||||||
|
# DaemonSet running the Nvidia GPU device plugin on every node that carries
# the `nvidia.com/gpu` label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-gpu-device-plugin
  namespace: kube-system
  labels:
    k8s-app: nvidia-gpu-device-plugin
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      k8s-app: nvidia-gpu-device-plugin
  template:
    metadata:
      labels:
        k8s-app: nvidia-gpu-device-plugin
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      priorityClassName: system-node-critical
      hostNetwork: true
      hostPID: true
      # Schedule only onto nodes that have the GPU label (any value).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nvidia.com/gpu"
                    operator: Exists
      # Tolerate every NoExecute and NoSchedule taint, whatever its key.
      tolerations:
        - effect: "NoExecute"
          operator: "Exists"
        - effect: "NoSchedule"
          operator: "Exists"
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: dev
          hostPath:
            path: /dev
      containers:
        - name: nvidia-gpu-device-plugin
          image: "k8s.gcr.io/nvidia-gpu-device-plugin@sha256:0842734032018be107fa2490c98156992911e3e1f2a21e059ff0105b07dd8e9e"
          command:
            - "/usr/bin/nvidia-gpu-device-plugin"
            - "-logtostderr"
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 50m
              memory: 10Mi
            limits:
              cpu: 50m
              memory: 10Mi
          volumeMounts:
            - name: device-plugin
              mountPath: /device-plugin
            - name: dev
              mountPath: /dev
|
|
@ -0,0 +1,80 @@
|
||||||
|
# Copyright 2017 Google Inc. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# DaemonSet that installs the Nvidia driver on GPU-labelled nodes. The work
# happens in a privileged init container; the long-running container is only
# a pause image that keeps the pod alive afterwards.
#
# Uses apps/v1 rather than the deprecated extensions/v1beta1 group. apps/v1
# requires an explicit `spec.selector`; it matches the pod template label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-driver-installer
  # extensions/v1beta1 defaulted to OnDelete while apps/v1 defaults to
  # RollingUpdate; pin OnDelete so the rollout behaviour stays as before.
  updateStrategy:
    type: OnDelete
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      priorityClassName: system-node-critical
      # Schedule only onto nodes that have the GPU label (any value).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nvidia.com/gpu"
                    operator: Exists
      # Tolerate the NoSchedule taint keyed `nvidia.com/gpu`.
      tolerations:
        - key: "nvidia.com/gpu"
          effect: "NoSchedule"
          operator: "Exists"
      hostNetwork: true
      hostPID: true
      volumes:
        - name: dev
          hostPath:
            path: /dev
        - name: nvidia-install-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia
        - name: root-mount
          hostPath:
            path: /
      initContainers:
        - image: "{{ nvidia_driver_install_container }}"
          name: nvidia-driver-installer
          resources:
            requests:
              cpu: 0.15
          securityContext:
            privileged: true
          env:
            # Host path and in-container mount point of the install directory
            # (same values as the nvidia-install-dir-host volume and its mount).
            - name: NVIDIA_INSTALL_DIR_HOST
              value: /home/kubernetes/bin/nvidia
            - name: NVIDIA_INSTALL_DIR_CONTAINER
              value: /usr/local/nvidia
            # Where the host root filesystem is mounted inside the container.
            - name: ROOT_MOUNT_DIR
              value: /root
            - name: NVIDIA_DRIVER_VERSION
              value: "{{ nvidia_driver_version }}"
            - name: NVIDIA_DRIVER_DOWNLOAD_URL
              value: "{{ nvidia_driver_download_url_default }}"
          volumeMounts:
            - name: nvidia-install-dir-host
              mountPath: /usr/local/nvidia
            - name: dev
              mountPath: /dev
            - name: root-mount
              mountPath: /root
      containers:
        - image: "gcr.io/google-containers/pause:2.0"
          name: pause
|
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
# Driver installer image for this OS, referenced by tag.
nvidia_driver_install_container: "atzedevries/nvidia-centos-driver-installer:2"
# Marks driver installation as supported so the manifests are applied.
nvidia_driver_install_supported: true
|
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
# Driver installer image for this OS, pinned by sha256 digest.
nvidia_driver_install_container: "gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc"
# Marks driver installation as supported so the manifests are applied.
nvidia_driver_install_supported: true
|
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
# Driver installer image for this OS, pinned by sha256 digest.
nvidia_driver_install_container: "gcr.io/google-containers/ubuntu-nvidia-driver-installer@sha256:eea7309dc4fa4a5c9d716157e74b90826e0a853aa26c7219db4710ddcd1ad8bc"
# Marks driver installation as supported so the manifests are applied.
nvidia_driver_install_supported: true
|
|
@ -37,6 +37,12 @@ dependencies:
|
||||||
- apps
|
- apps
|
||||||
- persistent_volumes
|
- persistent_volumes
|
||||||
|
|
||||||
|
- role: kubernetes-apps/container_engine_accelerator
|
||||||
|
when: nvidia_accelerator_enabled
|
||||||
|
tags:
|
||||||
|
- apps
|
||||||
|
- container_engine_accelerator
|
||||||
|
|
||||||
- role: kubernetes-apps/cloud_controller/oci
|
- role: kubernetes-apps/cloud_controller/oci
|
||||||
when:
|
when:
|
||||||
- cloud_provider is defined
|
- cloud_provider is defined
|
||||||
|
|
|
@ -100,6 +100,11 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
|
||||||
{% else %}
|
{% else %}
|
||||||
{% set dummy = role_node_labels.append('node-role.kubernetes.io/node=true') %}
|
{% set dummy = role_node_labels.append('node-role.kubernetes.io/node=true') %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool %}
|
||||||
|
{% if inventory_hostname in nvidia_gpu_nodes %}
|
||||||
|
{% set dummy = role_node_labels.append('nvidia.com/gpu=true') %}
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
{% set inventory_node_labels = [] %}
|
{% set inventory_node_labels = [] %}
|
||||||
{% if node_labels is defined %}
|
{% if node_labels is defined %}
|
||||||
{% for labelname, labelvalue in node_labels.iteritems() %}
|
{% for labelname, labelvalue in node_labels.iteritems() %}
|
||||||
|
@ -108,7 +113,15 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% set all_node_labels = role_node_labels + inventory_node_labels %}
|
{% set all_node_labels = role_node_labels + inventory_node_labels %}
|
||||||
|
|
||||||
|
{# Kubelet node taints for gpu #}
|
||||||
|
{% if nvidia_gpu_nodes is defined and nvidia_accelerator_enabled|bool %}
|
||||||
|
{% if inventory_hostname in nvidia_gpu_nodes %}
|
||||||
|
{% set kubelet_args_kubeconfig %}{{ kubelet_args_kubeconfig }} --register-with-taints=nvidia.com/gpu=:NoSchedule{% endset %}
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_args_dns }} {{ kubelet_args_kubeconfig }} {{ kube_reserved }} --node-labels={{ all_node_labels | join(',') }} {% if kube_feature_gates %} --feature-gates={{ kube_feature_gates|join(',') }} {% endif %} {% if kubelet_custom_flags is string %} {{kubelet_custom_flags}} {% else %}{% for flag in kubelet_custom_flags %} {{flag}} {% endfor %}{% endif %}{% if inventory_hostname in groups['kube-node'] %}{% if kubelet_node_custom_flags is string %} {{kubelet_node_custom_flags}} {% else %}{% for flag in kubelet_node_custom_flags %} {{flag}} {% endfor %}{% endif %}{% endif %}"
|
KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_args_dns }} {{ kubelet_args_kubeconfig }} {{ kube_reserved }} --node-labels={{ all_node_labels | join(',') }} {% if kube_feature_gates %} --feature-gates={{ kube_feature_gates|join(',') }} {% endif %} {% if kubelet_custom_flags is string %} {{kubelet_custom_flags}} {% else %}{% for flag in kubelet_custom_flags %} {{flag}} {% endfor %}{% endif %}{% if inventory_hostname in groups['kube-node'] %}{% if kubelet_node_custom_flags is string %} {{kubelet_node_custom_flags}} {% else %}{% for flag in kubelet_node_custom_flags %} {{flag}} {% endfor %}{% endif %}{% endif %}"
|
||||||
|
|
||||||
{% if kube_network_plugin is defined and kube_network_plugin in ["calico", "canal", "flannel", "weave", "contiv", "cilium"] %}
|
{% if kube_network_plugin is defined and kube_network_plugin in ["calico", "canal", "flannel", "weave", "contiv", "cilium"] %}
|
||||||
KUBELET_NETWORK_PLUGIN="--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
|
KUBELET_NETWORK_PLUGIN="--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
|
||||||
{% elif kube_network_plugin is defined and kube_network_plugin == "weave" %}
|
{% elif kube_network_plugin is defined and kube_network_plugin == "weave" %}
|
||||||
|
|
Loading…
Reference in a new issue