mirror of https://github.com/kubernetes/kops.git
Support GPU in OpenStack
This commit is contained in:
parent
b79157ff43
commit
28caf02878
46
docs/gpu.md
46
docs/gpu.md
|
@ -49,4 +49,48 @@ spec:
|
|||
role: Node
|
||||
subnets:
|
||||
- eu-central-1c
|
||||
```
|
||||
```
|
||||
|
||||
## GPUs in OpenStack
|
||||
|
||||
OpenStack does not support enabling the containerd configuration at the cluster level. It needs to be done in the instance group:
|
||||
|
||||
```yaml
|
||||
apiVersion: kops.k8s.io/v1alpha2
|
||||
kind: InstanceGroup
|
||||
metadata:
|
||||
labels:
|
||||
kops.k8s.io/cluster: <cluster name>
|
||||
name: gpu-nodes
|
||||
spec:
|
||||
image: 099720109477/ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20200907
|
||||
nodeLabels:
|
||||
kops.k8s.io/instancegroup: gpu-nodes
|
||||
machineType: g4dn.xlarge
|
||||
maxSize: 1
|
||||
minSize: 1
|
||||
role: Node
|
||||
subnets:
|
||||
- eu-central-1c
|
||||
containerd:
|
||||
nvidiaGPU:
|
||||
enabled: true
|
||||
```
|
||||
|
||||
## Verifying GPUs
|
||||
|
||||
1. After the new GPU nodes come up, you should see them in `kubectl get nodes`.
|
||||
2. The nodes should have the `kops.k8s.io/gpu` label and the `nvidia.com/gpu:NoSchedule` taint.
|
||||
3. The `kube-system` namespace should have an `nvidia-device-plugin-daemonset` pod provisioned on the GPU node(s).
|
||||
4. If you see `nvidia.com/gpu` in the output of `kubectl describe node <node>`, everything should work.
|
||||
|
||||
```
|
||||
Capacity:
|
||||
cpu: 4
|
||||
ephemeral-storage: 9983232Ki
|
||||
hugepages-1Gi: 0
|
||||
hugepages-2Mi: 0
|
||||
memory: 32796292Ki
|
||||
nvidia.com/gpu: 1 <- this one
|
||||
pods: 110
|
||||
```
|
||||
|
|
|
@ -101,6 +101,72 @@ spec:
|
|||
description: CompressUserData compresses parts of the user data to
|
||||
save space
|
||||
type: boolean
|
||||
containerd:
|
||||
description: Containerd specifies override configuration for instance
|
||||
group
|
||||
properties:
|
||||
address:
|
||||
description: Address of containerd's GRPC server (default "/run/containerd/containerd.sock").
|
||||
type: string
|
||||
configOverride:
|
||||
description: ConfigOverride is the complete containerd config
|
||||
file provided by the user.
|
||||
type: string
|
||||
logLevel:
|
||||
description: LogLevel controls the logging details [trace, debug,
|
||||
info, warn, error, fatal, panic] (default "info").
|
||||
type: string
|
||||
nvidiaGPU:
|
||||
description: NvidiaGPU configures the Nvidia GPU runtime.
|
||||
properties:
|
||||
enabled:
|
||||
description: Enabled determines if kOps will install the Nvidia
|
||||
GPU runtime and drivers. They will only be installed on
|
||||
instances that have an Nvidia GPU.
|
||||
type: boolean
|
||||
package:
|
||||
description: Package is the name of the nvidia driver package
|
||||
that will be installed. Default is "nvidia-headless-460-server".
|
||||
type: string
|
||||
type: object
|
||||
packages:
|
||||
description: Packages overrides the URL and hash for the packages.
|
||||
properties:
|
||||
hashAmd64:
|
||||
description: HashAmd64 overrides the hash for the AMD64 package.
|
||||
type: string
|
||||
hashArm64:
|
||||
description: HashArm64 overrides the hash for the ARM64 package.
|
||||
type: string
|
||||
urlAmd64:
|
||||
description: UrlAmd64 overrides the URL for the AMD64 package.
|
||||
type: string
|
||||
urlArm64:
|
||||
description: UrlArm64 overrides the URL for the ARM64 package.
|
||||
type: string
|
||||
type: object
|
||||
registryMirrors:
|
||||
additionalProperties:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
description: RegistryMirrors is list of image registries
|
||||
type: object
|
||||
root:
|
||||
description: Root directory for persistent data (default "/var/lib/containerd").
|
||||
type: string
|
||||
skipInstall:
|
||||
description: SkipInstall prevents kOps from installing and modifying
|
||||
containerd in any way (default "false").
|
||||
type: boolean
|
||||
state:
|
||||
description: State directory for execution state files (default
|
||||
"/run/containerd").
|
||||
type: string
|
||||
version:
|
||||
description: Version used to pick the containerd package.
|
||||
type: string
|
||||
type: object
|
||||
cpuCredits:
|
||||
description: CPUCredits is the credit option for CPU Usage on burstable
|
||||
instance types (AWS only)
|
||||
|
|
|
@ -190,6 +190,8 @@ type InstanceGroupSpec struct {
|
|||
UpdatePolicy *string `json:"updatePolicy,omitempty"`
|
||||
// WarmPool specifies a pool of pre-warmed instances for later use (AWS only).
|
||||
WarmPool *WarmPoolSpec `json:"warmPool,omitempty"`
|
||||
// Containerd specifies override configuration for instance group
|
||||
Containerd *ContainerdConfig `json:"containerd,omitempty"`
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
|
@ -156,6 +156,8 @@ type InstanceGroupSpec struct {
|
|||
UpdatePolicy *string `json:"updatePolicy,omitempty"`
|
||||
// WarmPool configures an ASG warm pool for the instance group
|
||||
WarmPool *WarmPoolSpec `json:"warmPool,omitempty"`
|
||||
// Containerd specifies override configuration for instance group
|
||||
Containerd *ContainerdConfig `json:"containerd,omitempty"`
|
||||
}
|
||||
|
||||
// InstanceMetadataOptions defines the EC2 instance metadata service options (AWS Only)
|
||||
|
|
|
@ -4402,6 +4402,15 @@ func autoConvert_v1alpha2_InstanceGroupSpec_To_kops_InstanceGroupSpec(in *Instan
|
|||
} else {
|
||||
out.WarmPool = nil
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(kops.ContainerdConfig)
|
||||
if err := Convert_v1alpha2_ContainerdConfig_To_kops_ContainerdConfig(*in, *out, s); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
out.Containerd = nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -4564,6 +4573,15 @@ func autoConvert_kops_InstanceGroupSpec_To_v1alpha2_InstanceGroupSpec(in *kops.I
|
|||
} else {
|
||||
out.WarmPool = nil
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(ContainerdConfig)
|
||||
if err := Convert_kops_ContainerdConfig_To_v1alpha2_ContainerdConfig(*in, *out, s); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
out.Containerd = nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -2467,6 +2467,11 @@ func (in *InstanceGroupSpec) DeepCopyInto(out *InstanceGroupSpec) {
|
|||
*out = new(WarmPoolSpec)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(ContainerdConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
|
|
@ -153,6 +153,8 @@ type InstanceGroupSpec struct {
|
|||
UpdatePolicy *string `json:"updatePolicy,omitempty"`
|
||||
// WarmPool configures an ASG warm pool for the instance group
|
||||
WarmPool *WarmPoolSpec `json:"warmPool,omitempty"`
|
||||
// Containerd specifies override configuration for instance group
|
||||
Containerd *ContainerdConfig `json:"containerd,omitempty"`
|
||||
}
|
||||
|
||||
// InstanceMetadataOptions defines the EC2 instance metadata service options (AWS Only)
|
||||
|
|
|
@ -4535,6 +4535,15 @@ func autoConvert_v1alpha3_InstanceGroupSpec_To_kops_InstanceGroupSpec(in *Instan
|
|||
} else {
|
||||
out.WarmPool = nil
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(kops.ContainerdConfig)
|
||||
if err := Convert_v1alpha3_ContainerdConfig_To_kops_ContainerdConfig(*in, *out, s); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
out.Containerd = nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -4697,6 +4706,15 @@ func autoConvert_kops_InstanceGroupSpec_To_v1alpha3_InstanceGroupSpec(in *kops.I
|
|||
} else {
|
||||
out.WarmPool = nil
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(ContainerdConfig)
|
||||
if err := Convert_kops_ContainerdConfig_To_v1alpha3_ContainerdConfig(*in, *out, s); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
out.Containerd = nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -2478,6 +2478,11 @@ func (in *InstanceGroupSpec) DeepCopyInto(out *InstanceGroupSpec) {
|
|||
*out = new(WarmPoolSpec)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(ContainerdConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
|
|
@ -247,6 +247,10 @@ func CrossValidateInstanceGroup(g *kops.InstanceGroup, cluster *kops.Cluster, cl
|
|||
}
|
||||
}
|
||||
|
||||
if g.Spec.Containerd != nil {
|
||||
allErrs = append(allErrs, validateContainerdConfig(&cluster.Spec, g.Spec.Containerd, field.NewPath("spec", "containerd"), false)...)
|
||||
}
|
||||
|
||||
{
|
||||
warmPool := cluster.Spec.WarmPool.ResolveDefaults(g)
|
||||
if warmPool.MaxSize == nil || *warmPool.MaxSize != 0 {
|
||||
|
|
|
@ -227,7 +227,7 @@ func validateClusterSpec(spec *kops.ClusterSpec, c *kops.Cluster, fieldPath *fie
|
|||
}
|
||||
|
||||
if spec.Containerd != nil {
|
||||
allErrs = append(allErrs, validateContainerdConfig(spec, spec.Containerd, fieldPath.Child("containerd"))...)
|
||||
allErrs = append(allErrs, validateContainerdConfig(spec, spec.Containerd, fieldPath.Child("containerd"), true)...)
|
||||
}
|
||||
|
||||
if spec.Docker != nil {
|
||||
|
@ -1371,7 +1371,7 @@ func validateContainerRuntime(c *kops.Cluster, runtime string, fldPath *field.Pa
|
|||
return allErrs
|
||||
}
|
||||
|
||||
func validateContainerdConfig(spec *kops.ClusterSpec, config *kops.ContainerdConfig, fldPath *field.Path) field.ErrorList {
|
||||
func validateContainerdConfig(spec *kops.ClusterSpec, config *kops.ContainerdConfig, fldPath *field.Path, inClusterConfig bool) field.ErrorList {
|
||||
allErrs := field.ErrorList{}
|
||||
|
||||
if config.Version != nil {
|
||||
|
@ -1429,7 +1429,7 @@ func validateContainerdConfig(spec *kops.ClusterSpec, config *kops.ContainerdCon
|
|||
}
|
||||
|
||||
if config.NvidiaGPU != nil {
|
||||
allErrs = append(allErrs, validateNvidiaConfig(spec, config.NvidiaGPU, fldPath.Child("nvidia"))...)
|
||||
allErrs = append(allErrs, validateNvidiaConfig(spec, config.NvidiaGPU, fldPath.Child("nvidia"), inClusterConfig)...)
|
||||
}
|
||||
|
||||
return allErrs
|
||||
|
@ -1506,16 +1506,19 @@ func validateDockerConfig(config *kops.DockerConfig, fldPath *field.Path) field.
|
|||
return allErrs
|
||||
}
|
||||
|
||||
func validateNvidiaConfig(spec *kops.ClusterSpec, nvidia *kops.NvidiaGPUConfig, fldPath *field.Path) (allErrs field.ErrorList) {
|
||||
func validateNvidiaConfig(spec *kops.ClusterSpec, nvidia *kops.NvidiaGPUConfig, fldPath *field.Path, inClusterConfig bool) (allErrs field.ErrorList) {
|
||||
if !fi.BoolValue(nvidia.Enabled) {
|
||||
return allErrs
|
||||
}
|
||||
if spec.GetCloudProvider() != kops.CloudProviderAWS {
|
||||
allErrs = append(allErrs, field.Forbidden(fldPath, "Nvidia is only supported on AWS"))
|
||||
if spec.GetCloudProvider() != kops.CloudProviderAWS && spec.GetCloudProvider() != kops.CloudProviderOpenstack {
|
||||
allErrs = append(allErrs, field.Forbidden(fldPath, "Nvidia is only supported on AWS and OpenStack"))
|
||||
}
|
||||
if spec.ContainerRuntime != "" && spec.ContainerRuntime != "containerd" {
|
||||
allErrs = append(allErrs, field.Forbidden(fldPath, "Nvidia is only supported using containerd"))
|
||||
}
|
||||
if spec.GetCloudProvider() == kops.CloudProviderOpenstack && inClusterConfig {
|
||||
allErrs = append(allErrs, field.Forbidden(fldPath, "OpenStack supports nvidia configuration only in instance group"))
|
||||
}
|
||||
return allErrs
|
||||
}
|
||||
|
||||
|
|
|
@ -1359,7 +1359,7 @@ func TestValidateSAExternalPermissions(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func Test_Validate_Nvdia(t *testing.T) {
|
||||
func Test_Validate_Nvidia_Cluster(t *testing.T) {
|
||||
grid := []struct {
|
||||
Input kops.ClusterSpec
|
||||
ExpectedErrors []string
|
||||
|
@ -1377,6 +1377,86 @@ func Test_Validate_Nvdia(t *testing.T) {
|
|||
ContainerRuntime: "containerd",
|
||||
},
|
||||
},
|
||||
{
|
||||
Input: kops.ClusterSpec{
|
||||
Containerd: &kops.ContainerdConfig{
|
||||
NvidiaGPU: &kops.NvidiaGPUConfig{
|
||||
Enabled: fi.Bool(true),
|
||||
},
|
||||
},
|
||||
CloudProvider: kops.CloudProviderSpec{
|
||||
Openstack: &kops.OpenstackSpec{},
|
||||
},
|
||||
ContainerRuntime: "containerd",
|
||||
},
|
||||
ExpectedErrors: []string{"Forbidden::containerd.nvidiaGPU"},
|
||||
},
|
||||
{
|
||||
Input: kops.ClusterSpec{
|
||||
Containerd: &kops.ContainerdConfig{
|
||||
NvidiaGPU: &kops.NvidiaGPUConfig{
|
||||
Enabled: fi.Bool(true),
|
||||
},
|
||||
},
|
||||
CloudProvider: kops.CloudProviderSpec{
|
||||
GCE: &kops.GCESpec{},
|
||||
},
|
||||
ContainerRuntime: "containerd",
|
||||
},
|
||||
ExpectedErrors: []string{"Forbidden::containerd.nvidiaGPU"},
|
||||
},
|
||||
{
|
||||
Input: kops.ClusterSpec{
|
||||
Containerd: &kops.ContainerdConfig{
|
||||
NvidiaGPU: &kops.NvidiaGPUConfig{
|
||||
Enabled: fi.Bool(true),
|
||||
},
|
||||
},
|
||||
CloudProvider: kops.CloudProviderSpec{
|
||||
AWS: &kops.AWSSpec{},
|
||||
},
|
||||
ContainerRuntime: "docker",
|
||||
},
|
||||
ExpectedErrors: []string{"Forbidden::containerd.nvidiaGPU"},
|
||||
},
|
||||
}
|
||||
for _, g := range grid {
|
||||
errs := validateNvidiaConfig(&g.Input, g.Input.Containerd.NvidiaGPU, field.NewPath("containerd", "nvidiaGPU"), true)
|
||||
testErrors(t, g.Input, errs, g.ExpectedErrors)
|
||||
}
|
||||
}
|
||||
|
||||
func Test_Validate_Nvidia_Ig(t *testing.T) {
|
||||
grid := []struct {
|
||||
Input kops.ClusterSpec
|
||||
ExpectedErrors []string
|
||||
}{
|
||||
{
|
||||
Input: kops.ClusterSpec{
|
||||
Containerd: &kops.ContainerdConfig{
|
||||
NvidiaGPU: &kops.NvidiaGPUConfig{
|
||||
Enabled: fi.Bool(true),
|
||||
},
|
||||
},
|
||||
CloudProvider: kops.CloudProviderSpec{
|
||||
AWS: &kops.AWSSpec{},
|
||||
},
|
||||
ContainerRuntime: "containerd",
|
||||
},
|
||||
},
|
||||
{
|
||||
Input: kops.ClusterSpec{
|
||||
Containerd: &kops.ContainerdConfig{
|
||||
NvidiaGPU: &kops.NvidiaGPUConfig{
|
||||
Enabled: fi.Bool(true),
|
||||
},
|
||||
},
|
||||
CloudProvider: kops.CloudProviderSpec{
|
||||
Openstack: &kops.OpenstackSpec{},
|
||||
},
|
||||
ContainerRuntime: "containerd",
|
||||
},
|
||||
},
|
||||
{
|
||||
Input: kops.ClusterSpec{
|
||||
Containerd: &kops.ContainerdConfig{
|
||||
|
@ -1407,7 +1487,7 @@ func Test_Validate_Nvdia(t *testing.T) {
|
|||
},
|
||||
}
|
||||
for _, g := range grid {
|
||||
errs := validateNvidiaConfig(&g.Input, g.Input.Containerd.NvidiaGPU, field.NewPath("containerd", "nvidiaGPU"))
|
||||
errs := validateNvidiaConfig(&g.Input, g.Input.Containerd.NvidiaGPU, field.NewPath("containerd", "nvidiaGPU"), false)
|
||||
testErrors(t, g.Input, errs, g.ExpectedErrors)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2641,6 +2641,11 @@ func (in *InstanceGroupSpec) DeepCopyInto(out *InstanceGroupSpec) {
|
|||
*out = new(WarmPoolSpec)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.Containerd != nil {
|
||||
in, out := &in.Containerd, &out.Containerd
|
||||
*out = new(ContainerdConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
|
|
@ -70,6 +70,7 @@ import (
|
|||
"k8s.io/kops/util/pkg/architectures"
|
||||
"k8s.io/kops/util/pkg/hashing"
|
||||
"k8s.io/kops/util/pkg/mirrors"
|
||||
"k8s.io/kops/util/pkg/reflectutils"
|
||||
"k8s.io/kops/util/pkg/vfs"
|
||||
)
|
||||
|
||||
|
@ -1397,14 +1398,26 @@ func (n *nodeUpConfigBuilder) BuildConfig(ig *kops.InstanceGroup, apiserverAddit
|
|||
config.Channels = n.channels
|
||||
config.EtcdManifests = n.etcdManifests[role]
|
||||
|
||||
if cluster.Spec.ContainerRuntime == "containerd" {
|
||||
config.ContainerdConfig = cluster.Spec.Containerd
|
||||
if ig.Spec.Containerd != nil || cluster.Spec.ContainerRuntime == "containerd" {
|
||||
config.ContainerdConfig = n.buildContainerdConfig(ig)
|
||||
}
|
||||
|
||||
if cluster.Spec.Containerd != nil && cluster.Spec.Containerd.NvidiaGPU != nil {
|
||||
config.NvidiaGPU = cluster.Spec.Containerd.NvidiaGPU
|
||||
}
|
||||
|
||||
if ig.Spec.Containerd != nil && ig.Spec.Containerd.NvidiaGPU != nil {
|
||||
if config.NvidiaGPU == nil {
|
||||
config.NvidiaGPU = ig.Spec.Containerd.NvidiaGPU
|
||||
} else {
|
||||
reflectutils.JSONMergeStruct(&config.NvidiaGPU, ig.Spec.Containerd.NvidiaGPU)
|
||||
}
|
||||
}
|
||||
|
||||
if config.NvidiaGPU != nil && config.NvidiaGPU.DriverPackage == "" {
|
||||
config.NvidiaGPU.DriverPackage = "nvidia-headless-460-server"
|
||||
}
|
||||
|
||||
if ig.Spec.WarmPool != nil || cluster.Spec.WarmPool != nil {
|
||||
config.WarmPoolImages = n.buildWarmPoolImages(ig)
|
||||
}
|
||||
|
@ -1431,6 +1444,15 @@ func loadCertificates(keysets map[string]*fi.Keyset, name string, config *nodeup
|
|||
return nil
|
||||
}
|
||||
|
||||
// buildContainerdConfig builds the containerd configuration for the instances of the given instance group.
// It starts from a deep copy of the cluster-level containerd configuration and, when the instance group
// defines its own containerd configuration, merges that on top so instance group settings override
// cluster settings.
// NOTE(review): n.cluster.Spec.Containerd may be nil here; this relies on DeepCopy and JSONMergeStruct
// tolerating a nil value - confirm against their implementations.
|
||||
func (n *nodeUpConfigBuilder) buildContainerdConfig(ig *kops.InstanceGroup) *kops.ContainerdConfig {
|
||||
// Work on a copy so the merge below does not write into the cluster spec.
config := n.cluster.Spec.Containerd.DeepCopy()
|
||||
if ig.Spec.Containerd != nil {
|
||||
// Overlay the instance group configuration onto the copied cluster configuration.
reflectutils.JSONMergeStruct(&config, ig.Spec.Containerd)
|
||||
}
|
||||
return config
|
||||
}
|
||||
|
||||
// buildWarmPoolImages returns a list of container images that should be pre-pulled during instance pre-initialization
|
||||
func (n *nodeUpConfigBuilder) buildWarmPoolImages(ig *kops.InstanceGroup) []string {
|
||||
if ig == nil || ig.Spec.Role == kops.InstanceGroupRoleMaster {
|
||||
|
|
|
@ -658,8 +658,15 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*Addon
|
|||
}
|
||||
|
||||
nvidia := b.Cluster.Spec.Containerd.NvidiaGPU
|
||||
igNvidia := false
|
||||
for _, ig := range b.KopsModelContext.InstanceGroups {
|
||||
if ig.Spec.Containerd != nil && ig.Spec.Containerd.NvidiaGPU != nil && fi.BoolValue(ig.Spec.Containerd.NvidiaGPU.Enabled) {
|
||||
igNvidia = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if nvidia != nil && fi.BoolValue(nvidia.Enabled) {
|
||||
if nvidia != nil && fi.BoolValue(nvidia.Enabled) || igNvidia {
|
||||
|
||||
key := "nvidia.addons.k8s.io"
|
||||
|
||||
|
|
|
@ -175,29 +175,45 @@ func PopulateInstanceGroupSpec(cluster *kops.Cluster, input *kops.InstanceGroup,
|
|||
return nil, fmt.Errorf("unable to infer any Subnets for InstanceGroup %s ", ig.ObjectMeta.Name)
|
||||
}
|
||||
|
||||
hasGPU := false
|
||||
clusterNvidia := false
|
||||
if cluster.Spec.Containerd != nil && cluster.Spec.Containerd.NvidiaGPU != nil && fi.BoolValue(cluster.Spec.Containerd.NvidiaGPU.Enabled) {
|
||||
switch cluster.Spec.GetCloudProvider() {
|
||||
case kops.CloudProviderAWS:
|
||||
clusterNvidia = true
|
||||
}
|
||||
igNvidia := false
|
||||
if ig.Spec.Containerd != nil && ig.Spec.Containerd.NvidiaGPU != nil && fi.BoolValue(ig.Spec.Containerd.NvidiaGPU.Enabled) {
|
||||
igNvidia = true
|
||||
}
|
||||
|
||||
switch cluster.Spec.GetCloudProvider() {
|
||||
case kops.CloudProviderAWS:
|
||||
if clusterNvidia || igNvidia {
|
||||
mt, err := awsup.GetMachineTypeInfo(cloud.(awsup.AWSCloud), ig.Spec.MachineType)
|
||||
if err != nil {
|
||||
return ig, fmt.Errorf("error looking up machine type info: %v", err)
|
||||
}
|
||||
if mt.GPU {
|
||||
if ig.Spec.NodeLabels == nil {
|
||||
ig.Spec.NodeLabels = make(map[string]string)
|
||||
}
|
||||
ig.Spec.NodeLabels["kops.k8s.io/gpu"] = "1"
|
||||
hasNvidiaTaint := false
|
||||
for _, taint := range ig.Spec.Taints {
|
||||
if strings.HasPrefix(taint, "nvidia.com/gpu") {
|
||||
hasNvidiaTaint = true
|
||||
}
|
||||
}
|
||||
if !hasNvidiaTaint {
|
||||
ig.Spec.Taints = append(ig.Spec.Taints, "nvidia.com/gpu:NoSchedule")
|
||||
}
|
||||
hasGPU = mt.GPU
|
||||
}
|
||||
case kops.CloudProviderOpenstack:
|
||||
if igNvidia {
|
||||
hasGPU = true
|
||||
}
|
||||
}
|
||||
|
||||
if hasGPU {
|
||||
if ig.Spec.NodeLabels == nil {
|
||||
ig.Spec.NodeLabels = make(map[string]string)
|
||||
}
|
||||
ig.Spec.NodeLabels["kops.k8s.io/gpu"] = "1"
|
||||
hasNvidiaTaint := false
|
||||
for _, taint := range ig.Spec.Taints {
|
||||
if strings.HasPrefix(taint, "nvidia.com/gpu") {
|
||||
hasNvidiaTaint = true
|
||||
}
|
||||
}
|
||||
if !hasNvidiaTaint {
|
||||
ig.Spec.Taints = append(ig.Spec.Taints, "nvidia.com/gpu:NoSchedule")
|
||||
}
|
||||
}
|
||||
|
||||
if ig.Spec.Manager == "" {
|
||||
|
|
|
@ -301,7 +301,12 @@ func (c *NodeUpCommand) Run(out io.Writer) error {
|
|||
modelContext.GPUVendor = architectures.GPUVendorNvidia
|
||||
}
|
||||
}
|
||||
|
||||
} else if cloudProvider == api.CloudProviderOpenstack {
|
||||
// On OpenStack, NvidiaGPU can only be enabled at the instance group level, so when it is enabled we assume that the instance has a GPU.
|
||||
if nodeupConfig.NvidiaGPU != nil && fi.BoolValue(nodeupConfig.NvidiaGPU.Enabled) {
|
||||
klog.Info("instance supports GPU acceleration")
|
||||
modelContext.GPUVendor = architectures.GPUVendorNvidia
|
||||
}
|
||||
}
|
||||
|
||||
if err := loadKernelModules(modelContext); err != nil {
|
||||
|
|
Loading…
Reference in New Issue