diff --git a/docs/gpu.md b/docs/gpu.md index 1cc1fa7da9..049607a63a 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -49,4 +49,48 @@ spec: role: Node subnets: - eu-central-1c -``` \ No newline at end of file +``` + +## GPUs in OpenStack + +On OpenStack, the Nvidia GPU runtime cannot be enabled at the cluster level. It must be enabled in the `containerd` section of each GPU instance group: + +```yaml +apiVersion: kops.k8s.io/v1alpha2 +kind: InstanceGroup +metadata: + labels: + kops.k8s.io/cluster: <cluster name> + name: gpu-nodes +spec: + image: 099720109477/ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20200907 + nodeLabels: + kops.k8s.io/instancegroup: gpu-nodes + machineType: g4dn.xlarge + maxSize: 1 + minSize: 1 + role: Node + subnets: + - eu-central-1c + containerd: + nvidiaGPU: + enabled: true +``` + +## Verifying GPUs + +1. After the new GPU nodes have come up, you should see them in `kubectl get nodes`. +2. The nodes should have the `kops.k8s.io/gpu` label and the `nvidia.com/gpu:NoSchedule` taint. +3. The `kube-system` namespace should have an `nvidia-device-plugin-daemonset` pod provisioned to the GPU node(s). +4. If you see `nvidia.com/gpu` under `Capacity` in the output of `kubectl describe node`, everything should work. + +``` +Capacity: + cpu: 4 + ephemeral-storage: 9983232Ki + hugepages-1Gi: 0 + hugepages-2Mi: 0 + memory: 32796292Ki + nvidia.com/gpu: 1 <- this one + pods: 110 +``` diff --git a/k8s/crds/kops.k8s.io_instancegroups.yaml b/k8s/crds/kops.k8s.io_instancegroups.yaml index 4a0b0c4155..93a74a1a46 100644 --- a/k8s/crds/kops.k8s.io_instancegroups.yaml +++ b/k8s/crds/kops.k8s.io_instancegroups.yaml @@ -101,6 +101,72 @@ spec: description: CompressUserData compresses parts of the user data to save space type: boolean + containerd: + description: Containerd specifies override configuration for instance + group + properties: + address: + description: Address of containerd's GRPC server (default "/run/containerd/containerd.sock"). + type: string + configOverride: + description: ConfigOverride is the complete containerd config + file provided by the user. 
+ type: string + logLevel: + description: LogLevel controls the logging details [trace, debug, + info, warn, error, fatal, panic] (default "info"). + type: string + nvidiaGPU: + description: NvidiaGPU configures the Nvidia GPU runtime. + properties: + enabled: + description: Enabled determines if kOps will install the Nvidia + GPU runtime and drivers. They will only be installed on + intances that has an Nvidia GPU. + type: boolean + package: + description: Package is the name of the nvidia driver package + that will be installed. Default is "nvidia-headless-460-server". + type: string + type: object + packages: + description: Packages overrides the URL and hash for the packages. + properties: + hashAmd64: + description: HashAmd64 overrides the hash for the AMD64 package. + type: string + hashArm64: + description: HashArm64 overrides the hash for the ARM64 package. + type: string + urlAmd64: + description: UrlAmd64 overrides the URL for the AMD64 package. + type: string + urlArm64: + description: UrlArm64 overrides the URL for the ARM64 package. + type: string + type: object + registryMirrors: + additionalProperties: + items: + type: string + type: array + description: RegistryMirrors is list of image registries + type: object + root: + description: Root directory for persistent data (default "/var/lib/containerd"). + type: string + skipInstall: + description: SkipInstall prevents kOps from installing and modifying + containerd in any way (default "false"). + type: boolean + state: + description: State directory for execution state files (default + "/run/containerd"). + type: string + version: + description: Version used to pick the containerd package. 
+ type: string + type: object cpuCredits: description: CPUCredits is the credit option for CPU Usage on burstable instance types (AWS only) diff --git a/pkg/apis/kops/instancegroup.go b/pkg/apis/kops/instancegroup.go index 31a931cc9a..08e6459e02 100644 --- a/pkg/apis/kops/instancegroup.go +++ b/pkg/apis/kops/instancegroup.go @@ -190,6 +190,8 @@ type InstanceGroupSpec struct { UpdatePolicy *string `json:"updatePolicy,omitempty"` // WarmPool specifies a pool of pre-warmed instances for later use (AWS only). WarmPool *WarmPoolSpec `json:"warmPool,omitempty"` + // Containerd specifies override configuration for instance group + Containerd *ContainerdConfig `json:"containerd,omitempty"` } const ( diff --git a/pkg/apis/kops/v1alpha2/instancegroup.go b/pkg/apis/kops/v1alpha2/instancegroup.go index f6022aef49..0e45fc2473 100644 --- a/pkg/apis/kops/v1alpha2/instancegroup.go +++ b/pkg/apis/kops/v1alpha2/instancegroup.go @@ -156,6 +156,8 @@ type InstanceGroupSpec struct { UpdatePolicy *string `json:"updatePolicy,omitempty"` // WarmPool configures an ASG warm pool for the instance group WarmPool *WarmPoolSpec `json:"warmPool,omitempty"` + // Containerd specifies override configuration for instance group + Containerd *ContainerdConfig `json:"containerd,omitempty"` } // InstanceMetadataOptions defines the EC2 instance metadata service options (AWS Only) diff --git a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go index adba64f89b..f30e47e90a 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go @@ -4402,6 +4402,15 @@ func autoConvert_v1alpha2_InstanceGroupSpec_To_kops_InstanceGroupSpec(in *Instan } else { out.WarmPool = nil } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(kops.ContainerdConfig) + if err := Convert_v1alpha2_ContainerdConfig_To_kops_ContainerdConfig(*in, *out, s); err != nil { + return err + } + } else { + 
out.Containerd = nil + } return nil } @@ -4564,6 +4573,15 @@ func autoConvert_kops_InstanceGroupSpec_To_v1alpha2_InstanceGroupSpec(in *kops.I } else { out.WarmPool = nil } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(ContainerdConfig) + if err := Convert_kops_ContainerdConfig_To_v1alpha2_ContainerdConfig(*in, *out, s); err != nil { + return err + } + } else { + out.Containerd = nil + } return nil } diff --git a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go index fc155ca4b7..0e202696fb 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go @@ -2467,6 +2467,11 @@ func (in *InstanceGroupSpec) DeepCopyInto(out *InstanceGroupSpec) { *out = new(WarmPoolSpec) (*in).DeepCopyInto(*out) } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(ContainerdConfig) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/apis/kops/v1alpha3/instancegroup.go b/pkg/apis/kops/v1alpha3/instancegroup.go index 0b08fb6649..f3fcb304d9 100644 --- a/pkg/apis/kops/v1alpha3/instancegroup.go +++ b/pkg/apis/kops/v1alpha3/instancegroup.go @@ -153,6 +153,8 @@ type InstanceGroupSpec struct { UpdatePolicy *string `json:"updatePolicy,omitempty"` // WarmPool configures an ASG warm pool for the instance group WarmPool *WarmPoolSpec `json:"warmPool,omitempty"` + // Containerd specifies override configuration for instance group + Containerd *ContainerdConfig `json:"containerd,omitempty"` } // InstanceMetadataOptions defines the EC2 instance metadata service options (AWS Only) diff --git a/pkg/apis/kops/v1alpha3/zz_generated.conversion.go b/pkg/apis/kops/v1alpha3/zz_generated.conversion.go index a3652e8be3..11b2e10fe4 100644 --- a/pkg/apis/kops/v1alpha3/zz_generated.conversion.go +++ b/pkg/apis/kops/v1alpha3/zz_generated.conversion.go @@ -4535,6 +4535,15 @@ func 
autoConvert_v1alpha3_InstanceGroupSpec_To_kops_InstanceGroupSpec(in *Instan } else { out.WarmPool = nil } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(kops.ContainerdConfig) + if err := Convert_v1alpha3_ContainerdConfig_To_kops_ContainerdConfig(*in, *out, s); err != nil { + return err + } + } else { + out.Containerd = nil + } return nil } @@ -4697,6 +4706,15 @@ func autoConvert_kops_InstanceGroupSpec_To_v1alpha3_InstanceGroupSpec(in *kops.I } else { out.WarmPool = nil } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(ContainerdConfig) + if err := Convert_kops_ContainerdConfig_To_v1alpha3_ContainerdConfig(*in, *out, s); err != nil { + return err + } + } else { + out.Containerd = nil + } return nil } diff --git a/pkg/apis/kops/v1alpha3/zz_generated.deepcopy.go b/pkg/apis/kops/v1alpha3/zz_generated.deepcopy.go index 6cc89b35de..a8afbdf076 100644 --- a/pkg/apis/kops/v1alpha3/zz_generated.deepcopy.go +++ b/pkg/apis/kops/v1alpha3/zz_generated.deepcopy.go @@ -2478,6 +2478,11 @@ func (in *InstanceGroupSpec) DeepCopyInto(out *InstanceGroupSpec) { *out = new(WarmPoolSpec) (*in).DeepCopyInto(*out) } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(ContainerdConfig) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/apis/kops/validation/instancegroup.go b/pkg/apis/kops/validation/instancegroup.go index 25e2437009..7ac436552a 100644 --- a/pkg/apis/kops/validation/instancegroup.go +++ b/pkg/apis/kops/validation/instancegroup.go @@ -247,6 +247,10 @@ func CrossValidateInstanceGroup(g *kops.InstanceGroup, cluster *kops.Cluster, cl } } + if g.Spec.Containerd != nil { + allErrs = append(allErrs, validateContainerdConfig(&cluster.Spec, g.Spec.Containerd, field.NewPath("spec", "containerd"), false)...) 
+ } + { warmPool := cluster.Spec.WarmPool.ResolveDefaults(g) if warmPool.MaxSize == nil || *warmPool.MaxSize != 0 { diff --git a/pkg/apis/kops/validation/validation.go b/pkg/apis/kops/validation/validation.go index 82bcc97637..cd61045825 100644 --- a/pkg/apis/kops/validation/validation.go +++ b/pkg/apis/kops/validation/validation.go @@ -227,7 +227,7 @@ func validateClusterSpec(spec *kops.ClusterSpec, c *kops.Cluster, fieldPath *fie } if spec.Containerd != nil { - allErrs = append(allErrs, validateContainerdConfig(spec, spec.Containerd, fieldPath.Child("containerd"))...) + allErrs = append(allErrs, validateContainerdConfig(spec, spec.Containerd, fieldPath.Child("containerd"), true)...) } if spec.Docker != nil { @@ -1371,7 +1371,7 @@ func validateContainerRuntime(c *kops.Cluster, runtime string, fldPath *field.Pa return allErrs } -func validateContainerdConfig(spec *kops.ClusterSpec, config *kops.ContainerdConfig, fldPath *field.Path) field.ErrorList { +func validateContainerdConfig(spec *kops.ClusterSpec, config *kops.ContainerdConfig, fldPath *field.Path, inClusterConfig bool) field.ErrorList { allErrs := field.ErrorList{} if config.Version != nil { @@ -1429,7 +1429,7 @@ func validateContainerdConfig(spec *kops.ClusterSpec, config *kops.ContainerdCon } if config.NvidiaGPU != nil { - allErrs = append(allErrs, validateNvidiaConfig(spec, config.NvidiaGPU, fldPath.Child("nvidia"))...) + allErrs = append(allErrs, validateNvidiaConfig(spec, config.NvidiaGPU, fldPath.Child("nvidia"), inClusterConfig)...) } return allErrs @@ -1506,16 +1506,19 @@ func validateDockerConfig(config *kops.DockerConfig, fldPath *field.Path) field. 
return allErrs } -func validateNvidiaConfig(spec *kops.ClusterSpec, nvidia *kops.NvidiaGPUConfig, fldPath *field.Path) (allErrs field.ErrorList) { +func validateNvidiaConfig(spec *kops.ClusterSpec, nvidia *kops.NvidiaGPUConfig, fldPath *field.Path, inClusterConfig bool) (allErrs field.ErrorList) { if !fi.BoolValue(nvidia.Enabled) { return allErrs } - if spec.GetCloudProvider() != kops.CloudProviderAWS { - allErrs = append(allErrs, field.Forbidden(fldPath, "Nvidia is only supported on AWS")) + if spec.GetCloudProvider() != kops.CloudProviderAWS && spec.GetCloudProvider() != kops.CloudProviderOpenstack { + allErrs = append(allErrs, field.Forbidden(fldPath, "Nvidia is only supported on AWS and OpenStack")) } if spec.ContainerRuntime != "" && spec.ContainerRuntime != "containerd" { allErrs = append(allErrs, field.Forbidden(fldPath, "Nvidia is only supported using containerd")) } + if spec.GetCloudProvider() == kops.CloudProviderOpenstack && inClusterConfig { + allErrs = append(allErrs, field.Forbidden(fldPath, "OpenStack supports nvidia configuration only in instance group")) + } return allErrs } diff --git a/pkg/apis/kops/validation/validation_test.go b/pkg/apis/kops/validation/validation_test.go index cbe627c612..0c7ef4e0b8 100644 --- a/pkg/apis/kops/validation/validation_test.go +++ b/pkg/apis/kops/validation/validation_test.go @@ -1359,7 +1359,7 @@ func TestValidateSAExternalPermissions(t *testing.T) { } } -func Test_Validate_Nvdia(t *testing.T) { +func Test_Validate_Nvidia_Cluster(t *testing.T) { grid := []struct { Input kops.ClusterSpec ExpectedErrors []string @@ -1377,6 +1377,86 @@ func Test_Validate_Nvdia(t *testing.T) { ContainerRuntime: "containerd", }, }, + { + Input: kops.ClusterSpec{ + Containerd: &kops.ContainerdConfig{ + NvidiaGPU: &kops.NvidiaGPUConfig{ + Enabled: fi.Bool(true), + }, + }, + CloudProvider: kops.CloudProviderSpec{ + Openstack: &kops.OpenstackSpec{}, + }, + ContainerRuntime: "containerd", + }, + ExpectedErrors: 
[]string{"Forbidden::containerd.nvidiaGPU"}, + }, + { + Input: kops.ClusterSpec{ + Containerd: &kops.ContainerdConfig{ + NvidiaGPU: &kops.NvidiaGPUConfig{ + Enabled: fi.Bool(true), + }, + }, + CloudProvider: kops.CloudProviderSpec{ + GCE: &kops.GCESpec{}, + }, + ContainerRuntime: "containerd", + }, + ExpectedErrors: []string{"Forbidden::containerd.nvidiaGPU"}, + }, + { + Input: kops.ClusterSpec{ + Containerd: &kops.ContainerdConfig{ + NvidiaGPU: &kops.NvidiaGPUConfig{ + Enabled: fi.Bool(true), + }, + }, + CloudProvider: kops.CloudProviderSpec{ + AWS: &kops.AWSSpec{}, + }, + ContainerRuntime: "docker", + }, + ExpectedErrors: []string{"Forbidden::containerd.nvidiaGPU"}, + }, + } + for _, g := range grid { + errs := validateNvidiaConfig(&g.Input, g.Input.Containerd.NvidiaGPU, field.NewPath("containerd", "nvidiaGPU"), true) + testErrors(t, g.Input, errs, g.ExpectedErrors) + } +} + +func Test_Validate_Nvidia_Ig(t *testing.T) { + grid := []struct { + Input kops.ClusterSpec + ExpectedErrors []string + }{ + { + Input: kops.ClusterSpec{ + Containerd: &kops.ContainerdConfig{ + NvidiaGPU: &kops.NvidiaGPUConfig{ + Enabled: fi.Bool(true), + }, + }, + CloudProvider: kops.CloudProviderSpec{ + AWS: &kops.AWSSpec{}, + }, + ContainerRuntime: "containerd", + }, + }, + { + Input: kops.ClusterSpec{ + Containerd: &kops.ContainerdConfig{ + NvidiaGPU: &kops.NvidiaGPUConfig{ + Enabled: fi.Bool(true), + }, + }, + CloudProvider: kops.CloudProviderSpec{ + Openstack: &kops.OpenstackSpec{}, + }, + ContainerRuntime: "containerd", + }, + }, { Input: kops.ClusterSpec{ Containerd: &kops.ContainerdConfig{ @@ -1407,7 +1487,7 @@ func Test_Validate_Nvdia(t *testing.T) { }, } for _, g := range grid { - errs := validateNvidiaConfig(&g.Input, g.Input.Containerd.NvidiaGPU, field.NewPath("containerd", "nvidiaGPU")) + errs := validateNvidiaConfig(&g.Input, g.Input.Containerd.NvidiaGPU, field.NewPath("containerd", "nvidiaGPU"), false) testErrors(t, g.Input, errs, g.ExpectedErrors) } } diff --git 
a/pkg/apis/kops/zz_generated.deepcopy.go b/pkg/apis/kops/zz_generated.deepcopy.go index d5b2dfd1ff..a5ffb254a6 100644 --- a/pkg/apis/kops/zz_generated.deepcopy.go +++ b/pkg/apis/kops/zz_generated.deepcopy.go @@ -2641,6 +2641,11 @@ func (in *InstanceGroupSpec) DeepCopyInto(out *InstanceGroupSpec) { *out = new(WarmPoolSpec) (*in).DeepCopyInto(*out) } + if in.Containerd != nil { + in, out := &in.Containerd, &out.Containerd + *out = new(ContainerdConfig) + (*in).DeepCopyInto(*out) + } return } diff --git a/upup/pkg/fi/cloudup/apply_cluster.go b/upup/pkg/fi/cloudup/apply_cluster.go index 2669ed63f4..b5c7d4f03d 100644 --- a/upup/pkg/fi/cloudup/apply_cluster.go +++ b/upup/pkg/fi/cloudup/apply_cluster.go @@ -70,6 +70,7 @@ import ( "k8s.io/kops/util/pkg/architectures" "k8s.io/kops/util/pkg/hashing" "k8s.io/kops/util/pkg/mirrors" + "k8s.io/kops/util/pkg/reflectutils" "k8s.io/kops/util/pkg/vfs" ) @@ -1397,14 +1398,26 @@ func (n *nodeUpConfigBuilder) BuildConfig(ig *kops.InstanceGroup, apiserverAddit config.Channels = n.channels config.EtcdManifests = n.etcdManifests[role] - if cluster.Spec.ContainerRuntime == "containerd" { - config.ContainerdConfig = cluster.Spec.Containerd + if ig.Spec.Containerd != nil || cluster.Spec.ContainerRuntime == "containerd" { + config.ContainerdConfig = n.buildContainerdConfig(ig) } if cluster.Spec.Containerd != nil && cluster.Spec.Containerd.NvidiaGPU != nil { config.NvidiaGPU = cluster.Spec.Containerd.NvidiaGPU } + if ig.Spec.Containerd != nil && ig.Spec.Containerd.NvidiaGPU != nil { + if config.NvidiaGPU == nil { + config.NvidiaGPU = ig.Spec.Containerd.NvidiaGPU + } else { + reflectutils.JSONMergeStruct(&config.NvidiaGPU, ig.Spec.Containerd.NvidiaGPU) + } + } + + if config.NvidiaGPU != nil && config.NvidiaGPU.DriverPackage == "" { + config.NvidiaGPU.DriverPackage = "nvidia-headless-460-server" + } + if ig.Spec.WarmPool != nil || cluster.Spec.WarmPool != nil { config.WarmPoolImages = n.buildWarmPoolImages(ig) } @@ -1431,6 +1444,15 @@ func 
loadCertificates(keysets map[string]*fi.Keyset, name string, config *nodeup return nil } +// buildContainerdConfig builds containerd configuration for instance. Instance group configuration will override cluster configuration +func (n *nodeUpConfigBuilder) buildContainerdConfig(ig *kops.InstanceGroup) *kops.ContainerdConfig { + config := n.cluster.Spec.Containerd.DeepCopy() + if ig.Spec.Containerd != nil { + reflectutils.JSONMergeStruct(&config, ig.Spec.Containerd) + } + return config +} + // buildWarmPoolImages returns a list of container images that should be pre-pulled during instance pre-initialization func (n *nodeUpConfigBuilder) buildWarmPoolImages(ig *kops.InstanceGroup) []string { if ig == nil || ig.Spec.Role == kops.InstanceGroupRoleMaster { diff --git a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go index 830f5c78dd..6c80c21307 100644 --- a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go +++ b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go @@ -658,8 +658,15 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*Addon } nvidia := b.Cluster.Spec.Containerd.NvidiaGPU + igNvidia := false + for _, ig := range b.KopsModelContext.InstanceGroups { + if ig.Spec.Containerd != nil && ig.Spec.Containerd.NvidiaGPU != nil && fi.BoolValue(ig.Spec.Containerd.NvidiaGPU.Enabled) { + igNvidia = true + break + } + } - if nvidia != nil && fi.BoolValue(nvidia.Enabled) { + if nvidia != nil && fi.BoolValue(nvidia.Enabled) || igNvidia { key := "nvidia.addons.k8s.io" diff --git a/upup/pkg/fi/cloudup/populate_instancegroup_spec.go b/upup/pkg/fi/cloudup/populate_instancegroup_spec.go index c709060aba..2361c29860 100644 --- a/upup/pkg/fi/cloudup/populate_instancegroup_spec.go +++ b/upup/pkg/fi/cloudup/populate_instancegroup_spec.go @@ -175,29 +175,45 @@ func PopulateInstanceGroupSpec(cluster *kops.Cluster, input 
*kops.InstanceGroup, return nil, fmt.Errorf("unable to infer any Subnets for InstanceGroup %s ", ig.ObjectMeta.Name) } + hasGPU := false + clusterNvidia := false if cluster.Spec.Containerd != nil && cluster.Spec.Containerd.NvidiaGPU != nil && fi.BoolValue(cluster.Spec.Containerd.NvidiaGPU.Enabled) { - switch cluster.Spec.GetCloudProvider() { - case kops.CloudProviderAWS: + clusterNvidia = true + } + igNvidia := false + if ig.Spec.Containerd != nil && ig.Spec.Containerd.NvidiaGPU != nil && fi.BoolValue(ig.Spec.Containerd.NvidiaGPU.Enabled) { + igNvidia = true + } + + switch cluster.Spec.GetCloudProvider() { + case kops.CloudProviderAWS: + if clusterNvidia || igNvidia { mt, err := awsup.GetMachineTypeInfo(cloud.(awsup.AWSCloud), ig.Spec.MachineType) if err != nil { return ig, fmt.Errorf("error looking up machine type info: %v", err) } - if mt.GPU { - if ig.Spec.NodeLabels == nil { - ig.Spec.NodeLabels = make(map[string]string) - } - ig.Spec.NodeLabels["kops.k8s.io/gpu"] = "1" - hasNvidiaTaint := false - for _, taint := range ig.Spec.Taints { - if strings.HasPrefix(taint, "nvidia.com/gpu") { - hasNvidiaTaint = true - } - } - if !hasNvidiaTaint { - ig.Spec.Taints = append(ig.Spec.Taints, "nvidia.com/gpu:NoSchedule") - } + hasGPU = mt.GPU + } + case kops.CloudProviderOpenstack: + if igNvidia { + hasGPU = true + } + } + + if hasGPU { + if ig.Spec.NodeLabels == nil { + ig.Spec.NodeLabels = make(map[string]string) + } + ig.Spec.NodeLabels["kops.k8s.io/gpu"] = "1" + hasNvidiaTaint := false + for _, taint := range ig.Spec.Taints { + if strings.HasPrefix(taint, "nvidia.com/gpu") { + hasNvidiaTaint = true } } + if !hasNvidiaTaint { + ig.Spec.Taints = append(ig.Spec.Taints, "nvidia.com/gpu:NoSchedule") + } } if ig.Spec.Manager == "" { diff --git a/upup/pkg/fi/nodeup/command.go b/upup/pkg/fi/nodeup/command.go index 003a46785d..bdfa0350d0 100644 --- a/upup/pkg/fi/nodeup/command.go +++ b/upup/pkg/fi/nodeup/command.go @@ -301,7 +301,12 @@ func (c *NodeUpCommand) Run(out io.Writer) 
error { modelContext.GPUVendor = architectures.GPUVendorNvidia } } - + } else if cloudProvider == api.CloudProviderOpenstack { + // On OpenStack, NvidiaGPU can only be enabled at the instance group level, so when it is enabled we assume the instance has a GPU + if nodeupConfig.NvidiaGPU != nil && fi.BoolValue(nodeupConfig.NvidiaGPU.Enabled) { + klog.Info("instance supports GPU acceleration") + modelContext.GPUVendor = architectures.GPUVendorNvidia + } } if err := loadKernelModules(modelContext); err != nil {