Add a workaround for scaling of VMs with GPUs

When a machine with a GPU becomes ready, it can take
up to 15 minutes before it reports that the GPU is allocatable.
This can cause Cluster Autoscaler to trigger a second,
unnecessary scale-up.
The workaround sets allocatable to capacity for GPU so that
a node that waits for GPUs to become ready to use will be
considered as a place where pods requesting GPUs can be
scheduled.
This commit is contained in:
Beata Skiba 2017-11-06 13:23:34 +01:00
parent c4a678a347
commit 2b28ac1a04
5 changed files with 123 additions and 6 deletions

View File

@ -30,6 +30,7 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
"github.com/golang/glog"
@ -38,7 +39,6 @@ import (
const (
mbPerGB = 1000
millicoresPerCore = 1000
resourceNvidiaGPU = "nvidia.com/gpu"
)
// builds templates for gce cloud provider
@ -98,7 +98,7 @@ func (t *templateBuilder) buildCapacity(machineType string, accelerators []*gce.
capacity[apiv1.ResourceMemory] = *resource.NewQuantity(mem, resource.DecimalSI)
if accelerators != nil && len(accelerators) > 0 {
capacity[resourceNvidiaGPU] = *resource.NewQuantity(t.getAcceleratorCount(accelerators), resource.DecimalSI)
capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(t.getAcceleratorCount(accelerators), resource.DecimalSI)
}
return capacity, nil

View File

@ -20,13 +20,16 @@ import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
gpuUtils "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
gce "google.golang.org/api/compute/v1"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
"k8s.io/kubernetes/pkg/quota"
"github.com/stretchr/testify/assert"
)
func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
@ -486,7 +489,7 @@ func makeResourceList(cpu string, memory string, gpu int64) (apiv1.ResourceList,
if err != nil {
return nil, err
}
result[resourceNvidiaGPU] = resultGpu
result[gpuUtils.ResourceNvidiaGPU] = resultGpu
}
return result, nil
}

View File

@ -19,16 +19,17 @@ package core
import (
"time"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
kube_client "k8s.io/client-go/kubernetes"
kube_record "k8s.io/client-go/tools/record"
"github.com/golang/glog"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
)
// StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature
@ -106,6 +107,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
glog.Errorf("Failed to list ready nodes: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
// TODO: Remove this call when we handle dynamically provisioned resources.
readyNodes = gpu.SetGPUAllocatableToCapacity(readyNodes)
if len(readyNodes) == 0 {
glog.Warningf("No ready nodes in the cluster")
scaleDown.CleanUpUnneededNodes()

View File

@ -0,0 +1,54 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gpu
import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/api"
"github.com/golang/glog"
)
const (
// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
ResourceNvidiaGPU = "nvidia.com/gpu"
)
// SetGPUAllocatableToCapacity allows us to tolerate the fact that nodes with
// GPUs can have allocatable set to 0 for multiple minutes after becoming ready.
// Without this workaround, Cluster Autoscaler will trigger an unnecessary
// additional scale up before the node is fully operational.
// The returned slice contains the same nodes, except that nodes with GPU
// capacity but missing/zero GPU allocatable are replaced by deep copies whose
// allocatable is set to capacity; input nodes are never mutated.
// TODO: Remove this once we handle dynamically provisioned resources well.
func SetGPUAllocatableToCapacity(nodes []*apiv1.Node) []*apiv1.Node {
result := []*apiv1.Node{}
for _, node := range nodes {
newNode := node
// Only nodes that advertise GPU capacity need the workaround.
if gpuCapacity, ok := node.Status.Capacity[ResourceNvidiaGPU]; ok {
// Patch only when GPU allocatable is absent or still zero.
if gpuAllocatable, ok := node.Status.Allocatable[ResourceNvidiaGPU]; !ok || gpuAllocatable.IsZero() {
nodeCopy, err := api.Scheme.DeepCopy(node)
if err != nil {
// Best effort: on copy failure the original node is passed through unchanged.
glog.Errorf("Failed to make a copy of node %v", node.ObjectMeta.Name)
} else {
newNode = nodeCopy.(*apiv1.Node)
newNode.Status.Allocatable[ResourceNvidiaGPU] = gpuCapacity.DeepCopy()
}
}
}
result = append(result, newNode)
}
return result
}

View File

@ -0,0 +1,55 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gpu
import (
"testing"
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/stretchr/testify/assert"
)
// TestSetGPUAllocatableToCapacity covers the four interesting node shapes:
// a ready GPU node (allocatable == capacity, left alone), an unready GPU node
// (allocatable 0, bumped to capacity), a GPU node with no allocatable entry at
// all (entry created from capacity), and a node without any GPU (untouched).
func TestSetGPUAllocatableToCapacity(t *testing.T) {
	nodeGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
	nodeGPU.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
	nodeGPU.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
	nodeGPUUnready := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
	nodeGPUUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
	nodeGPUUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(2, resource.DecimalSI)
	nodeGPUNoAllocatable := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuNoAllocatable"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
	nodeGPUNoAllocatable.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
	// Fixed copy-paste bug: this node was previously also named "nodeGpuUnready",
	// which made failure messages ambiguous between two different nodes.
	nodeNoGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeNoGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
	nodeNoGPU.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewQuantity(1, resource.DecimalSI)
	nodeNoGPU.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(2, resource.DecimalSI)
	result := SetGPUAllocatableToCapacity([]*apiv1.Node{nodeGPU, nodeGPUUnready, nodeGPUNoAllocatable, nodeNoGPU})
	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[0])
	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 2, 2, result[1])
	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[2])
	assertAllocatableAndCapacity(t, apiv1.ResourceCPU, 1, 2, result[3])
}
// assertAllocatableAndCapacity checks that the node reports the given
// allocatable and capacity quantities for the named resource.
// Fixed: testify's assert.Equal takes (t, expected, actual) — the previous
// version passed them in (actual, expected) order, and the printf message
// arguments were swapped the same way, producing misleading failure output.
func assertAllocatableAndCapacity(t *testing.T, resourceName apiv1.ResourceName, allocatable, capacity int64, node *apiv1.Node) {
	allocatableResource := *resource.NewQuantity(allocatable, resource.DecimalSI)
	capacityResource := *resource.NewQuantity(capacity, resource.DecimalSI)
	assert.Equal(t, allocatableResource, node.Status.Allocatable[resourceName],
		"Node %v, expected allocatable %v: %v got: %v", node.ObjectMeta.Name, resourceName, allocatableResource, node.Status.Allocatable[resourceName])
	assert.Equal(t, capacityResource, node.Status.Capacity[resourceName],
		"Node %v, expected capacity %v: %v got: %v", node.ObjectMeta.Name, resourceName, capacityResource, node.Status.Capacity[resourceName])
}