Add a workaround for scaling of VMs with GPUs
When a machine with a GPU becomes ready, it can take up to 15 minutes before it reports the GPU as allocatable. This can cause Cluster Autoscaler to trigger a second, unnecessary scale-up. The workaround sets GPU allocatable equal to capacity, so that a node still waiting for its GPUs to become usable is treated as a place where pods requesting GPUs can be scheduled.
parent c4a678a347
commit 2b28ac1a04
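
In short: for every ready node that advertises GPU capacity but reports zero (or missing) GPU allocatable, the allocatable value is overwritten with the capacity value before the node list reaches the scale-up logic. Below is a minimal standalone sketch of that idea, assuming only the core/v1 API types; patchGPUAllocatable is an illustrative name and not part of this commit (the real entry point is SetGPUAllocatableToCapacity in the diff below).

// Sketch only: patchGPUAllocatable is a hypothetical helper illustrating the
// workaround, not part of this commit.
package gpusketch

import (
	apiv1 "k8s.io/api/core/v1"
)

const resourceNvidiaGPU apiv1.ResourceName = "nvidia.com/gpu"

// patchGPUAllocatable copies GPU capacity into GPU allocatable on a node
// whose GPUs are not yet usable, so that pending pods requesting GPUs are
// still considered schedulable on it.
func patchGPUAllocatable(node *apiv1.Node) {
	gpuCapacity, hasGPU := node.Status.Capacity[resourceNvidiaGPU]
	gpuAllocatable, ok := node.Status.Allocatable[resourceNvidiaGPU]
	if hasGPU && (!ok || gpuAllocatable.IsZero()) {
		if node.Status.Allocatable == nil {
			node.Status.Allocatable = apiv1.ResourceList{}
		}
		node.Status.Allocatable[resourceNvidiaGPU] = gpuCapacity.DeepCopy()
	}
}

Unlike this sketch, the actual implementation deep-copies each node before mutating it, so objects shared with the rest of the autoscaler stay untouched.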
@@ -30,6 +30,7 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
 	kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
 
 	"github.com/golang/glog"
@@ -38,7 +39,6 @@ import (
 const (
 	mbPerGB           = 1000
 	millicoresPerCore = 1000
-	resourceNvidiaGPU = "nvidia.com/gpu"
 )
 
 // builds templates for gce cloud provider
@@ -98,7 +98,7 @@ func (t *templateBuilder) buildCapacity(machineType string, accelerators []*gce.
 	capacity[apiv1.ResourceMemory] = *resource.NewQuantity(mem, resource.DecimalSI)
 
 	if accelerators != nil && len(accelerators) > 0 {
-		capacity[resourceNvidiaGPU] = *resource.NewQuantity(t.getAcceleratorCount(accelerators), resource.DecimalSI)
+		capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(t.getAcceleratorCount(accelerators), resource.DecimalSI)
 	}
 
 	return capacity, nil
@@ -20,13 +20,16 @@ import (
 	"fmt"
 	"testing"
 
-	"github.com/stretchr/testify/assert"
-	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
+	gpuUtils "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
+
 	gce "google.golang.org/api/compute/v1"
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
 	"k8s.io/kubernetes/pkg/quota"
+
+	"github.com/stretchr/testify/assert"
 )
 
 func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
@@ -486,7 +489,7 @@ func makeResourceList(cpu string, memory string, gpu int64) (apiv1.ResourceList,
 		if err != nil {
 			return nil, err
 		}
-		result[resourceNvidiaGPU] = resultGpu
+		result[gpuUtils.ResourceNvidiaGPU] = resultGpu
 	}
 	return result, nil
 }
@@ -19,16 +19,17 @@ package core
 import (
 	"time"
 
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	kube_client "k8s.io/client-go/kubernetes"
 	kube_record "k8s.io/client-go/tools/record"
 
 	"github.com/golang/glog"
-	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 )
 
 // StaticAutoscaler is an autoscaler which has all the core functionality of a CA but without the reconfiguration feature
@@ -106,6 +107,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		glog.Errorf("Failed to list ready nodes: %v", err)
 		return errors.ToAutoscalerError(errors.ApiCallError, err)
 	}
+	// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
+	// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
+	// TODO: Remove this call when we handle dynamically provisioned resources.
+	readyNodes = gpu.SetGPUAllocatableToCapacity(readyNodes)
 	if len(readyNodes) == 0 {
 		glog.Warningf("No ready nodes in the cluster")
 		scaleDown.CleanUpUnneededNodes()
@@ -0,0 +1,54 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package gpu
+
+import (
+	apiv1 "k8s.io/api/core/v1"
+	"k8s.io/kubernetes/pkg/api"
+
+	"github.com/golang/glog"
+)
+
+const (
+	// ResourceNvidiaGPU is the name of the Nvidia GPU resource.
+	ResourceNvidiaGPU = "nvidia.com/gpu"
+)
+
+// SetGPUAllocatableToCapacity allows us to tolerate the fact that nodes with
+// GPUs can have allocatable set to 0 for multiple minutes after becoming ready.
+// Without this workaround, Cluster Autoscaler will trigger an unnecessary
+// additional scale up before the node is fully operational.
+// TODO: Remove this once we handle dynamically provisioned resources well.
+func SetGPUAllocatableToCapacity(nodes []*apiv1.Node) []*apiv1.Node {
+	result := []*apiv1.Node{}
+	for _, node := range nodes {
+		newNode := node
+		if gpuCapacity, ok := node.Status.Capacity[ResourceNvidiaGPU]; ok {
+			if gpuAllocatable, ok := node.Status.Allocatable[ResourceNvidiaGPU]; !ok || gpuAllocatable.IsZero() {
+				nodeCopy, err := api.Scheme.DeepCopy(node)
+				if err != nil {
+					glog.Errorf("Failed to make a copy of node %v", node.ObjectMeta.Name)
+				} else {
+					newNode = nodeCopy.(*apiv1.Node)
+					newNode.Status.Allocatable[ResourceNvidiaGPU] = gpuCapacity.DeepCopy()
+				}
+			}
+		}
+		result = append(result, newNode)
+	}
+	return result
+}
@@ -0,0 +1,55 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package gpu
+
+import (
+	"testing"
+
+	apiv1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestSetGPUAllocatableToCapacity(t *testing.T) {
+	nodeGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
+	nodeGPU.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	nodeGPU.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	nodeGPUUnready := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuUnready"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
+	nodeGPUUnready.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(0, resource.DecimalSI)
+	nodeGPUUnready.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(2, resource.DecimalSI)
+	nodeGPUNoAllocatable := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeGpuNoAllocatable"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
+	nodeGPUNoAllocatable.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	nodeNoGPU := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeNoGpu"}, Status: apiv1.NodeStatus{Capacity: apiv1.ResourceList{}, Allocatable: apiv1.ResourceList{}}}
+	nodeNoGPU.Status.Allocatable[apiv1.ResourceCPU] = *resource.NewQuantity(1, resource.DecimalSI)
+	nodeNoGPU.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(2, resource.DecimalSI)
+	result := SetGPUAllocatableToCapacity([]*apiv1.Node{nodeGPU, nodeGPUUnready, nodeGPUNoAllocatable, nodeNoGPU})
+	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[0])
+	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 2, 2, result[1])
+	assertAllocatableAndCapacity(t, ResourceNvidiaGPU, 1, 1, result[2])
+	assertAllocatableAndCapacity(t, apiv1.ResourceCPU, 1, 2, result[3])
+}
+
+func assertAllocatableAndCapacity(t *testing.T, resourceName apiv1.ResourceName, allocatable, capacity int64, node *apiv1.Node) {
+	allocatableResource := *resource.NewQuantity(allocatable, resource.DecimalSI)
+	capacityResource := *resource.NewQuantity(capacity, resource.DecimalSI)
+	assert.Equal(t, allocatableResource, node.Status.Allocatable[resourceName],
+		"Node %v: expected allocatable %v of %v, got %v", node.ObjectMeta.Name, allocatableResource, resourceName, node.Status.Allocatable[resourceName])
+	assert.Equal(t, capacityResource, node.Status.Capacity[resourceName],
+		"Node %v: expected capacity %v of %v, got %v", node.ObjectMeta.Name, capacityResource, resourceName, node.Status.Capacity[resourceName])
+}