Create node groups with GPU in scale-up.go
This is still not implemented in cloudprovider. Extended NewNodeGroup inteface to have a way of passing parameters for more complex resources.
This commit is contained in:
parent
6554919700
commit
b7f8622eb2
|
|
@ -162,7 +162,8 @@ func (aws *awsCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
|||
|
||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||
func (aws *awsCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
func (aws *awsCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ type CloudProvider interface {
|
|||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||
// Implementation optional.
|
||||
NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (NodeGroup, error)
|
||||
NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, extraResources map[string]resource.Quantity) (NodeGroup, error)
|
||||
|
||||
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
|
||||
GetResourceLimiter() (*ResourceLimiter, error)
|
||||
|
|
@ -72,6 +72,10 @@ var ErrNotImplemented errors.AutoscalerError = errors.NewAutoscalerError(errors.
|
|||
// ErrAlreadyExist is returned if a method is not implemented.
|
||||
var ErrAlreadyExist errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Already exist")
|
||||
|
||||
// ErrIllegalConfiguration is returned when trying to create NewNodeGroup with
|
||||
// configuration that is not supported by cloudprovider.
|
||||
var ErrIllegalConfiguration errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Configuration not allowed by cloud provider")
|
||||
|
||||
// NodeGroup contains configuration info and functions to control a set
|
||||
// of nodes that have the same capacity and set of labels.
|
||||
type NodeGroup interface {
|
||||
|
|
|
|||
|
|
@ -137,7 +137,8 @@ func (gce *GceCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
|||
|
||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||
func (gce *GceCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
func (gce *GceCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
nodePoolName := fmt.Sprintf("%s-%s-%d", nodeAutoprovisioningPrefix, machineType, time.Now().Unix())
|
||||
mig := &Mig{
|
||||
autoprovisioned: true,
|
||||
|
|
|
|||
|
|
@ -376,7 +376,7 @@ func TestMig(t *testing.T) {
|
|||
gceManagerMock.On("getLocation").Return("us-central1-b").Once()
|
||||
gceManagerMock.On("getTemplates").Return(templateBuilder).Once()
|
||||
server.On("handle", "/project1/zones/us-central1-b/machineTypes/n1-standard-1").Return(getMachineTypeResponse).Once()
|
||||
nodeGroup, err := gce.NewNodeGroup("n1-standard-1", nil, nil)
|
||||
nodeGroup, err := gce.NewNodeGroup("n1-standard-1", nil, nil, nil)
|
||||
assert.NoError(t, err)
|
||||
assert.NotNil(t, nodeGroup)
|
||||
mig1 := reflect.ValueOf(nodeGroup).Interface().(*Mig)
|
||||
|
|
|
|||
|
|
@ -113,7 +113,8 @@ func (kubemark *KubemarkCloudProvider) GetAvailableMachineTypes() ([]string, err
|
|||
}
|
||||
|
||||
// NewNodeGroup builds a theoretical node group based on the node definition provided.
|
||||
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -56,7 +56,8 @@ func (kubemark *KubemarkCloudProvider) GetAvailableMachineTypes() ([]string, err
|
|||
return []string{}, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
return nil, cloudprovider.ErrNotImplemented
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -136,7 +136,8 @@ func (tcp *TestCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
|||
|
||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||
func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||
return &TestNodeGroup{
|
||||
cloudProvider: tcp,
|
||||
id: "autoprovisioned-" + machineType,
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ import (
|
|||
|
||||
apiv1 "k8s.io/api/core/v1"
|
||||
extensionsv1 "k8s.io/api/extensions/v1beta1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/estimator"
|
||||
|
|
@ -29,6 +30,7 @@ import (
|
|||
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/simulator"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/labels"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/nodegroupset"
|
||||
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
|
||||
|
|
@ -371,15 +373,48 @@ func addAutoprovisionedCandidates(context *AutoscalingContext, nodeGroups []clou
|
|||
return nodeGroups, nodeInfos
|
||||
}
|
||||
|
||||
newGroupsCount := 0
|
||||
|
||||
newNodeGroups := addAllMachineTypesForConfig(context, map[string]string{}, map[string]resource.Quantity{},
|
||||
nodeInfos, unschedulablePods)
|
||||
newGroupsCount += len(newNodeGroups)
|
||||
nodeGroups = append(nodeGroups, newNodeGroups...)
|
||||
|
||||
gpuRequests := gpu.GetGpuRequests(unschedulablePods)
|
||||
for _, gpuRequestInfo := range gpuRequests {
|
||||
glog.V(4).Info("Adding node groups using GPU to NAP simulations")
|
||||
extraResources := map[string]resource.Quantity{
|
||||
gpu.ResourceNvidiaGPU: gpuRequestInfo.MaxRequest,
|
||||
}
|
||||
newNodeGroups := addAllMachineTypesForConfig(context, gpuRequestInfo.SystemLabels, extraResources,
|
||||
nodeInfos, gpuRequestInfo.Pods)
|
||||
newGroupsCount += len(newNodeGroups)
|
||||
nodeGroups = append(nodeGroups, newNodeGroups...)
|
||||
}
|
||||
glog.V(4).Infof("Considering %v potential node groups in NAP simulations", newGroupsCount)
|
||||
|
||||
return nodeGroups, nodeInfos
|
||||
}
|
||||
|
||||
func addAllMachineTypesForConfig(context *AutoscalingContext, systemLabels map[string]string, extraResources map[string]resource.Quantity,
|
||||
nodeInfos map[string]*schedulercache.NodeInfo, unschedulablePods []*apiv1.Pod) []cloudprovider.NodeGroup {
|
||||
|
||||
nodeGroups := make([]cloudprovider.NodeGroup, 0)
|
||||
machines, err := context.CloudProvider.GetAvailableMachineTypes()
|
||||
if err != nil {
|
||||
glog.Warningf("Failed to get machine types: %v", err)
|
||||
} else {
|
||||
return nodeGroups
|
||||
}
|
||||
|
||||
bestLabels := labels.BestLabelSet(unschedulablePods)
|
||||
for _, machineType := range machines {
|
||||
nodeGroup, err := context.CloudProvider.NewNodeGroup(machineType, bestLabels, nil)
|
||||
nodeGroup, err := context.CloudProvider.NewNodeGroup(machineType, bestLabels, systemLabels, extraResources)
|
||||
if err != nil {
|
||||
// We don't check if a given node group setup is allowed.
|
||||
// It's fine if it isn't, just don't consider it an option.
|
||||
if err != cloudprovider.ErrIllegalConfiguration {
|
||||
glog.Warningf("Unable to build temporary node group for %s: %v", machineType, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
nodeInfo, err := nodeGroup.TemplateNodeInfo()
|
||||
|
|
@ -390,8 +425,7 @@ func addAutoprovisionedCandidates(context *AutoscalingContext, nodeGroups []clou
|
|||
nodeInfos[nodeGroup.Id()] = nodeInfo
|
||||
nodeGroups = append(nodeGroups, nodeGroup)
|
||||
}
|
||||
}
|
||||
return nodeGroups, nodeInfos
|
||||
return nodeGroups
|
||||
}
|
||||
|
||||
func calculateClusterCoresMemoryTotal(nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulercache.NodeInfo) (int64, int64) {
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ func TestPriceExpander(t *testing.T) {
|
|||
provider.AddNode("ng2", n2)
|
||||
ng1, _ := provider.NodeGroupForNode(n1)
|
||||
ng2, _ := provider.NodeGroupForNode(n2)
|
||||
ng3, _ := provider.NewNodeGroup("MT1", nil, nil)
|
||||
ng3, _ := provider.NewNodeGroup("MT1", nil, nil, nil)
|
||||
|
||||
ni1 := schedulercache.NewNodeInfo()
|
||||
ni1.SetNode(n1)
|
||||
|
|
|
|||
|
|
@ -98,6 +98,9 @@ type GpuRequestInfo struct {
|
|||
MaxRequest resource.Quantity
|
||||
// Pods is a list of pods requesting GPU
|
||||
Pods []*apiv1.Pod
|
||||
// SystemLabels is a set of system labels corresponding to selected GPU
|
||||
// that needs to be passed to cloudprovider
|
||||
SystemLabels map[string]string
|
||||
}
|
||||
|
||||
// GetGpuRequests returns a GpuRequestInfo for each type of GPU requested by
|
||||
|
|
@ -127,6 +130,9 @@ func GetGpuRequests(pods []*apiv1.Pod) map[string]GpuRequestInfo {
|
|||
requestInfo = GpuRequestInfo{
|
||||
MaxRequest: podGpu,
|
||||
Pods: make([]*apiv1.Pod, 0),
|
||||
SystemLabels: map[string]string{
|
||||
GPULabel: gpuType,
|
||||
},
|
||||
}
|
||||
}
|
||||
if podGpu.Cmp(requestInfo.MaxRequest) > 0 {
|
||||
|
|
|
|||
Loading…
Reference in New Issue