Create node groups with GPU in scale-up.go
This is still not implemented in cloudprovider. Extended NewNodeGroup inteface to have a way of passing parameters for more complex resources.
This commit is contained in:
parent
6554919700
commit
b7f8622eb2
|
|
@ -162,7 +162,8 @@ func (aws *awsCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
||||||
|
|
||||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||||
func (aws *awsCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
func (aws *awsCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||||
|
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||||
return nil, cloudprovider.ErrNotImplemented
|
return nil, cloudprovider.ErrNotImplemented
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -53,7 +53,7 @@ type CloudProvider interface {
|
||||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||||
// Implementation optional.
|
// Implementation optional.
|
||||||
NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (NodeGroup, error)
|
NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, extraResources map[string]resource.Quantity) (NodeGroup, error)
|
||||||
|
|
||||||
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
|
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
|
||||||
GetResourceLimiter() (*ResourceLimiter, error)
|
GetResourceLimiter() (*ResourceLimiter, error)
|
||||||
|
|
@ -72,6 +72,10 @@ var ErrNotImplemented errors.AutoscalerError = errors.NewAutoscalerError(errors.
|
||||||
// ErrAlreadyExist is returned if a method is not implemented.
|
// ErrAlreadyExist is returned if a method is not implemented.
|
||||||
var ErrAlreadyExist errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Already exist")
|
var ErrAlreadyExist errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Already exist")
|
||||||
|
|
||||||
|
// ErrIllegalConfiguration is returned when trying to create NewNodeGroup with
|
||||||
|
// configuration that is not supported by cloudprovider.
|
||||||
|
var ErrIllegalConfiguration errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Configuration not allowed by cloud provider")
|
||||||
|
|
||||||
// NodeGroup contains configuration info and functions to control a set
|
// NodeGroup contains configuration info and functions to control a set
|
||||||
// of nodes that have the same capacity and set of labels.
|
// of nodes that have the same capacity and set of labels.
|
||||||
type NodeGroup interface {
|
type NodeGroup interface {
|
||||||
|
|
|
||||||
|
|
@ -137,7 +137,8 @@ func (gce *GceCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
||||||
|
|
||||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||||
func (gce *GceCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
func (gce *GceCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||||
|
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||||
nodePoolName := fmt.Sprintf("%s-%s-%d", nodeAutoprovisioningPrefix, machineType, time.Now().Unix())
|
nodePoolName := fmt.Sprintf("%s-%s-%d", nodeAutoprovisioningPrefix, machineType, time.Now().Unix())
|
||||||
mig := &Mig{
|
mig := &Mig{
|
||||||
autoprovisioned: true,
|
autoprovisioned: true,
|
||||||
|
|
|
||||||
|
|
@ -376,7 +376,7 @@ func TestMig(t *testing.T) {
|
||||||
gceManagerMock.On("getLocation").Return("us-central1-b").Once()
|
gceManagerMock.On("getLocation").Return("us-central1-b").Once()
|
||||||
gceManagerMock.On("getTemplates").Return(templateBuilder).Once()
|
gceManagerMock.On("getTemplates").Return(templateBuilder).Once()
|
||||||
server.On("handle", "/project1/zones/us-central1-b/machineTypes/n1-standard-1").Return(getMachineTypeResponse).Once()
|
server.On("handle", "/project1/zones/us-central1-b/machineTypes/n1-standard-1").Return(getMachineTypeResponse).Once()
|
||||||
nodeGroup, err := gce.NewNodeGroup("n1-standard-1", nil, nil)
|
nodeGroup, err := gce.NewNodeGroup("n1-standard-1", nil, nil, nil)
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.NotNil(t, nodeGroup)
|
assert.NotNil(t, nodeGroup)
|
||||||
mig1 := reflect.ValueOf(nodeGroup).Interface().(*Mig)
|
mig1 := reflect.ValueOf(nodeGroup).Interface().(*Mig)
|
||||||
|
|
|
||||||
|
|
@ -113,7 +113,8 @@ func (kubemark *KubemarkCloudProvider) GetAvailableMachineTypes() ([]string, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewNodeGroup builds a theoretical node group based on the node definition provided.
|
// NewNodeGroup builds a theoretical node group based on the node definition provided.
|
||||||
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||||
|
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||||
return nil, cloudprovider.ErrNotImplemented
|
return nil, cloudprovider.ErrNotImplemented
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,8 @@ func (kubemark *KubemarkCloudProvider) GetAvailableMachineTypes() ([]string, err
|
||||||
return []string{}, cloudprovider.ErrNotImplemented
|
return []string{}, cloudprovider.ErrNotImplemented
|
||||||
}
|
}
|
||||||
|
|
||||||
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||||
|
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||||
return nil, cloudprovider.ErrNotImplemented
|
return nil, cloudprovider.ErrNotImplemented
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -136,7 +136,8 @@ func (tcp *TestCloudProvider) GetAvailableMachineTypes() ([]string, error) {
|
||||||
|
|
||||||
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
|
||||||
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
|
||||||
func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
|
||||||
|
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
|
||||||
return &TestNodeGroup{
|
return &TestNodeGroup{
|
||||||
cloudProvider: tcp,
|
cloudProvider: tcp,
|
||||||
id: "autoprovisioned-" + machineType,
|
id: "autoprovisioned-" + machineType,
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ import (
|
||||||
|
|
||||||
apiv1 "k8s.io/api/core/v1"
|
apiv1 "k8s.io/api/core/v1"
|
||||||
extensionsv1 "k8s.io/api/extensions/v1beta1"
|
extensionsv1 "k8s.io/api/extensions/v1beta1"
|
||||||
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
|
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/estimator"
|
"k8s.io/autoscaler/cluster-autoscaler/estimator"
|
||||||
|
|
@ -29,6 +30,7 @@ import (
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/simulator"
|
"k8s.io/autoscaler/cluster-autoscaler/simulator"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||||
|
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/utils/labels"
|
"k8s.io/autoscaler/cluster-autoscaler/utils/labels"
|
||||||
"k8s.io/autoscaler/cluster-autoscaler/utils/nodegroupset"
|
"k8s.io/autoscaler/cluster-autoscaler/utils/nodegroupset"
|
||||||
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
|
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
|
||||||
|
|
@ -371,27 +373,59 @@ func addAutoprovisionedCandidates(context *AutoscalingContext, nodeGroups []clou
|
||||||
return nodeGroups, nodeInfos
|
return nodeGroups, nodeInfos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
newGroupsCount := 0
|
||||||
|
|
||||||
|
newNodeGroups := addAllMachineTypesForConfig(context, map[string]string{}, map[string]resource.Quantity{},
|
||||||
|
nodeInfos, unschedulablePods)
|
||||||
|
newGroupsCount += len(newNodeGroups)
|
||||||
|
nodeGroups = append(nodeGroups, newNodeGroups...)
|
||||||
|
|
||||||
|
gpuRequests := gpu.GetGpuRequests(unschedulablePods)
|
||||||
|
for _, gpuRequestInfo := range gpuRequests {
|
||||||
|
glog.V(4).Info("Adding node groups using GPU to NAP simulations")
|
||||||
|
extraResources := map[string]resource.Quantity{
|
||||||
|
gpu.ResourceNvidiaGPU: gpuRequestInfo.MaxRequest,
|
||||||
|
}
|
||||||
|
newNodeGroups := addAllMachineTypesForConfig(context, gpuRequestInfo.SystemLabels, extraResources,
|
||||||
|
nodeInfos, gpuRequestInfo.Pods)
|
||||||
|
newGroupsCount += len(newNodeGroups)
|
||||||
|
nodeGroups = append(nodeGroups, newNodeGroups...)
|
||||||
|
}
|
||||||
|
glog.V(4).Infof("Considering %v potential node groups in NAP simulations", newGroupsCount)
|
||||||
|
|
||||||
|
return nodeGroups, nodeInfos
|
||||||
|
}
|
||||||
|
|
||||||
|
func addAllMachineTypesForConfig(context *AutoscalingContext, systemLabels map[string]string, extraResources map[string]resource.Quantity,
|
||||||
|
nodeInfos map[string]*schedulercache.NodeInfo, unschedulablePods []*apiv1.Pod) []cloudprovider.NodeGroup {
|
||||||
|
|
||||||
|
nodeGroups := make([]cloudprovider.NodeGroup, 0)
|
||||||
machines, err := context.CloudProvider.GetAvailableMachineTypes()
|
machines, err := context.CloudProvider.GetAvailableMachineTypes()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.Warningf("Failed to get machine types: %v", err)
|
glog.Warningf("Failed to get machine types: %v", err)
|
||||||
} else {
|
return nodeGroups
|
||||||
bestLabels := labels.BestLabelSet(unschedulablePods)
|
|
||||||
for _, machineType := range machines {
|
|
||||||
nodeGroup, err := context.CloudProvider.NewNodeGroup(machineType, bestLabels, nil)
|
|
||||||
if err != nil {
|
|
||||||
glog.Warningf("Unable to build temporary node group for %s: %v", machineType, err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
nodeInfo, err := nodeGroup.TemplateNodeInfo()
|
|
||||||
if err != nil {
|
|
||||||
glog.Warningf("Unable to build template for node group for %s: %v", nodeGroup.Id(), err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
nodeInfos[nodeGroup.Id()] = nodeInfo
|
|
||||||
nodeGroups = append(nodeGroups, nodeGroup)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return nodeGroups, nodeInfos
|
|
||||||
|
bestLabels := labels.BestLabelSet(unschedulablePods)
|
||||||
|
for _, machineType := range machines {
|
||||||
|
nodeGroup, err := context.CloudProvider.NewNodeGroup(machineType, bestLabels, systemLabels, extraResources)
|
||||||
|
if err != nil {
|
||||||
|
// We don't check if a given node group setup is allowed.
|
||||||
|
// It's fine if it isn't, just don't consider it an option.
|
||||||
|
if err != cloudprovider.ErrIllegalConfiguration {
|
||||||
|
glog.Warningf("Unable to build temporary node group for %s: %v", machineType, err)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nodeInfo, err := nodeGroup.TemplateNodeInfo()
|
||||||
|
if err != nil {
|
||||||
|
glog.Warningf("Unable to build template for node group for %s: %v", nodeGroup.Id(), err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nodeInfos[nodeGroup.Id()] = nodeInfo
|
||||||
|
nodeGroups = append(nodeGroups, nodeGroup)
|
||||||
|
}
|
||||||
|
return nodeGroups
|
||||||
}
|
}
|
||||||
|
|
||||||
func calculateClusterCoresMemoryTotal(nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulercache.NodeInfo) (int64, int64) {
|
func calculateClusterCoresMemoryTotal(nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulercache.NodeInfo) (int64, int64) {
|
||||||
|
|
|
||||||
|
|
@ -73,7 +73,7 @@ func TestPriceExpander(t *testing.T) {
|
||||||
provider.AddNode("ng2", n2)
|
provider.AddNode("ng2", n2)
|
||||||
ng1, _ := provider.NodeGroupForNode(n1)
|
ng1, _ := provider.NodeGroupForNode(n1)
|
||||||
ng2, _ := provider.NodeGroupForNode(n2)
|
ng2, _ := provider.NodeGroupForNode(n2)
|
||||||
ng3, _ := provider.NewNodeGroup("MT1", nil, nil)
|
ng3, _ := provider.NewNodeGroup("MT1", nil, nil, nil)
|
||||||
|
|
||||||
ni1 := schedulercache.NewNodeInfo()
|
ni1 := schedulercache.NewNodeInfo()
|
||||||
ni1.SetNode(n1)
|
ni1.SetNode(n1)
|
||||||
|
|
|
||||||
|
|
@ -98,6 +98,9 @@ type GpuRequestInfo struct {
|
||||||
MaxRequest resource.Quantity
|
MaxRequest resource.Quantity
|
||||||
// Pods is a list of pods requesting GPU
|
// Pods is a list of pods requesting GPU
|
||||||
Pods []*apiv1.Pod
|
Pods []*apiv1.Pod
|
||||||
|
// SystemLabels is a set of system labels corresponding to selected GPU
|
||||||
|
// that needs to be passed to cloudprovider
|
||||||
|
SystemLabels map[string]string
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetGpuRequests returns a GpuRequestInfo for each type of GPU requested by
|
// GetGpuRequests returns a GpuRequestInfo for each type of GPU requested by
|
||||||
|
|
@ -127,6 +130,9 @@ func GetGpuRequests(pods []*apiv1.Pod) map[string]GpuRequestInfo {
|
||||||
requestInfo = GpuRequestInfo{
|
requestInfo = GpuRequestInfo{
|
||||||
MaxRequest: podGpu,
|
MaxRequest: podGpu,
|
||||||
Pods: make([]*apiv1.Pod, 0),
|
Pods: make([]*apiv1.Pod, 0),
|
||||||
|
SystemLabels: map[string]string{
|
||||||
|
GPULabel: gpuType,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if podGpu.Cmp(requestInfo.MaxRequest) > 0 {
|
if podGpu.Cmp(requestInfo.MaxRequest) > 0 {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue