Create node groups with GPU in scale-up.go

This is still not implemented in cloudprovider.
Extended NewNodeGroup inteface to have a way of passing
parameters for more complex resources.
This commit is contained in:
Maciej Pytel 2017-11-16 11:27:39 +01:00
parent 6554919700
commit b7f8622eb2
10 changed files with 74 additions and 25 deletions

View File

@ -162,7 +162,8 @@ func (aws *awsCloudProvider) GetAvailableMachineTypes() ([]string, error) {
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
func (aws *awsCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
func (aws *awsCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}

View File

@ -53,7 +53,7 @@ type CloudProvider interface {
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
// Implementation optional.
NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (NodeGroup, error)
NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, extraResources map[string]resource.Quantity) (NodeGroup, error)
// GetResourceLimiter returns struct containing limits (max, min) for resources (cores, memory etc.).
GetResourceLimiter() (*ResourceLimiter, error)
@ -72,6 +72,10 @@ var ErrNotImplemented errors.AutoscalerError = errors.NewAutoscalerError(errors.
// ErrAlreadyExist is returned if a method is not implemented.
var ErrAlreadyExist errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Already exist")
// ErrIllegalConfiguration is returned when trying to create NewNodeGroup with
// configuration that is not supported by cloudprovider.
var ErrIllegalConfiguration errors.AutoscalerError = errors.NewAutoscalerError(errors.InternalError, "Configuration not allowed by cloud provider")
// NodeGroup contains configuration info and functions to control a set
// of nodes that have the same capacity and set of labels.
type NodeGroup interface {

View File

@ -137,7 +137,8 @@ func (gce *GceCloudProvider) GetAvailableMachineTypes() ([]string, error) {
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
func (gce *GceCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
func (gce *GceCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
nodePoolName := fmt.Sprintf("%s-%s-%d", nodeAutoprovisioningPrefix, machineType, time.Now().Unix())
mig := &Mig{
autoprovisioned: true,

View File

@ -376,7 +376,7 @@ func TestMig(t *testing.T) {
gceManagerMock.On("getLocation").Return("us-central1-b").Once()
gceManagerMock.On("getTemplates").Return(templateBuilder).Once()
server.On("handle", "/project1/zones/us-central1-b/machineTypes/n1-standard-1").Return(getMachineTypeResponse).Once()
nodeGroup, err := gce.NewNodeGroup("n1-standard-1", nil, nil)
nodeGroup, err := gce.NewNodeGroup("n1-standard-1", nil, nil, nil)
assert.NoError(t, err)
assert.NotNil(t, nodeGroup)
mig1 := reflect.ValueOf(nodeGroup).Interface().(*Mig)

View File

@ -113,7 +113,8 @@ func (kubemark *KubemarkCloudProvider) GetAvailableMachineTypes() ([]string, err
}
// NewNodeGroup builds a theoretical node group based on the node definition provided.
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}

View File

@ -56,7 +56,8 @@ func (kubemark *KubemarkCloudProvider) GetAvailableMachineTypes() ([]string, err
return []string{}, cloudprovider.ErrNotImplemented
}
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
func (kubemark *KubemarkCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
return nil, cloudprovider.ErrNotImplemented
}

View File

@ -136,7 +136,8 @@ func (tcp *TestCloudProvider) GetAvailableMachineTypes() ([]string, error) {
// NewNodeGroup builds a theoretical node group based on the node definition provided. The node group is not automatically
// created on the cloud provider side. The node group is not returned by NodeGroups() until it is created.
func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string]string, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
func (tcp *TestCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string,
extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
return &TestNodeGroup{
cloudProvider: tcp,
id: "autoprovisioned-" + machineType,

View File

@ -22,6 +22,7 @@ import (
apiv1 "k8s.io/api/core/v1"
extensionsv1 "k8s.io/api/extensions/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
"k8s.io/autoscaler/cluster-autoscaler/estimator"
@ -29,6 +30,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/labels"
"k8s.io/autoscaler/cluster-autoscaler/utils/nodegroupset"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
@ -371,15 +373,48 @@ func addAutoprovisionedCandidates(context *AutoscalingContext, nodeGroups []clou
return nodeGroups, nodeInfos
}
newGroupsCount := 0
newNodeGroups := addAllMachineTypesForConfig(context, map[string]string{}, map[string]resource.Quantity{},
nodeInfos, unschedulablePods)
newGroupsCount += len(newNodeGroups)
nodeGroups = append(nodeGroups, newNodeGroups...)
gpuRequests := gpu.GetGpuRequests(unschedulablePods)
for _, gpuRequestInfo := range gpuRequests {
glog.V(4).Info("Adding node groups using GPU to NAP simulations")
extraResources := map[string]resource.Quantity{
gpu.ResourceNvidiaGPU: gpuRequestInfo.MaxRequest,
}
newNodeGroups := addAllMachineTypesForConfig(context, gpuRequestInfo.SystemLabels, extraResources,
nodeInfos, gpuRequestInfo.Pods)
newGroupsCount += len(newNodeGroups)
nodeGroups = append(nodeGroups, newNodeGroups...)
}
glog.V(4).Infof("Considering %v potential node groups in NAP simulations", newGroupsCount)
return nodeGroups, nodeInfos
}
func addAllMachineTypesForConfig(context *AutoscalingContext, systemLabels map[string]string, extraResources map[string]resource.Quantity,
nodeInfos map[string]*schedulercache.NodeInfo, unschedulablePods []*apiv1.Pod) []cloudprovider.NodeGroup {
nodeGroups := make([]cloudprovider.NodeGroup, 0)
machines, err := context.CloudProvider.GetAvailableMachineTypes()
if err != nil {
glog.Warningf("Failed to get machine types: %v", err)
} else {
return nodeGroups
}
bestLabels := labels.BestLabelSet(unschedulablePods)
for _, machineType := range machines {
nodeGroup, err := context.CloudProvider.NewNodeGroup(machineType, bestLabels, nil)
nodeGroup, err := context.CloudProvider.NewNodeGroup(machineType, bestLabels, systemLabels, extraResources)
if err != nil {
// We don't check if a given node group setup is allowed.
// It's fine if it isn't, just don't consider it an option.
if err != cloudprovider.ErrIllegalConfiguration {
glog.Warningf("Unable to build temporary node group for %s: %v", machineType, err)
}
continue
}
nodeInfo, err := nodeGroup.TemplateNodeInfo()
@ -390,8 +425,7 @@ func addAutoprovisionedCandidates(context *AutoscalingContext, nodeGroups []clou
nodeInfos[nodeGroup.Id()] = nodeInfo
nodeGroups = append(nodeGroups, nodeGroup)
}
}
return nodeGroups, nodeInfos
return nodeGroups
}
func calculateClusterCoresMemoryTotal(nodeGroups []cloudprovider.NodeGroup, nodeInfos map[string]*schedulercache.NodeInfo) (int64, int64) {

View File

@ -73,7 +73,7 @@ func TestPriceExpander(t *testing.T) {
provider.AddNode("ng2", n2)
ng1, _ := provider.NodeGroupForNode(n1)
ng2, _ := provider.NodeGroupForNode(n2)
ng3, _ := provider.NewNodeGroup("MT1", nil, nil)
ng3, _ := provider.NewNodeGroup("MT1", nil, nil, nil)
ni1 := schedulercache.NewNodeInfo()
ni1.SetNode(n1)

View File

@ -98,6 +98,9 @@ type GpuRequestInfo struct {
MaxRequest resource.Quantity
// Pods is a list of pods requesting GPU
Pods []*apiv1.Pod
// SystemLabels is a set of system labels corresponding to selected GPU
// that needs to be passed to cloudprovider
SystemLabels map[string]string
}
// GetGpuRequests returns a GpuRequestInfo for each type of GPU requested by
@ -127,6 +130,9 @@ func GetGpuRequests(pods []*apiv1.Pod) map[string]GpuRequestInfo {
requestInfo = GpuRequestInfo{
MaxRequest: podGpu,
Pods: make([]*apiv1.Pod, 0),
SystemLabels: map[string]string{
GPULabel: gpuType,
},
}
}
if podGpu.Cmp(requestInfo.MaxRequest) > 0 {