Merge pull request #401 from bskiba/gpu

Support GPUs in scale from 0
This commit is contained in:
Marcin Wielgus 2017-10-18 11:28:45 +02:00 committed by GitHub
commit 2418dba233
2 changed files with 80 additions and 13 deletions

View File

@ -75,10 +75,19 @@ func (t *templateBuilder) getCpuAndMemoryForMachineType(machineType string) (cpu
return machine.GuestCpus, machine.MemoryMb * 1024 * 1024, nil
}
func (t *templateBuilder) buildCapacity(machineType string) (apiv1.ResourceList, error) {
// getAcceleratorCount returns the total number of NVIDIA GPUs attached
// via the given accelerator configs. Accelerator types that do not carry
// the "nvidia-" prefix are ignored.
func (t *templateBuilder) getAcceleratorCount(accelerators []*gce.AcceleratorConfig) int64 {
	var total int64
	for _, acc := range accelerators {
		if !strings.HasPrefix(acc.AcceleratorType, "nvidia-") {
			continue
		}
		total += acc.AcceleratorCount
	}
	return total
}
func (t *templateBuilder) buildCapacity(machineType string, accelerators []*gce.AcceleratorConfig) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
// TODO: get a real value.
// TODO: handle GPU
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
cpu, mem, err := t.getCpuAndMemoryForMachineType(machineType)
@ -87,6 +96,11 @@ func (t *templateBuilder) buildCapacity(machineType string) (apiv1.ResourceList,
}
capacity[apiv1.ResourceCPU] = *resource.NewQuantity(cpu, resource.DecimalSI)
capacity[apiv1.ResourceMemory] = *resource.NewQuantity(mem, resource.DecimalSI)
if accelerators != nil && len(accelerators) > 0 {
capacity[apiv1.ResourceNvidiaGPU] = *resource.NewQuantity(t.getAcceleratorCount(accelerators), resource.DecimalSI)
}
return capacity, nil
}
@ -147,7 +161,8 @@ func (t *templateBuilder) buildNodeFromTemplate(mig *Mig, template *gce.Instance
SelfLink: fmt.Sprintf("/api/v1/nodes/%s", nodeName),
Labels: map[string]string{},
}
capacity, err := t.buildCapacity(template.Properties.MachineType)
capacity, err := t.buildCapacity(template.Properties.MachineType, template.Properties.GuestAccelerators)
if err != nil {
return nil, err
}
@ -215,7 +230,8 @@ func (t *templateBuilder) buildNodeFromAutoprovisioningSpec(mig *Mig) (*apiv1.No
SelfLink: fmt.Sprintf("/api/v1/nodes/%s", nodeName),
Labels: map[string]string{},
}
capacity, err := t.buildCapacity(mig.spec.machineType)
// TODO: Handle GPU
capacity, err := t.buildCapacity(mig.spec.machineType, nil)
if err != nil {
return nil, err
}

View File

@ -34,11 +34,13 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
kubeEnv string
name string
machineType string
accelerators []*gce.AcceleratorConfig
mig *Mig
capacityCpu string
capacityMemory string
allocatableCpu string
allocatableMemory string
gpuCount int64
expectedErr bool
}
testCases := []testCase{{
@ -49,6 +51,10 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
name: "nodeName",
machineType: "custom-8-2",
accelerators: []*gce.AcceleratorConfig{
{AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3},
{AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8},
},
mig: &Mig{GceRef: GceRef{
Name: "some-name",
Project: "some-proj",
@ -57,6 +63,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
capacityMemory: fmt.Sprintf("%v", 2*1024*1024),
allocatableCpu: "7000m",
allocatableMemory: fmt.Sprintf("%v", 1024*1024),
gpuCount: 11,
expectedErr: false,
}, {
kubeEnv: "ENABLE_NODE_PROBLEM_DETECTOR: 'daemonset'\n" +
@ -90,6 +97,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
template := &gce.InstanceTemplate{
Name: tc.name,
Properties: &gce.InstanceProperties{
GuestAccelerators: tc.accelerators,
Metadata: &gce.Metadata{
Items: []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}},
},
@ -102,10 +110,10 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
} else {
assert.NoError(t, err)
podsQuantity, _ := resource.ParseQuantity("110")
capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory)
capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory, tc.gpuCount)
capacity[apiv1.ResourcePods] = podsQuantity
assert.NoError(t, err)
allocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory)
allocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory, tc.gpuCount)
allocatable[apiv1.ResourcePods] = podsQuantity
assert.NoError(t, err)
assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
@ -182,6 +190,7 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) {
capacityMemory string
expectedCpu string
expectedMemory string
gcuCount int64
expectedErr bool
}
testCases := []testCase{{
@ -194,6 +203,7 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) {
capacityMemory: "700000Mi",
expectedCpu: "3000m",
expectedMemory: "400000Mi",
gcuCount: 10,
expectedErr: false,
}, {
kubeEnv: "ENABLE_NODE_PROBLEM_DETECTOR: 'daemonset'\n" +
@ -205,7 +215,7 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) {
expectedErr: true,
}}
for _, tc := range testCases {
capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory)
capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory, tc.gcuCount)
assert.NoError(t, err)
tb := templateBuilder{}
allocatable, err := tb.buildAllocatableFromKubeEnv(capacity, tc.kubeEnv)
@ -213,25 +223,59 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) {
assert.Error(t, err)
} else {
assert.NoError(t, err)
expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory)
expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory, tc.gcuCount)
assert.NoError(t, err)
assert.Equal(t, expectedResources, allocatable)
}
}
}
// TestGetAcceleratorCount verifies that only accelerator types with the
// "nvidia-" prefix contribute to the reported GPU count, and that counts
// across multiple accelerator configs are summed.
func TestGetAcceleratorCount(t *testing.T) {
	k80 := &gce.AcceleratorConfig{AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3}
	p100 := &gce.AcceleratorConfig{AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8}
	other := &gce.AcceleratorConfig{AcceleratorType: "other-type", AcceleratorCount: 3}

	testCases := []struct {
		accelerators []*gce.AcceleratorConfig
		count        int64
	}{
		{[]*gce.AcceleratorConfig{}, 0},
		{[]*gce.AcceleratorConfig{k80}, 3},
		{[]*gce.AcceleratorConfig{k80, p100}, 11},
		{[]*gce.AcceleratorConfig{other, p100}, 8},
	}
	tb := templateBuilder{}
	for _, tc := range testCases {
		assert.Equal(t, tc.count, tb.getAcceleratorCount(tc.accelerators))
	}
}
func TestBuildAllocatableFromCapacity(t *testing.T) {
type testCase struct {
capacityCpu string
capacityMemory string
allocatableCpu string
allocatableMemory string
gpuCount int64
}
testCases := []testCase{{
capacityCpu: "16000m",
capacityMemory: fmt.Sprintf("%v", 1*1024*1024*1024),
allocatableCpu: "15890m",
allocatableMemory: fmt.Sprintf("%v", 0.75*1024*1024*1024),
gpuCount: 1,
}, {
capacityCpu: "500m",
capacityMemory: fmt.Sprintf("%v", 200*1000*1024*1024),
@ -240,9 +284,9 @@ func TestBuildAllocatableFromCapacity(t *testing.T) {
}}
for _, tc := range testCases {
tb := templateBuilder{}
capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory)
capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory, tc.gpuCount)
assert.NoError(t, err)
expectedAllocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory)
expectedAllocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory, tc.gpuCount)
assert.NoError(t, err)
allocatable := tb.buildAllocatableFromCapacity(capacity)
assertEqualResourceLists(t, "Allocatable", expectedAllocatable, allocatable)
@ -364,7 +408,7 @@ func TestParseKubeReserved(t *testing.T) {
assert.Nil(t, resources)
} else {
assert.NoError(t, err)
expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory)
expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory, 0)
assert.NoError(t, err)
assertEqualResourceLists(t, "Resources", expectedResources, resources)
}
@ -425,18 +469,25 @@ func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool {
return set
}
func makeResourceList(cpu string, memory string) (apiv1.ResourceList, error) {
func makeResourceList(cpu string, memory string, gpu int64) (apiv1.ResourceList, error) {
result := apiv1.ResourceList{}
resultCpu, err := resource.ParseQuantity(cpu)
if err != nil {
return nil, err
}
result[apiv1.ResourceCPU] = resultCpu
resultMemory, err := resource.ParseQuantity(memory)
if err != nil {
return nil, err
}
result[apiv1.ResourceCPU] = resultCpu
result[apiv1.ResourceMemory] = resultMemory
if gpu > 0 {
resultGpu := *resource.NewQuantity(gpu, resource.DecimalSI)
if err != nil {
return nil, err
}
result[apiv1.ResourceNvidiaGPU] = resultGpu
}
return result, nil
}