Merge pull request #4318 from DataDog/gce-ephemeral-storage-local-ssd

GCE ephemeral storage on local SSDs
Kubernetes Prow Robot 2021-10-18 06:59:49 -07:00 committed by GitHub
commit 47bc0f291d
4 changed files with 276 additions and 49 deletions

View File: cluster-autoscaler/cloudprovider/gce/reserved.go

@@ -23,12 +23,14 @@ import (
"strings"
"k8s.io/apimachinery/pkg/api/resource"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)
// There should be no imports as it is used standalone in e2e tests
const (
// KiB - KibiByte size (2^10)
KiB = 1024
// MiB - MebiByte size (2^20)
MiB = 1024 * 1024
// GiB - GibiByte size (2^30)
@@ -197,6 +199,75 @@ func parsePercentageToRatio(percentString string) (float64, error) {
return percentVal / 100, nil
}
// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount was
// measured by creating 1-node nodepools in a GKE cluster with ephemeral
// storage on N local SSDs, measuring for each node
// N * 375GiB - .status.capacity["ephemeral-storage"]
var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{
OperatingSystemDistributionCOS: {
1: 7289472,
2: 13725224,
3: 20031312,
4: 26332924,
5: 32634536,
6: 38946604,
7: 45254008,
8: 51556096,
16: 52837800,
24: 78686620,
},
OperatingSystemDistributionUbuntu: {
1: 7219840,
2: 13651496,
3: 19953488,
4: 26255100,
5: 32556712,
6: 38860588,
7: 45163896,
8: 51465984,
16: 52747688,
24: 78601704,
},
}
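To make the measurement concrete, here is a minimal, self-contained sketch of the arithmetic behind each table entry. The reported capacity below is back-computed from the 1-disk COS entry for illustration, not a fresh measurement:

```go
package main

import "fmt"

func main() {
	const (
		kib               = int64(1024)
		gib               = int64(1024 * 1024 * 1024)
		localSSDSizeInGiB = int64(375)
	)
	diskCount := int64(1)
	// Hypothetical .status.capacity["ephemeral-storage"] reported by a
	// 1-SSD COS node, in KiB (derived from the table above).
	reportedCapacityKiB := int64(385926528)
	physicalKiB := diskCount * localSSDSizeInGiB * gib / kib
	fmt.Println(physicalKiB - reportedCapacityKiB) // 7289472, the COS 1-disk entry
}
```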
// EphemeralStorageOnLocalSSDFilesystemOverheadInBytes estimates the difference
// between the total physical capacity of the local SSDs and the ephemeral
// storage filesystem capacity. It uses experimental values measured for all
// possible disk counts in GKE. Custom Kubernetes on GCE may allow intermediate
// counts, attaching the measured count, but not using it all for ephemeral
// storage. In that case, the difference in overhead between GKE and custom node
// images may be higher than the difference in overhead between two disk counts,
// so interpolating wouldn't make much sense. Instead, we use the next count for
// which we measured a filesystem overhead, which is a safer approximation
// (better to reserve more and not scale up than not enough and not schedule).
func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDistribution OperatingSystemDistribution) int64 {
var measuredCount int64
if diskCount <= 8 {
measuredCount = diskCount
} else if diskCount <= 16 {
measuredCount = 16
} else {
measuredCount = 24 // max attachable
}
// the container runtime doesn't affect filesystem overhead
var measuredOS OperatingSystemDistribution
if osDistribution == OperatingSystemDistributionCOSContainerd {
measuredOS = OperatingSystemDistributionCOS
} else if osDistribution == OperatingSystemDistributionUbuntuContainerd {
measuredOS = OperatingSystemDistributionUbuntu
} else {
measuredOS = osDistribution
}
o, ok := ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[measuredOS]
if !ok {
klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution)
return 0
}
return o[measuredCount] * KiB
}
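For intuition on the rounding-up behavior described above, a hypothetical Go Example test (assumed to sit in this package's test file, with fmt imported):

```go
func ExampleEphemeralStorageOnLocalSSDFilesystemOverheadInBytes() {
	// 10 disks has no measured entry, so the next measured count (16) is used.
	fmt.Println(EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(10, OperatingSystemDistributionCOS) / KiB)
	// Output: 52837800
}
```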
// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
switch osDistribution {

View File: cluster-autoscaler/cloudprovider/gce/reserved_test.go

@@ -103,3 +103,50 @@ func TestCalculateKernelReservedLinux(t *testing.T) {
})
}
}
func TestEphemeralStorageOnLocalSSDFilesystemOverheadInBytes(t *testing.T) {
type testCase struct {
scenario string
diskCount int64
osDistribution OperatingSystemDistribution
expected int64
}
testCases := []testCase{
{
scenario: "measured disk count and OS (cos)",
diskCount: 1,
osDistribution: OperatingSystemDistributionCOS,
expected: 7289472 * KiB,
},
{
scenario: "measured disk count but OS with different container runtime (cos_containerd)",
diskCount: 1,
osDistribution: OperatingSystemDistributionCOSContainerd,
expected: 7289472 * KiB, // same as COS
},
{
scenario: "measured disk count and OS (ubuntu)",
diskCount: 1,
osDistribution: OperatingSystemDistributionUbuntu,
expected: 7219840 * KiB,
},
{
scenario: "measured disk count but OS with different container runtime (ubuntu_containerd)",
diskCount: 1,
osDistribution: OperatingSystemDistributionUbuntuContainerd,
expected: 7219840 * KiB, // same as Ubuntu
},
{
scenario: "mapped disk count",
diskCount: 10,
osDistribution: OperatingSystemDistributionCOS,
expected: 52837800 * KiB, // value measured for 16 disks
},
}
for _, tc := range testCases {
t.Run(tc.scenario, func(t *testing.T) {
actual := EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(tc.diskCount, tc.osDistribution)
assert.Equal(t, tc.expected, actual)
})
}
}

View File: cluster-autoscaler/cloudprovider/gce/templates.go

@@ -30,15 +30,18 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/units"
klog "k8s.io/klog/v2"
)
// GceTemplateBuilder builds templates for GCE nodes.
type GceTemplateBuilder struct{}
// LocalSSDDiskSizeInGiB is the size of a single GCE local SSD assumed by the
// capacity computations below.
const LocalSSDDiskSizeInGiB = 375
// TODO: This should be imported from sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common/constants.go
// This key is applicable to both GCE and GKE
const gceCSITopologyKeyZone = "topology.gke.io/zone"
@@ -54,7 +57,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}
// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
@@ -71,7 +74,12 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}
if ephemeralStorage > 0 {
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
var storageTotal int64
if ephemeralStorageLocalSSDCount > 0 {
storageTotal = ephemeralStorage - EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(ephemeralStorageLocalSSDCount, osDistribution)
} else {
storageTotal = ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
}
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}
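As a worked example of the new branch, a standalone sketch: for two local SSDs on COS, the reported ephemeral-storage capacity is the raw 750GiB minus the measured 2-disk overhead from the table in reserved.go:

```go
package main

import "fmt"

func main() {
	const (
		kib = int64(1024)
		gib = int64(1024 * 1024 * 1024)
	)
	physical := 2 * 375 * gib  // ephemeralStorage for 2 local SSDs: 805306368000 bytes
	overhead := 13725224 * kib // measured 2-disk COS filesystem overhead, in bytes
	fmt.Println(physical - overhead) // 791251738624 bytes reported as capacity
}
```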
@@ -166,15 +174,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
}
var ephemeralStorage int64 = -1
if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
if err != nil {
klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
return nil, err
}
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
if ssdCount > 0 {
ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
}
if err != nil {
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods)
if err != nil {
return nil, err
}
@@ -228,10 +238,51 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return &node, nil
}
// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count")
if err != nil {
klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", err)
return 0
}
if !found {
return 0
}
n, err := strconv.Atoi(v)
if err != nil {
klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", err)
return 0
}
return int64(n)
}
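For reference, kube-env packs autoscaler settings into a single AUTOSCALER_ENV_VARS line of semicolon-separated key=value pairs (see the test cases below). The real lookup is done by extractAutoscalerVarFromKubeEnv, defined elsewhere in this package; the helper in this standalone sketch is an illustrative stand-in, not its actual implementation:

```go
package main

import (
	"fmt"
	"strings"
)

// extractVar is a hypothetical stand-in for extractAutoscalerVarFromKubeEnv:
// it finds the AUTOSCALER_ENV_VARS line in kube-env and looks up one key in
// its semicolon-separated key=value list.
func extractVar(kubeEnv, name string) (string, bool) {
	for _, line := range strings.Split(kubeEnv, "\n") {
		if !strings.HasPrefix(line, "AUTOSCALER_ENV_VARS:") {
			continue
		}
		list := strings.TrimSpace(strings.TrimPrefix(line, "AUTOSCALER_ENV_VARS:"))
		for _, kv := range strings.Split(list, ";") {
			parts := strings.SplitN(kv, "=", 2) // values like kube_reserved contain '='
			if len(parts) == 2 && parts[0] == name {
				return parts[1], true
			}
		}
	}
	return "", false
}

func main() {
	kubeEnv := "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n"
	v, found := extractVar(kubeEnv, "ephemeral_storage_local_ssd_count")
	fmt.Println(v, found) // 2 true
}
```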
func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("instance properties disks is nil")
}
var count int64
for _, disk := range instanceProperties.Disks {
if disk != nil && disk.InitializeParams != nil {
if disk.Type == "SCRATCH" && disk.InitializeParams.DiskType == "local-ssd" {
count++
}
}
}
if count < ssdCount {
return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count")
}
return ssdCount * LocalSSDDiskSizeInGiB * units.GiB, nil
}
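A same-package sketch of the disk shape this function expects, mirroring how the test further down constructs SCRATCH/local-ssd disks (gce here is the compute API alias used throughout this file):

```go
// One boot disk plus two local SSDs.
props := &gce.InstanceProperties{
	Disks: []*gce.AttachedDisk{
		{Boot: true, InitializeParams: &gce.AttachedDiskInitializeParams{DiskSizeGb: 100}},
		{Type: "SCRATCH", InitializeParams: &gce.AttachedDiskInitializeParams{DiskType: "local-ssd"}},
		{Type: "SCRATCH", InitializeParams: &gce.AttachedDiskInitializeParams{DiskType: "local-ssd"}},
	},
}
s, err := getLocalSSDEphemeralStorageFromInstanceTemplateProperties(props, 2)
// s == 2 * LocalSSDDiskSizeInGiB * units.GiB, err == nil;
// with ssdCount=3 it would return an error instead (only 2 local SSDs attached).
```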
// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// picked up from Instance Template and used as Ephemeral Storage, in case other types of storage are used
// as ephemeral storage
func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
if err == nil && found && v == "true" {
return true
@@ -239,7 +290,7 @@ func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
return false
}
func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
}

View File: cluster-autoscaler/cloudprovider/gce/templates_test.go

@@ -34,23 +34,35 @@ import (
quota "k8s.io/apiserver/pkg/quota/v1"
)
// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable
// are loaded into the node template status, covers a few error scenarios, and
// checks physical ephemeral storage (an intermediate result); it doesn't test
// that capacity and allocatable are computed correctly (the test itself calls
// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable,
// and ParseEvictionHardOrGetDefault to compute expected values); computations
// are tested separately.
func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
var thirtyPodsPerNode int64 = 30
type testCase struct {
scenario string
kubeEnv string
accelerators []*gce.AcceleratorConfig
mig Mig
physicalCpu int64
physicalMemory int64
physicalEphemeralStorage int64
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
expectedErr bool
pods *int64
scenario string
// test inputs
kubeEnv string
accelerators []*gce.AcceleratorConfig
attachedLocalSSDCount int64
pods *int64
// other test inputs (constant across test cases, because they are test invariants for now)
physicalCpu int64
physicalMemory int64
bootDiskSizeGiB int64
// dependent inputs; should match kubeEnv; used to compute expected capacity and allocatable (out of test scope)
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
ephemeralStorageLocalSSDCount int64
// test outputs
expectedErr bool
}
testCases := []testCase{
{
@@ -66,7 +78,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
},
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
bootDiskSizeGiB: 300,
kubeReserved: true,
reservedCpu: "1000m",
reservedMemory: fmt.Sprintf("%v", 1*units.MiB),
@@ -112,7 +124,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
bootDiskSizeGiB: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
@@ -127,15 +139,49 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"DNS_SERVER_IP: '10.0.0.10'\n" +
"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" +
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
isEphemeralStorageBlocked: false,
expectedErr: false,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
expectedErr: false,
},
{
scenario: "more local SSDs requested for ephemeral storage than attached",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=1\n",
ephemeralStorageLocalSSDCount: 1,
attachedLocalSSDCount: 0,
expectedErr: true,
},
{
scenario: "all attached local SSDs requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
attachedLocalSSDCount: 2,
expectedErr: false,
},
{
scenario: "more local SSDs attached than requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
attachedLocalSSDCount: 4,
expectedErr: false,
},
{
scenario: "ephemeral storage on local SSDs with kube-reserved",
kubeEnv: "AUTOSCALER_ENV_VARS: kube_reserved=cpu=0,memory=0,ephemeral-storage=10Gi;os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
kubeReserved: true,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "10Gi",
attachedLocalSSDCount: 4,
expectedErr: false,
},
}
for _, tc := range testCases {
@@ -158,12 +204,20 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
{
Boot: true,
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskSizeGb: tc.physicalEphemeralStorage,
DiskSizeGb: tc.bootDiskSizeGiB,
},
},
},
},
}
for i := int64(0); i < tc.attachedLocalSSDCount; i++ {
template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{
Type: "SCRATCH",
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskType: "local-ssd",
},
})
}
if tc.kubeEnv != "" {
template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}}
}
@@ -176,11 +230,15 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
assert.NotNil(t, node.Status)
assert.NotNil(t, node.Status.Capacity)
assert.NotNil(t, node.Status.Allocatable)
physicalEphemeralStorage := tc.physicalEphemeralStorage
if tc.isEphemeralStorageBlocked {
physicalEphemeralStorage = 0
// this logic is a duplicate of logic under test and would best be captured by
// specifying physicalEphemeralStorageGiB in the testCase struct
physicalEphemeralStorageGiB := tc.bootDiskSizeGiB
if tc.ephemeralStorageLocalSSDCount > 0 {
physicalEphemeralStorageGiB = tc.ephemeralStorageLocalSSDCount * LocalSSDDiskSizeInGiB
} else if tc.isEphemeralStorageBlocked {
physicalEphemeralStorageGiB = 0
}
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods)
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods)
assert.NoError(t, err)
assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
if !tc.kubeReserved {
@@ -371,17 +429,17 @@ func TestParseEvictionHard(t *testing.T) {
testCases := []testCase{{
memory: "200Mi",
ephemeralStorage: "15%",
memoryExpected: 200 * 1024 * 1024,
memoryExpected: 200 * MiB,
ephemeralStorageRatioExpected: 0.15,
}, {
memory: "2Gi",
ephemeralStorage: "11.5%",
memoryExpected: 2 * 1024 * 1024 * 1024,
memoryExpected: 2 * GiB,
ephemeralStorageRatioExpected: 0.115,
}, {
memory: "",
ephemeralStorage: "", // empty string, fallback to default
memoryExpected: 100 * 1024 * 1024,
memoryExpected: 100 * MiB,
ephemeralStorageRatioExpected: 0.1,
}, {
memory: "110292",
@@ -391,7 +449,7 @@ func TestParseEvictionHard(t *testing.T) {
}, {
memory: "abcb12", // unparsable, fallback to default
ephemeralStorage: "-11%", // negative percentage, should fallback to default
memoryExpected: 100 * 1024 * 1024,
memoryExpected: 100 * MiB,
ephemeralStorageRatioExpected: 0.1,
}}
for _, tc := range testCases {
@@ -474,7 +532,7 @@ func TestBuildCapacityMemory(t *testing.T) {
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
tb := GceTemplateBuilder{}
noAccelerators := make([]*gce.AcceleratorConfig, 0)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, 0, nil)
assert.NoError(t, err)
expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110)
assert.NoError(t, err)