Merge pull request #5275 from navinjoy/oom-params

VPA: make parameters oomBumpUpRatio and oomMinBumpUp configurable
This commit is contained in:
Kubernetes Prow Robot 2023-01-30 04:46:51 -08:00 committed by GitHub
commit 65c098b6f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 36 additions and 7 deletions

View File

@ -304,6 +304,22 @@ Please note the usage of the following arguments to override default names and p
You can then choose which recommender to use by setting `recommenders` inside the `VerticalPodAutoscaler` spec.
### Custom memory bump-up after OOMKill
After an OOMKill event is observed, VPA increases the memory recommendation based on the memory usage observed in the event according to this formula: `recommendation = max(memory-usage-in-oomkill-event + oom-min-bump-up-bytes, memory-usage-in-oomkill-event * oom-bump-up-ratio)`.
You can configure the minimum bump-up as well as the multiplier by specifying startup arguments for the recommender:
`oom-bump-up-ratio` specifies the memory bump-up ratio applied when an OOM occurs; the default is `1.2`. This means memory will be increased by 20% after an OOMKill event.
`oom-min-bump-up-bytes` specifies the minimal increase of memory after observing an OOM. Defaults to `100 * 1024 * 1024` (=100MiB).
Usage in the recommender deployment:
```
containers:
- name: recommender
args:
- --oom-bump-up-ratio=2.0
- --oom-min-bump-up-bytes=524288000
```
### Using CPU management with static policy
If you are using the [CPU management with static policy](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy) for some containers,
@ -314,7 +330,7 @@ The annotation format is the following:
```
vpa-post-processor.kubernetes.io/{containerName}_integerCPU=true
```
# Known limitations
* Whenever VPA updates the pod resources, the pod is recreated, which causes all

View File

@ -18,9 +18,10 @@ package main
import (
"flag"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/input"
"time"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/input"
apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/vertical-pod-autoscaler/common"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/input/history"
@ -65,6 +66,8 @@ var (
memoryAggregationIntervalCount = flag.Int64("memory-aggregation-interval-count", model.DefaultMemoryAggregationIntervalCount, `The number of consecutive memory-aggregation-intervals which make up the MemoryAggregationWindowLength which in turn is the period for memory usage aggregation by VPA. In other words, MemoryAggregationWindowLength = memory-aggregation-interval * memory-aggregation-interval-count.`)
memoryHistogramDecayHalfLife = flag.Duration("memory-histogram-decay-half-life", model.DefaultMemoryHistogramDecayHalfLife, `The amount of time it takes a historical memory usage sample to lose half of its weight. In other words, a fresh usage sample is twice as 'important' as one with age equal to the half life period.`)
cpuHistogramDecayHalfLife = flag.Duration("cpu-histogram-decay-half-life", model.DefaultCPUHistogramDecayHalfLife, `The amount of time it takes a historical CPU usage sample to lose half of its weight.`)
oomBumpUpRatio = flag.Float64("oom-bump-up-ratio", model.DefaultOOMBumpUpRatio, `The memory bump up ratio when OOM occurred, default is 1.2.`)
oomMinBumpUp = flag.Float64("oom-min-bump-up-bytes", model.DefaultOOMMinBumpUp, `The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024`)
)
// Post processors flags
@ -80,7 +83,7 @@ func main() {
config := common.CreateKubeConfigOrDie(*kubeconfig, float32(*kubeApiQps), int(*kubeApiBurst))
model.InitializeAggregationsConfig(model.NewAggregationsConfig(*memoryAggregationInterval, *memoryAggregationIntervalCount, *memoryHistogramDecayHalfLife, *cpuHistogramDecayHalfLife))
model.InitializeAggregationsConfig(model.NewAggregationsConfig(*memoryAggregationInterval, *memoryAggregationIntervalCount, *memoryHistogramDecayHalfLife, *cpuHistogramDecayHalfLife, *oomBumpUpRatio, *oomMinBumpUp))
healthCheck := metrics.NewHealthCheck(*metricsFetcherInterval*5, true)
metrics.Initialize(*address, healthCheck)

View File

@ -51,6 +51,10 @@ type AggregationsConfig struct {
// CPUHistogramDecayHalfLife is the amount of time it takes a historical
// CPU usage sample to lose half of its weight.
CPUHistogramDecayHalfLife time.Duration
// OOMBumpUpRatio specifies the memory bump up ratio when OOM occurred.
OOMBumpUpRatio float64
// OOMMinBumpUp specifies the minimal increase of memory when OOM occurred in bytes.
OOMMinBumpUp float64
}
const (
@ -71,6 +75,10 @@ const (
// DefaultCPUHistogramDecayHalfLife is the default value for CPUHistogramDecayHalfLife.
// CPU usage sample to lose half of its weight.
DefaultCPUHistogramDecayHalfLife = time.Hour * 24
// DefaultOOMBumpUpRatio is the default value for OOMBumpUpRatio.
DefaultOOMBumpUpRatio float64 = 1.2 // Memory is increased by 20% after an OOMKill.
// DefaultOOMMinBumpUp is the default value for OOMMinBumpUp.
DefaultOOMMinBumpUp float64 = 100 * 1024 * 1024 // Memory is increased by at least 100MB after an OOMKill.
)
// GetMemoryAggregationWindowLength returns the total length of the memory usage history aggregated by VPA.
@ -103,13 +111,15 @@ func (a *AggregationsConfig) memoryHistogramOptions() util.HistogramOptions {
}
// NewAggregationsConfig creates a new AggregationsConfig based on the supplied parameters and default values.
func NewAggregationsConfig(memoryAggregationInterval time.Duration, memoryAggregationIntervalCount int64, memoryHistogramDecayHalfLife, cpuHistogramDecayHalfLife time.Duration) *AggregationsConfig {
func NewAggregationsConfig(memoryAggregationInterval time.Duration, memoryAggregationIntervalCount int64, memoryHistogramDecayHalfLife, cpuHistogramDecayHalfLife time.Duration, oomBumpUpRatio float64, oomMinBumpUp float64) *AggregationsConfig {
a := &AggregationsConfig{
MemoryAggregationInterval: memoryAggregationInterval,
MemoryAggregationIntervalCount: memoryAggregationIntervalCount,
HistogramBucketSizeGrowth: DefaultHistogramBucketSizeGrowth,
MemoryHistogramDecayHalfLife: memoryHistogramDecayHalfLife,
CPUHistogramDecayHalfLife: cpuHistogramDecayHalfLife,
OOMBumpUpRatio: oomBumpUpRatio,
OOMMinBumpUp: oomMinBumpUp,
}
a.CPUHistogramOptions = a.cpuHistogramOptions()
a.MemoryHistogramOptions = a.memoryHistogramOptions()
@ -121,7 +131,7 @@ var aggregationsConfig *AggregationsConfig
// GetAggregationsConfig gets the aggregations config. Initializes to default values if not initialized already.
func GetAggregationsConfig() *AggregationsConfig {
if aggregationsConfig == nil {
aggregationsConfig = NewAggregationsConfig(DefaultMemoryAggregationInterval, DefaultMemoryAggregationIntervalCount, DefaultMemoryHistogramDecayHalfLife, DefaultCPUHistogramDecayHalfLife)
aggregationsConfig = NewAggregationsConfig(DefaultMemoryAggregationInterval, DefaultMemoryAggregationIntervalCount, DefaultMemoryHistogramDecayHalfLife, DefaultCPUHistogramDecayHalfLife, DefaultOOMBumpUpRatio, DefaultOOMMinBumpUp)
}
return aggregationsConfig

View File

@ -199,8 +199,8 @@ func (container *ContainerState) RecordOOM(timestamp time.Time, requestedMemory
// Get max of the request and the recent usage-based memory peak.
// Omitting oomPeak here to protect against recommendation running too high on subsequent OOMs.
memoryUsed := ResourceAmountMax(requestedMemory, container.memoryPeak)
memoryNeeded := ResourceAmountMax(memoryUsed+MemoryAmountFromBytes(OOMMinBumpUp),
ScaleResource(memoryUsed, OOMBumpUpRatio))
memoryNeeded := ResourceAmountMax(memoryUsed+MemoryAmountFromBytes(GetAggregationsConfig().OOMMinBumpUp),
ScaleResource(memoryUsed, GetAggregationsConfig().OOMBumpUpRatio))
oomMemorySample := ContainerUsageSample{
MeasureStart: timestamp,