Add update_mode label to VPA updater metrics

This commit is contained in:
Luiz Oliveira 2025-07-31 11:24:42 -04:00
parent 3d748040d9
commit 36804f199c
4 changed files with 354 additions and 21 deletions

View File

@ -41,6 +41,7 @@ require (
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect

View File

@ -230,7 +230,8 @@ func (u *updater) RunOnce(ctx context.Context) {
// to contain only Pods controlled by a VPA in auto, recreate, or inPlaceOrRecreate mode
for vpa, livePods := range controlledPods {
vpaSize := len(livePods)
controlledPodsCounter.Add(vpaSize, vpaSize)
updateMode := vpa_api_util.GetUpdateMode(vpa)
controlledPodsCounter.Add(vpaSize, updateMode, vpaSize)
creatorToSingleGroupStatsMap, podToReplicaCreatorMap, err := u.restrictionFactory.GetCreatorMaps(livePods, vpa)
if err != nil {
klog.ErrorS(err, "Failed to get creator maps")
@ -242,7 +243,6 @@ func (u *updater) RunOnce(ctx context.Context) {
podsForInPlace := make([]*apiv1.Pod, 0)
podsForEviction := make([]*apiv1.Pod, 0)
updateMode := vpa_api_util.GetUpdateMode(vpa)
if updateMode == vpa_types.UpdateModeInPlaceOrRecreate && features.Enabled(features.InPlaceOrRecreate) {
podsForInPlace = u.getPodsUpdateOrder(filterNonInPlaceUpdatablePods(livePods, inPlaceLimiter), vpa)
@ -253,7 +253,7 @@ func (u *updater) RunOnce(ctx context.Context) {
klog.InfoS("Warning: feature gate is not enabled for this updateMode", "featuregate", features.InPlaceOrRecreate, "updateMode", vpa_types.UpdateModeInPlaceOrRecreate)
}
podsForEviction = u.getPodsUpdateOrder(filterNonEvictablePods(livePods, evictionLimiter), vpa)
evictablePodsCounter.Add(vpaSize, len(podsForEviction))
evictablePodsCounter.Add(vpaSize, updateMode, len(podsForEviction))
}
withInPlaceUpdatable := false
@ -304,7 +304,7 @@ func (u *updater) RunOnce(ctx context.Context) {
klog.V(0).InfoS("Eviction failed", "error", evictErr, "pod", klog.KObj(pod))
} else {
withEvicted = true
metrics_updater.AddEvictedPod(vpaSize)
metrics_updater.AddEvictedPod(vpaSize, updateMode)
}
}
@ -315,10 +315,10 @@ func (u *updater) RunOnce(ctx context.Context) {
vpasWithInPlaceUpdatedPodsCounter.Add(vpaSize, 1)
}
if withEvictable {
vpasWithEvictablePodsCounter.Add(vpaSize, 1)
vpasWithEvictablePodsCounter.Add(vpaSize, updateMode, 1)
}
if withEvicted {
vpasWithEvictedPodsCounter.Add(vpaSize, 1)
vpasWithEvictedPodsCounter.Add(vpaSize, updateMode, 1)
}
}
timer.ObserveStep("EvictPods")

View File

@ -22,6 +22,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics"
)
@ -35,13 +36,20 @@ type SizeBasedGauge struct {
gauge *prometheus.GaugeVec
}
// UpdateModeAndSizeBasedGauge is a wrapper for incrementally recording values
// indexed by log2(VPA size) and update mode. Values are accumulated in memory
// through Add and are only written to the underlying Prometheus gauge vector
// when Observe is called.
type UpdateModeAndSizeBasedGauge struct {
	// values holds one map per log2(VPA size) bucket; each map accumulates the
	// recorded value per VPA update mode.
	values [metrics.MaxVpaSizeLog]map[vpa_types.UpdateMode]int
	// gauge is the Prometheus gauge vector that Observe writes into.
	gauge *prometheus.GaugeVec
}
var (
controlledCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "controlled_pods_total",
Help: "Number of Pods controlled by VPA updater.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
evictableCount = prometheus.NewGaugeVec(
@ -49,7 +57,7 @@ var (
Namespace: metricsNamespace,
Name: "evictable_pods_total",
Help: "Number of Pods matching evicition criteria.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
evictedCount = prometheus.NewCounterVec(
@ -57,7 +65,7 @@ var (
Namespace: metricsNamespace,
Name: "evicted_pods_total",
Help: "Number of Pods evicted by Updater to apply a new recommendation.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
vpasWithEvictablePodsCount = prometheus.NewGaugeVec(
@ -65,7 +73,7 @@ var (
Namespace: metricsNamespace,
Name: "vpas_with_evictable_pods_total",
Help: "Number of VPA objects with at least one Pod matching evicition criteria.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
vpasWithEvictedPodsCount = prometheus.NewGaugeVec(
@ -73,7 +81,7 @@ var (
Namespace: metricsNamespace,
Name: "vpas_with_evicted_pods_total",
Help: "Number of VPA objects with at least one evicted Pod.",
}, []string{"vpa_size_log2"},
}, []string{"vpa_size_log2", "update_mode"},
)
inPlaceUpdatableCount = prometheus.NewGaugeVec(
@ -138,30 +146,41 @@ func newSizeBasedGauge(gauge *prometheus.GaugeVec) *SizeBasedGauge {
}
}
// newModeAndSizeBasedGauge builds an UpdateModeAndSizeBasedGauge backed by the
// given gauge vector. An empty per-update-mode map is allocated for every size
// bucket so that Add can write into it directly.
func newModeAndSizeBasedGauge(gauge *prometheus.GaugeVec) *UpdateModeAndSizeBasedGauge {
	wrapper := new(UpdateModeAndSizeBasedGauge)
	wrapper.gauge = gauge
	for bucket := 0; bucket < len(wrapper.values); bucket++ {
		wrapper.values[bucket] = map[vpa_types.UpdateMode]int{}
	}
	return wrapper
}
// NewControlledPodsCounter returns a wrapper for counting Pods controlled by Updater
func NewControlledPodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(controlledCount)
func NewControlledPodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(controlledCount)
}
// NewEvictablePodsCounter returns a wrapper for counting Pods which are matching eviction criteria
func NewEvictablePodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(evictableCount)
func NewEvictablePodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(evictableCount)
}
// NewVpasWithEvictablePodsCounter returns a wrapper for counting VPA objects with Pods matching eviction criteria
func NewVpasWithEvictablePodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(vpasWithEvictablePodsCount)
func NewVpasWithEvictablePodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(vpasWithEvictablePodsCount)
}
// NewVpasWithEvictedPodsCounter returns a wrapper for counting VPA objects with evicted Pods
func NewVpasWithEvictedPodsCounter() *SizeBasedGauge {
return newSizeBasedGauge(vpasWithEvictedPodsCount)
func NewVpasWithEvictedPodsCounter() *UpdateModeAndSizeBasedGauge {
return newModeAndSizeBasedGauge(vpasWithEvictedPodsCount)
}
// AddEvictedPod increases the counter of pods evicted by Updater, by given VPA size and update mode
func AddEvictedPod(vpaSize int) {
func AddEvictedPod(vpaSize int, mode vpa_types.UpdateMode) {
log2 := metrics.GetVpaSizeLog2(vpaSize)
evictedCount.WithLabelValues(strconv.Itoa(log2)).Inc()
evictedCount.WithLabelValues(strconv.Itoa(log2), string(mode)).Inc()
}
// NewInPlaceUpdatablePodsCounter returns a wrapper for counting Pods which are matching in-place update criteria
@ -203,3 +222,19 @@ func (g *SizeBasedGauge) Observe() {
g.gauge.WithLabelValues(strconv.Itoa(log2)).Set(float64(value))
}
}
// Add accumulates value into the bucket selected by log2 of the VPA size and
// by the VPA update mode. Only the in-memory values are touched here; the
// gauge vector itself is written by Observe.
func (g *UpdateModeAndSizeBasedGauge) Add(vpaSize int, vpaUpdateMode vpa_types.UpdateMode, value int) {
	perMode := g.values[metrics.GetVpaSizeLog2(vpaSize)]
	perMode[vpaUpdateMode] = perMode[vpaUpdateMode] + value
}
// Observe publishes the recorded values to the gauge vector associated with
// the wrapper, one series per (vpa_size_log2, update_mode) pair that was
// recorded through Add.
//
// The vector is reset first. Only pairs present in this wrapper are written
// below, so without the reset a pair that was exported by an earlier Observe
// on the same gauge vector, but is absent from this wrapper, would keep
// reporting its old value indefinitely.
//
// NOTE(review): this assumes the wrapper is the only writer of its gauge
// vector, and accepts that a scrape landing between the reset and the writes
// may briefly see fewer series — confirm both are acceptable.
func (g *UpdateModeAndSizeBasedGauge) Observe() {
	g.gauge.Reset()
	for log2, valueMap := range g.values {
		// The size label is the same for every update mode in this bucket.
		sizeLabel := strconv.Itoa(log2)
		for vpaMode, value := range valueMap {
			g.gauge.WithLabelValues(sizeLabel, string(vpaMode)).Set(float64(value))
		}
	}
}

View File

@ -0,0 +1,297 @@
/*
Copyright 2025 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package updater
import (
"testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
)
// TestAddEvictedPod checks that a single AddEvictedPod call leaves the evicted
// pods counter at 1 for the series labelled with the expected log2 size bucket
// and update mode.
func TestAddEvictedPod(t *testing.T) {
	type scenario struct {
		desc    string
		vpaSize int
		mode    vpa_types.UpdateMode
		log2    string
	}
	scenarios := []scenario{
		{desc: "VPA size 5, mode Auto", vpaSize: 5, mode: vpa_types.UpdateModeAuto, log2: "2"},
		{desc: "VPA size 10, mode Off", vpaSize: 10, mode: vpa_types.UpdateModeOff, log2: "3"},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			// Clear the counter vector when the subtest ends so the next
			// subtest does not see this one's increments.
			t.Cleanup(evictedCount.Reset)

			AddEvictedPod(sc.vpaSize, sc.mode)

			modeLabel := string(sc.mode)
			got := testutil.ToFloat64(evictedCount.WithLabelValues(sc.log2, modeLabel))
			if got == 1 {
				return
			}
			t.Errorf("Unexpected value for evictedCount metric with labels (%s, %s): got %v, want 1", sc.log2, modeLabel, got)
		})
	}
}
// TestAddInPlaceUpdatedPod checks that a single AddInPlaceUpdatedPod call
// leaves the in-place updated pods counter at 1 for the series labelled with
// the expected log2 size bucket.
func TestAddInPlaceUpdatedPod(t *testing.T) {
	type scenario struct {
		desc    string
		vpaSize int
		log2    string
	}
	scenarios := []scenario{
		{desc: "VPA size 10", vpaSize: 10, log2: "3"},
		{desc: "VPA size 1", vpaSize: 1, log2: "0"},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			// Clear the counter vector when the subtest ends so the next
			// subtest does not see this one's increments.
			t.Cleanup(inPlaceUpdatedCount.Reset)

			AddInPlaceUpdatedPod(sc.vpaSize)

			got := testutil.ToFloat64(inPlaceUpdatedCount.WithLabelValues(sc.log2))
			if got == 1 {
				return
			}
			t.Errorf("Unexpected value for InPlaceUpdatedPod metric with labels (%s): got %v, want 1", sc.log2, got)
		})
	}
}
// TestRecordFailedInPlaceUpdate checks that a single RecordFailedInPlaceUpdate
// call leaves the failed in-place update counter at 1 for the series labelled
// with the expected log2 size bucket and failure reason.
func TestRecordFailedInPlaceUpdate(t *testing.T) {
	type scenario struct {
		desc    string
		vpaSize int
		reason  string
		log2    string
	}
	scenarios := []scenario{
		{desc: "VPA size 2, some reason", vpaSize: 2, reason: "some_reason", log2: "1"},
		{desc: "VPA size 20, another reason", vpaSize: 20, reason: "another_reason", log2: "4"},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			// Clear the counter vector when the subtest ends so the next
			// subtest does not see this one's increments.
			t.Cleanup(failedInPlaceUpdateAttempts.Reset)

			RecordFailedInPlaceUpdate(sc.vpaSize, sc.reason)

			got := testutil.ToFloat64(failedInPlaceUpdateAttempts.WithLabelValues(sc.log2, sc.reason))
			if got == 1 {
				return
			}
			t.Errorf("Unexpected value for FailedInPlaceUpdate metric with labels (%s, %s): got %v, want 1", sc.log2, sc.reason, got)
		})
	}
}
// TestUpdateModeAndSizeBasedGauge drives every counter wrapper that is keyed
// by VPA size and update mode: it records a list of additions, calls Observe,
// and then verifies the gauge value of each expected (log2 size, update mode)
// series.
func TestUpdateModeAndSizeBasedGauge(t *testing.T) {
	type record struct {
		vpaSize int
		mode    vpa_types.UpdateMode
		value   int
	}
	type series struct {
		labels []string
		value  float64
	}
	type scenario struct {
		desc            string
		newCounter      func() *UpdateModeAndSizeBasedGauge
		metric          *prometheus.GaugeVec
		metricName      string
		additions       []record
		expectedMetrics []series
	}

	scenarios := []scenario{
		{
			desc:       "ControlledPodsCounter",
			newCounter: NewControlledPodsCounter,
			metric:     controlledCount,
			metricName: "vpa_updater_controlled_pods_total",
			additions: []record{
				{vpaSize: 1, mode: vpa_types.UpdateModeAuto, value: 5},
				{vpaSize: 2, mode: vpa_types.UpdateModeOff, value: 10},
				{vpaSize: 2, mode: vpa_types.UpdateModeAuto, value: 2},
				{vpaSize: 2, mode: vpa_types.UpdateModeAuto, value: 7},
			},
			expectedMetrics: []series{
				{labels: []string{"0", "Auto"}, value: 5},  // log2(1)
				{labels: []string{"1", "Auto"}, value: 9},  // log2(2), 2 + 7
				{labels: []string{"1", "Off"}, value: 10}, // log2(2)
			},
		},
		{
			desc:       "EvictablePodsCounter",
			newCounter: NewEvictablePodsCounter,
			metric:     evictableCount,
			metricName: "vpa_updater_evictable_pods_total",
			additions: []record{
				{vpaSize: 4, mode: vpa_types.UpdateModeAuto, value: 3},
				{vpaSize: 1, mode: vpa_types.UpdateModeRecreate, value: 8},
			},
			expectedMetrics: []series{
				{labels: []string{"2", "Auto"}, value: 3},     // log2(4)
				{labels: []string{"0", "Recreate"}, value: 8}, // log2(1)
			},
		},
		{
			desc:       "VpasWithEvictablePodsCounter",
			newCounter: NewVpasWithEvictablePodsCounter,
			metric:     vpasWithEvictablePodsCount,
			metricName: "vpa_updater_vpas_with_evictable_pods_total",
			additions: []record{
				{vpaSize: 1, mode: vpa_types.UpdateModeOff, value: 1},
				{vpaSize: 2, mode: vpa_types.UpdateModeAuto, value: 1},
			},
			expectedMetrics: []series{
				{labels: []string{"0", "Off"}, value: 1},  // log2(1)
				{labels: []string{"1", "Auto"}, value: 1}, // log2(2)
			},
		},
		{
			desc:       "VpasWithEvictedPodsCounter",
			newCounter: NewVpasWithEvictedPodsCounter,
			metric:     vpasWithEvictedPodsCount,
			metricName: "vpa_updater_vpas_with_evicted_pods_total",
			additions: []record{
				{vpaSize: 1, mode: vpa_types.UpdateModeAuto, value: 2},
				{vpaSize: 1, mode: vpa_types.UpdateModeAuto, value: 3},
			},
			expectedMetrics: []series{
				{labels: []string{"0", "Auto"}, value: 5}, // log2(1), 2 + 3
			},
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			// Clear the gauge vector when the subtest ends so series do not
			// leak into later subtests.
			t.Cleanup(sc.metric.Reset)

			wrapper := sc.newCounter()
			for i := range sc.additions {
				rec := sc.additions[i]
				wrapper.Add(rec.vpaSize, rec.mode, rec.value)
			}
			wrapper.Observe()

			for i := range sc.expectedMetrics {
				want := sc.expectedMetrics[i]
				got := testutil.ToFloat64(sc.metric.WithLabelValues(want.labels...))
				if got == want.value {
					continue
				}
				t.Errorf("Unexpected value for metric %s with labels %v: got %v, want %v", sc.metricName, want.labels, got, want.value)
			}
		})
	}
}
// TestSizeBasedGauge drives every counter wrapper that is keyed by VPA size
// only: it records a list of additions, calls Observe, and then verifies the
// gauge value of each expected log2 size series.
func TestSizeBasedGauge(t *testing.T) {
	type record struct {
		vpaSize int
		value   int
	}
	type series struct {
		labels []string
		value  float64
	}
	type scenario struct {
		desc            string
		newCounter      func() *SizeBasedGauge
		metric          *prometheus.GaugeVec
		metricName      string
		additions       []record
		expectedMetrics []series
	}

	scenarios := []scenario{
		{
			desc:       "InPlaceUpdatablePodsCounter",
			newCounter: NewInPlaceUpdatablePodsCounter,
			metric:     inPlaceUpdatableCount,
			metricName: "vpa_updater_in_place_updatable_pods_total",
			additions: []record{
				{vpaSize: 1, value: 5},
				{vpaSize: 2, value: 10},
			},
			expectedMetrics: []series{
				{labels: []string{"0"}, value: 5},  // log2(1)
				{labels: []string{"1"}, value: 10}, // log2(2)
			},
		},
		{
			desc:       "VpasWithInPlaceUpdatablePodsCounter",
			newCounter: NewVpasWithInPlaceUpdatablePodsCounter,
			metric:     vpasWithInPlaceUpdatablePodsCount,
			metricName: "vpa_updater_vpas_with_in_place_updatable_pods_total",
			additions: []record{
				{vpaSize: 10, value: 1},
				{vpaSize: 20, value: 1},
			},
			expectedMetrics: []series{
				{labels: []string{"3"}, value: 1}, // log2(10)
				{labels: []string{"4"}, value: 1}, // log2(20)
			},
		},
		{
			desc:       "VpasWithInPlaceUpdatedPodsCounter",
			newCounter: NewVpasWithInPlaceUpdatedPodsCounter,
			metric:     vpasWithInPlaceUpdatedPodsCount,
			metricName: "vpa_updater_vpas_with_in_place_updated_pods_total",
			additions: []record{
				{vpaSize: 2, value: 4},
				{vpaSize: 4, value: 5},
			},
			expectedMetrics: []series{
				{labels: []string{"1"}, value: 4}, // log2(2)
				{labels: []string{"2"}, value: 5}, // log2(4)
			},
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			// Clear the gauge vector when the subtest ends so series do not
			// leak into later subtests.
			t.Cleanup(sc.metric.Reset)

			wrapper := sc.newCounter()
			for i := range sc.additions {
				rec := sc.additions[i]
				wrapper.Add(rec.vpaSize, rec.value)
			}
			wrapper.Observe()

			for i := range sc.expectedMetrics {
				want := sc.expectedMetrics[i]
				got := testutil.ToFloat64(sc.metric.WithLabelValues(want.labels...))
				if got == want.value {
					continue
				}
				t.Errorf("Unexpected value for metric %s with labels %v: got %v, want %v", sc.metricName, want.labels, got, want.value)
			}
		})
	}
}