Merge pull request #2028 from bskiba/quality-cont
VPA: Record more quality metrics
This commit is contained in:
commit
d9a2dcf5aa
|
|
@ -73,6 +73,9 @@ type ContainerStateAggregator interface {
|
|||
// NeedsRecommendation returns true if this aggregator should have
|
||||
// a recommendation calculated.
|
||||
NeedsRecommendation() bool
|
||||
// GetUpdateMode returns the update mode of VPA controlling this aggregator,
|
||||
// nil if aggregator is not autoscaled.
|
||||
GetUpdateMode() *vpa_types.UpdateMode
|
||||
}
|
||||
|
||||
// AggregateContainerState holds input signals aggregated from a set of containers.
|
||||
|
|
@ -93,6 +96,7 @@ type AggregateContainerState struct {
|
|||
CreationTime time.Time
|
||||
LastRecommendation corev1.ResourceList
|
||||
IsUnderVPA bool
|
||||
UpdateMode *vpa_types.UpdateMode
|
||||
}
|
||||
|
||||
// GetLastRecommendation returns last recorded recommendation.
|
||||
|
|
@ -105,11 +109,18 @@ func (a *AggregateContainerState) NeedsRecommendation() bool {
|
|||
return a.IsUnderVPA
|
||||
}
|
||||
|
||||
// GetUpdateMode returns the update mode of VPA controlling this aggregator,
|
||||
// nil if aggregator is not autoscaled.
|
||||
func (a *AggregateContainerState) GetUpdateMode() *vpa_types.UpdateMode {
|
||||
return a.UpdateMode
|
||||
}
|
||||
|
||||
// MarkNotAutoscaled registers that this container state is not contorled by
|
||||
// a VPA object.
|
||||
func (a *AggregateContainerState) MarkNotAutoscaled() {
|
||||
a.IsUnderVPA = false
|
||||
a.LastRecommendation = nil
|
||||
a.UpdateMode = nil
|
||||
}
|
||||
|
||||
// MergeContainerState merges two AggregateContainerStates.
|
||||
|
|
@ -284,3 +295,9 @@ func (p *ContainerStateAggregatorProxy) NeedsRecommendation() bool {
|
|||
aggregator := p.cluster.findOrCreateAggregateContainerState(p.containerID)
|
||||
return aggregator.NeedsRecommendation()
|
||||
}
|
||||
|
||||
// GetUpdateMode returns update mode of VPA controlling the aggregator.
|
||||
func (p *ContainerStateAggregatorProxy) GetUpdateMode() *vpa_types.UpdateMode {
|
||||
aggregator := p.cluster.findOrCreateAggregateContainerState(p.containerID)
|
||||
return aggregator.GetUpdateMode()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,39 +87,44 @@ func (container *ContainerState) addCPUSample(sample *ContainerUsageSample) bool
|
|||
if !sample.isValid(ResourceCPU) || !sample.MeasureStart.After(container.LastCPUSampleStart) {
|
||||
return false // Discard invalid, duplicate or out-of-order samples.
|
||||
}
|
||||
container.observeRecommendationUsageDiff(sample.Usage, false, corev1.ResourceCPU)
|
||||
container.observeQualityMetrics(sample.Usage, false, corev1.ResourceCPU)
|
||||
container.aggregator.AddSample(sample)
|
||||
container.LastCPUSampleStart = sample.MeasureStart
|
||||
return true
|
||||
}
|
||||
|
||||
func (container *ContainerState) observeRecommendationUsageDiff(usage ResourceAmount, isOOM bool, resource corev1.ResourceName) {
|
||||
func (container *ContainerState) observeQualityMetrics(usage ResourceAmount, isOOM bool, resource corev1.ResourceName) {
|
||||
if !container.aggregator.NeedsRecommendation() {
|
||||
return
|
||||
}
|
||||
updateMode := container.aggregator.GetUpdateMode()
|
||||
var usageValue float64
|
||||
switch resource {
|
||||
case corev1.ResourceCPU:
|
||||
usageValue = CoresFromCPUAmount(usage)
|
||||
case corev1.ResourceMemory:
|
||||
usageValue = BytesFromMemoryAmount(usage)
|
||||
}
|
||||
if container.aggregator.GetLastRecommendation() == nil {
|
||||
metrics_quality.ObserveMissingRecommendation(isOOM, resource)
|
||||
metrics_quality.ObserveQualityMetricsRecommendationMissing(usageValue, isOOM, resource, updateMode)
|
||||
return
|
||||
}
|
||||
recommendation := container.aggregator.GetLastRecommendation()[resource]
|
||||
var recommendationValue float64
|
||||
var usageValue float64
|
||||
if recommendation.IsZero() {
|
||||
metrics_quality.ObserveMissingRecommendation(isOOM, resource)
|
||||
metrics_quality.ObserveQualityMetricsRecommendationMissing(usageValue, isOOM, resource, updateMode)
|
||||
return
|
||||
}
|
||||
var recommendationValue float64
|
||||
switch resource {
|
||||
case corev1.ResourceCPU:
|
||||
recommendationValue = float64(recommendation.MilliValue()) / 1000.0
|
||||
usageValue = CoresFromCPUAmount(usage)
|
||||
case corev1.ResourceMemory:
|
||||
recommendationValue = float64(recommendation.Value())
|
||||
usageValue = BytesFromMemoryAmount(usage)
|
||||
default:
|
||||
klog.Warningf("Unknown resource: %v", resource)
|
||||
return
|
||||
}
|
||||
metrics_quality.ObserveUsageRecommendationRelativeDiff(usageValue, recommendationValue, isOOM, resource)
|
||||
metrics_quality.ObserveQualityMetrics(usageValue, recommendationValue, isOOM, resource, updateMode)
|
||||
}
|
||||
|
||||
// GetMaxMemoryPeak returns maximum memory usage in the sample, possibly estimated from OOM
|
||||
|
|
@ -163,7 +168,7 @@ func (container *ContainerState) addMemorySample(sample *ContainerUsageSample, i
|
|||
container.oomPeak = 0
|
||||
addNewPeak = true
|
||||
}
|
||||
container.observeRecommendationUsageDiff(sample.Usage, isOOM, corev1.ResourceMemory)
|
||||
container.observeQualityMetrics(sample.Usage, isOOM, corev1.ResourceMemory)
|
||||
if addNewPeak {
|
||||
newPeak := ContainerUsageSample{
|
||||
MeasureStart: container.WindowEnd,
|
||||
|
|
|
|||
|
|
@ -128,6 +128,7 @@ func (vpa *Vpa) UseAggregationIfMatching(aggregationKey AggregateStateKey, aggre
|
|||
if vpa.matchesAggregation(aggregationKey) {
|
||||
vpa.aggregateContainerStates[aggregationKey] = aggregation
|
||||
aggregation.IsUnderVPA = true
|
||||
aggregation.UpdateMode = vpa.UpdateMode
|
||||
return true
|
||||
}
|
||||
return false
|
||||
|
|
|
|||
|
|
@ -219,52 +219,64 @@ func TestUpdateRecommendation(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestUseAggregationIfMatching(t *testing.T) {
|
||||
modeOff := vpa_types.UpdateModeOff
|
||||
modeAuto := vpa_types.UpdateModeAuto
|
||||
cases := []struct {
|
||||
name string
|
||||
aggregations []string
|
||||
vpaSelector string
|
||||
updateMode *vpa_types.UpdateMode
|
||||
container string
|
||||
containerLabels map[string]string
|
||||
isUnderVPA bool
|
||||
expectedAggregations []string
|
||||
expectedUpdateMode *vpa_types.UpdateMode
|
||||
expectedNeedsRecommendation bool
|
||||
}{
|
||||
{
|
||||
name: "First matching aggregation",
|
||||
aggregations: []string{},
|
||||
vpaSelector: testSelectorStr,
|
||||
updateMode: &modeOff,
|
||||
container: "test-container",
|
||||
containerLabels: testLabels,
|
||||
isUnderVPA: false,
|
||||
expectedAggregations: []string{"test-container"},
|
||||
expectedNeedsRecommendation: true,
|
||||
expectedUpdateMode: &modeOff,
|
||||
}, {
|
||||
name: "New matching aggregation",
|
||||
aggregations: []string{"test-container"},
|
||||
vpaSelector: testSelectorStr,
|
||||
updateMode: &modeAuto,
|
||||
container: "second-container",
|
||||
containerLabels: testLabels,
|
||||
isUnderVPA: false,
|
||||
expectedAggregations: []string{"test-container", "second-container"},
|
||||
expectedNeedsRecommendation: true,
|
||||
expectedUpdateMode: &modeAuto,
|
||||
}, {
|
||||
name: "Existing matching aggregation",
|
||||
aggregations: []string{"test-container"},
|
||||
vpaSelector: testSelectorStr,
|
||||
updateMode: &modeOff,
|
||||
container: "test-container",
|
||||
containerLabels: testLabels,
|
||||
isUnderVPA: true,
|
||||
expectedAggregations: []string{"test-container"},
|
||||
expectedNeedsRecommendation: true,
|
||||
expectedUpdateMode: &modeOff,
|
||||
}, {
|
||||
name: "Aggregation not matching",
|
||||
aggregations: []string{"test-container"},
|
||||
vpaSelector: testSelectorStr,
|
||||
updateMode: &modeAuto,
|
||||
container: "second-container",
|
||||
containerLabels: map[string]string{"different": "labels"},
|
||||
isUnderVPA: false,
|
||||
expectedAggregations: []string{"test-container"},
|
||||
expectedNeedsRecommendation: false,
|
||||
expectedUpdateMode: nil,
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
|
|
@ -276,13 +288,7 @@ func TestUseAggregationIfMatching(t *testing.T) {
|
|||
t.FailNow()
|
||||
}
|
||||
vpa := NewVpa(VpaID{Namespace: namespace, VpaName: "my-favourite-vpa"}, selector, anyTime)
|
||||
for _, container := range tc.aggregations {
|
||||
vpa.aggregateContainerStates[mockAggregateStateKey{
|
||||
namespace: namespace,
|
||||
containerName: container,
|
||||
labels: labels.Set(testLabels).String(),
|
||||
}] = &AggregateContainerState{}
|
||||
}
|
||||
vpa.UpdateMode = tc.updateMode
|
||||
key := mockAggregateStateKey{
|
||||
namespace: namespace,
|
||||
containerName: tc.container,
|
||||
|
|
@ -291,6 +297,20 @@ func TestUseAggregationIfMatching(t *testing.T) {
|
|||
aggregation := &AggregateContainerState{
|
||||
IsUnderVPA: tc.isUnderVPA,
|
||||
}
|
||||
for _, container := range tc.aggregations {
|
||||
containerKey := mockAggregateStateKey{
|
||||
namespace: namespace,
|
||||
containerName: container,
|
||||
labels: labels.Set(testLabels).String(),
|
||||
}
|
||||
if container == tc.container {
|
||||
aggregation.UpdateMode = vpa.UpdateMode
|
||||
vpa.aggregateContainerStates[containerKey] = aggregation
|
||||
} else {
|
||||
vpa.aggregateContainerStates[key] = &AggregateContainerState{UpdateMode: vpa.UpdateMode}
|
||||
}
|
||||
}
|
||||
|
||||
vpa.UseAggregationIfMatching(key, aggregation)
|
||||
assert.Equal(t, len(tc.expectedAggregations), len(vpa.aggregateContainerStates), "AggregateContainerStates has unexpected size")
|
||||
for _, container := range tc.expectedAggregations {
|
||||
|
|
@ -304,6 +324,13 @@ func TestUseAggregationIfMatching(t *testing.T) {
|
|||
assert.True(t, found, "Container %s not found in aggregateContainerStates", container)
|
||||
}
|
||||
assert.Equal(t, tc.expectedNeedsRecommendation, aggregation.NeedsRecommendation())
|
||||
if tc.expectedUpdateMode == nil {
|
||||
assert.Nil(t, aggregation.GetUpdateMode())
|
||||
} else {
|
||||
if assert.NotNil(t, aggregation.GetUpdateMode()) {
|
||||
assert.Equal(t, *tc.expectedUpdateMode, *aggregation.GetUpdateMode())
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,10 +18,13 @@ limitations under the License.
|
|||
package quality
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1beta2"
|
||||
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/metrics"
|
||||
"k8s.io/klog"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
|
@ -30,22 +33,77 @@ const (
|
|||
metricsNamespace = metrics.TopMetricsNamespace + "quality"
|
||||
)
|
||||
|
||||
var (
|
||||
// Buckets between 0.01 and 655.36 cores
|
||||
cpuBuckets = prometheus.ExponentialBuckets(0.01, 2., 17)
|
||||
// Buckets between 1MB and 65.5 GB
|
||||
memoryBuckets = prometheus.ExponentialBuckets(1e6, 2., 17)
|
||||
)
|
||||
|
||||
var (
|
||||
usageRecommendationRelativeDiff = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "usage_recommendation_relative_diffs",
|
||||
Help: "Diffs between usage and recommendation, normalized by recommendation value",
|
||||
Help: "Diffs between recommendation and usage, normalized by recommendation value",
|
||||
Buckets: []float64{-1., -.75, -.5, -.25, -.1, -.05, -0.025, -.01, -.005, -0.0025, -.001, 0.,
|
||||
.001, .0025, .005, .01, .025, .05, .1, .25, .5, .75, 1., 2.5, 5., 10., 25., 50., 100.},
|
||||
}, []string{"resource", "is_oom"},
|
||||
}, []string{"update_mode", "resource", "is_oom"},
|
||||
)
|
||||
usageMissingRecommendationCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "usage_sample_missing_recommendation_count",
|
||||
Help: "Count of usage samples when a recommendation should be present but is missing.",
|
||||
}, []string{"resource", "is_oom"},
|
||||
Help: "Count of usage samples when a recommendation should be present but is missing",
|
||||
}, []string{"update_mode", "resource", "is_oom"},
|
||||
)
|
||||
cpuRecommendationOverUsageDiff = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "cpu_recommendation_over_usage_diffs_cores",
|
||||
Help: "Absolute diffs between recommendation and usage for CPU when recommendation > usage",
|
||||
Buckets: cpuBuckets,
|
||||
}, []string{"update_mode", "recommendation_missing"},
|
||||
)
|
||||
memoryRecommendationOverUsageDiff = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "mem_recommendation_over_usage_diffs_bytes",
|
||||
Help: "Absolute diffs between recommendation and usage for memory when recommendation > usage",
|
||||
Buckets: memoryBuckets,
|
||||
}, []string{"update_mode", "recommendation_missing", "is_oom"},
|
||||
)
|
||||
cpuRecommendationLowerOrEqualUsageDiff = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "cpu_recommendation_lower_equal_usage_diffs_cores",
|
||||
Help: "Absolute diffs between recommendation and usage for CPU when recommendation <= usage",
|
||||
Buckets: cpuBuckets,
|
||||
}, []string{"update_mode", "recommendation_missing"},
|
||||
)
|
||||
memoryRecommendationLowerOrEqualUsageDiff = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "mem_recommendation_lower_equal_usage_diffs_bytes",
|
||||
Help: "Absolute diffs between recommendation and usage for memory when recommendation <= usage",
|
||||
Buckets: memoryBuckets,
|
||||
}, []string{"update_mode", "recommendation_missing", "is_oom"},
|
||||
)
|
||||
cpuRecommendations = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "cpu_recommendations_cores",
|
||||
Help: "CPU recommendation values as observed on recorded usage sample",
|
||||
Buckets: cpuBuckets,
|
||||
}, []string{"update_mode"},
|
||||
)
|
||||
memoryRecommendations = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "mem_recommendations_bytes",
|
||||
Help: "Memory recommendation values as observed on recorded usage sample",
|
||||
Buckets: memoryBuckets,
|
||||
}, []string{"update_mode", "is_oom"},
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -53,17 +111,82 @@ var (
|
|||
func Register() {
|
||||
prometheus.MustRegister(usageRecommendationRelativeDiff)
|
||||
prometheus.MustRegister(usageMissingRecommendationCounter)
|
||||
prometheus.MustRegister(cpuRecommendationOverUsageDiff)
|
||||
prometheus.MustRegister(memoryRecommendationOverUsageDiff)
|
||||
prometheus.MustRegister(cpuRecommendationLowerOrEqualUsageDiff)
|
||||
prometheus.MustRegister(memoryRecommendationLowerOrEqualUsageDiff)
|
||||
prometheus.MustRegister(cpuRecommendations)
|
||||
prometheus.MustRegister(memoryRecommendations)
|
||||
}
|
||||
|
||||
// ObserveUsageRecommendationRelativeDiff records relative diff between usage and
|
||||
// observeUsageRecommendationRelativeDiff records relative diff between usage and
|
||||
// recommendation if recommendation has a positive value.
|
||||
func ObserveUsageRecommendationRelativeDiff(usage, recommendation float64, isOOM bool, resource corev1.ResourceName) {
|
||||
func observeUsageRecommendationRelativeDiff(usage, recommendation float64, isOOM bool, resource corev1.ResourceName, updateMode *vpa_types.UpdateMode) {
|
||||
if recommendation > 0 {
|
||||
usageRecommendationRelativeDiff.WithLabelValues(string(resource), strconv.FormatBool(isOOM)).Observe((usage - recommendation) / recommendation)
|
||||
usageRecommendationRelativeDiff.WithLabelValues(updateModeToString(updateMode), string(resource), strconv.FormatBool(isOOM)).Observe((usage - recommendation) / recommendation)
|
||||
}
|
||||
}
|
||||
|
||||
// ObserveMissingRecommendation counts usage samples with missing recommendations.
|
||||
func ObserveMissingRecommendation(isOOM bool, resource corev1.ResourceName) {
|
||||
usageMissingRecommendationCounter.WithLabelValues(string(resource), strconv.FormatBool(isOOM)).Inc()
|
||||
// observeMissingRecommendation counts usage samples with missing recommendations.
|
||||
func observeMissingRecommendation(isOOM bool, resource corev1.ResourceName, updateMode *vpa_types.UpdateMode) {
|
||||
usageMissingRecommendationCounter.WithLabelValues(updateModeToString(updateMode), string(resource), strconv.FormatBool(isOOM)).Inc()
|
||||
}
|
||||
|
||||
// observeUsageRecommendationDiff records absolute diff between usage and
|
||||
// recommendation.
|
||||
func observeUsageRecommendationDiff(usage, recommendation float64, isRecommendationMissing, isOOM bool, resource corev1.ResourceName, updateMode *vpa_types.UpdateMode) {
|
||||
recommendationOverUsage := recommendation > usage
|
||||
diff := math.Abs(usage - recommendation)
|
||||
switch resource {
|
||||
case corev1.ResourceCPU:
|
||||
if recommendationOverUsage {
|
||||
cpuRecommendationOverUsageDiff.WithLabelValues(updateModeToString(updateMode),
|
||||
strconv.FormatBool(isRecommendationMissing)).Observe(diff)
|
||||
} else {
|
||||
cpuRecommendationLowerOrEqualUsageDiff.WithLabelValues(updateModeToString(updateMode),
|
||||
strconv.FormatBool(isRecommendationMissing)).Observe(diff)
|
||||
}
|
||||
case corev1.ResourceMemory:
|
||||
if recommendationOverUsage {
|
||||
memoryRecommendationOverUsageDiff.WithLabelValues(updateModeToString(updateMode),
|
||||
strconv.FormatBool(isRecommendationMissing), strconv.FormatBool(isOOM)).Observe(diff)
|
||||
} else {
|
||||
memoryRecommendationLowerOrEqualUsageDiff.WithLabelValues(updateModeToString(updateMode),
|
||||
strconv.FormatBool(isRecommendationMissing), strconv.FormatBool(isOOM)).Observe(diff)
|
||||
}
|
||||
default:
|
||||
klog.Warningf("Unknown resource: %v", resource)
|
||||
}
|
||||
}
|
||||
|
||||
// observeRecommendation records the value of recommendation.
|
||||
func observeRecommendation(recommendation float64, isOOM bool, resource corev1.ResourceName, updateMode *vpa_types.UpdateMode) {
|
||||
switch resource {
|
||||
case corev1.ResourceCPU:
|
||||
cpuRecommendations.WithLabelValues(updateModeToString(updateMode)).Observe(recommendation)
|
||||
case corev1.ResourceMemory:
|
||||
memoryRecommendations.WithLabelValues(updateModeToString(updateMode), strconv.FormatBool(isOOM)).Observe(recommendation)
|
||||
default:
|
||||
klog.Warningf("Unknown resource: %v", resource)
|
||||
}
|
||||
}
|
||||
|
||||
// ObserveQualityMetrics records all quality metrics that we can derive from recommendation and usage.
|
||||
func ObserveQualityMetrics(usage, recommendation float64, isOOM bool, resource corev1.ResourceName, updateMode *vpa_types.UpdateMode) {
|
||||
observeRecommendation(recommendation, isOOM, resource, updateMode)
|
||||
observeUsageRecommendationDiff(usage, recommendation, false, isOOM, resource, updateMode)
|
||||
observeUsageRecommendationRelativeDiff(usage, recommendation, isOOM, resource, updateMode)
|
||||
}
|
||||
|
||||
// ObserveQualityMetricsRecommendationMissing records all quality metrics that we can derive from usage when recommendation is missing.
|
||||
func ObserveQualityMetricsRecommendationMissing(usage float64, isOOM bool, resource corev1.ResourceName, updateMode *vpa_types.UpdateMode) {
|
||||
observeMissingRecommendation(isOOM, resource, updateMode)
|
||||
observeUsageRecommendationDiff(usage, 0, true, isOOM, resource, updateMode)
|
||||
}
|
||||
|
||||
func updateModeToString(updateMode *vpa_types.UpdateMode) string {
|
||||
if updateMode == nil {
|
||||
return ""
|
||||
}
|
||||
return string(*updateMode)
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue