Merge pull request #132893 from serathius/resource-size

Add apiserver_resource_size_estimate_bytes metric

Kubernetes-commit: 81986595c87365f2155b20af1d361bf2fb96b544
This commit is contained in:
Kubernetes Publisher 2025-07-16 15:28:23 -07:00
commit cb7f41362b
7 changed files with 104 additions and 49 deletions

2
go.mod
View File

@ -49,7 +49,7 @@ require (
gopkg.in/go-jose/go-jose.v2 v2.6.3
gopkg.in/natefinch/lumberjack.v2 v2.2.1
k8s.io/api v0.0.0-20250715090528-7da28ad7db85
k8s.io/apimachinery v0.0.0-20250715090235-1ebcba2516a6
k8s.io/apimachinery v0.0.0-20250716210236-a75d3d8a0f22
k8s.io/client-go v0.0.0-20250715090929-f78427e36774
k8s.io/component-base v0.0.0-20250715211315-7e84d47542a1
k8s.io/klog/v2 v2.130.1

4
go.sum
View File

@ -298,8 +298,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.0.0-20250715090528-7da28ad7db85 h1:QfiiAaLAQKj+yvMS1ySLpj3UWzxJixBEaWvKNtzwnQM=
k8s.io/api v0.0.0-20250715090528-7da28ad7db85/go.mod h1:MCwhlGL+lP5brf6CuU20fWxaLh/8tUSlu4VM1cOD8Lo=
k8s.io/apimachinery v0.0.0-20250715090235-1ebcba2516a6 h1:lH8NMkqxmCWN1CvOiqbfphUcl+EAk95z+3Le9bfEbJ4=
k8s.io/apimachinery v0.0.0-20250715090235-1ebcba2516a6/go.mod h1:TP8uyOuDEOnzGpLOdffo8hPnIjVDljZCxCM/fruV+5M=
k8s.io/apimachinery v0.0.0-20250716210236-a75d3d8a0f22 h1:ffG+yKzD6c5mG3PMuQU3j+ifyOUViYFMNRrijfsFVSM=
k8s.io/apimachinery v0.0.0-20250716210236-a75d3d8a0f22/go.mod h1:TP8uyOuDEOnzGpLOdffo8hPnIjVDljZCxCM/fruV+5M=
k8s.io/client-go v0.0.0-20250715090929-f78427e36774 h1:OJXhumReMNIzlpFEEQvl89u+u7KmQ6fa4I3TpZQYjIg=
k8s.io/client-go v0.0.0-20250715090929-f78427e36774/go.mod h1:y02d1W5RQ3IDA7qs1unUQEkERwkgLrd7fuDANdUN31E=
k8s.io/component-base v0.0.0-20250715211315-7e84d47542a1 h1:K9Ew2I/QQt4qsJkMKrumXF94mlbf800mRkc1KJPCADs=

View File

@ -1668,18 +1668,18 @@ func (e *Store) startObservingCount(period time.Duration, objectCountTracker flo
stopCh := make(chan struct{})
go wait.JitterUntil(func() {
stats, err := e.Storage.Stats(ctx)
metrics.UpdateStoreStats(e.DefaultQualifiedResource, stats, err)
if err != nil {
klog.V(5).InfoS("Failed to update storage count metric", "err", err)
stats.ObjectCount = -1
return
}
metrics.UpdateObjectCount(e.DefaultQualifiedResource, stats.ObjectCount)
if objectCountTracker != nil {
objectCountTracker.Set(resourceName, stats)
}
}, period, resourceCountPollPeriodJitter, true, stopCh)
return func() {
metrics.DeleteObjectCount(e.DefaultQualifiedResource)
metrics.DeleteStoreStats(e.DefaultQualifiedResource)
close(stopCh)
}
}

View File

@ -23,6 +23,9 @@ import (
"time"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/storage"
utilfeature "k8s.io/apiserver/pkg/util/feature"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
@ -75,6 +78,14 @@ var (
},
[]string{"resource"},
)
resourceSizeEstimate = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "apiserver_resource_size_estimate_bytes",
Help: "Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"group", "resource"},
)
dbTotalSize = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Subsystem: "apiserver",
@ -166,6 +177,7 @@ func Register() {
legacyregistry.MustRegister(etcdRequestCounts)
legacyregistry.MustRegister(etcdRequestErrorCounts)
legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(resourceSizeEstimate)
legacyregistry.MustRegister(dbTotalSize)
legacyregistry.CustomMustRegister(storageMonitor)
legacyregistry.MustRegister(etcdEventsReceivedCounts)
@ -179,14 +191,31 @@ func Register() {
})
}
// UpdateObjectCount sets the apiserver_storage_object_counts metric.
func UpdateObjectCount(groupResource schema.GroupResource, count int64) {
objectCounts.WithLabelValues(groupResource.String()).Set(float64(count))
// UpdateStoreStats publishes the per-resource storage gauges derived from stats.
//
// The apiserver_storage_objects gauge is always written: it receives
// stats.ObjectCount on success and -1 when err is non-nil.
//
// The apiserver_resource_size_estimate_bytes gauge is only written while the
// SizeBasedListCostEstimate feature gate is enabled. It receives
// EstimatedAverageObjectSizeBytes * ObjectCount on success, and -1 either when
// err is non-nil or when objects exist but the average size is reported as 0.
func UpdateStoreStats(groupResource schema.GroupResource, stats storage.Stats, err error) {
	// Both gauges default to the -1 sentinel and are only overwritten
	// when the stats fetch succeeded.
	countValue, sizeValue := float64(-1), float64(-1)
	if err == nil {
		countValue = float64(stats.ObjectCount)
		// A zero average alongside a non-zero count keeps the -1 sentinel
		// instead of reporting a total size of 0 bytes.
		sizeUnknown := stats.ObjectCount > 0 && stats.EstimatedAverageObjectSizeBytes == 0
		if !sizeUnknown {
			sizeValue = float64(stats.EstimatedAverageObjectSizeBytes * stats.ObjectCount)
		}
	}

	objectCounts.WithLabelValues(groupResource.String()).Set(countValue)

	if !utilfeature.DefaultFeatureGate.Enabled(features.SizeBasedListCostEstimate) {
		return
	}
	resourceSizeEstimate.WithLabelValues(groupResource.Group, groupResource.Resource).Set(sizeValue)
}
// DeleteObjectCount delete the apiserver_storage_object_counts metric.
func DeleteObjectCount(groupResource schema.GroupResource) {
// DeleteStoreStats removes the storage gauge series recorded for groupResource.
//
// The apiserver_storage_objects series is always removed. The
// apiserver_resource_size_estimate_bytes series is removed only while the
// SizeBasedListCostEstimate feature gate is enabled.
func DeleteStoreStats(groupResource schema.GroupResource) {
	objectCounts.DeleteLabelValues(groupResource.String())

	if !utilfeature.DefaultFeatureGate.Enabled(features.SizeBasedListCostEstimate) {
		return
	}
	resourceSizeEstimate.DeleteLabelValues(groupResource.Group, groupResource.Resource)
}
// RecordEtcdRequest updates and sets the etcd_request_duration_seconds,

View File

@ -24,6 +24,7 @@ import (
"time"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apiserver/pkg/storage"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/testutil"
)
@ -226,33 +227,64 @@ func TestStorageSizeCollector(t *testing.T) {
}
func TestUpdateObjectCount(t *testing.T) {
func TestUpdateStoreStats(t *testing.T) {
registry := metrics.NewKubeRegistry()
registry.Register(objectCounts)
testedMetrics := "apiserver_storage_objects"
registry.MustRegister(resourceSizeEstimate)
testCases := []struct {
desc string
resource schema.GroupResource
count int64
stats storage.Stats
err error
want string
}{
{
desc: "successful fetch",
desc: "successful object count",
resource: schema.GroupResource{Group: "foo", Resource: "bar"},
count: 10,
want: `# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
stats: storage.Stats{ObjectCount: 10},
want: `# HELP apiserver_resource_size_estimate_bytes [ALPHA] Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.
# TYPE apiserver_resource_size_estimate_bytes gauge
apiserver_resource_size_estimate_bytes{group="foo",resource="bar"} -1
# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
apiserver_storage_objects{resource="bar.foo"} 10
`,
},
{
desc: "successful object count and size",
resource: schema.GroupResource{Group: "foo", Resource: "bar"},
stats: storage.Stats{ObjectCount: 10, EstimatedAverageObjectSizeBytes: 10},
want: `# HELP apiserver_resource_size_estimate_bytes [ALPHA] Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.
# TYPE apiserver_resource_size_estimate_bytes gauge
apiserver_resource_size_estimate_bytes{group="foo",resource="bar"} 100
# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
apiserver_storage_objects{resource="bar.foo"} 10
`,
},
{
desc: "empty object count",
resource: schema.GroupResource{Group: "foo", Resource: "bar"},
stats: storage.Stats{ObjectCount: 0, EstimatedAverageObjectSizeBytes: 0},
want: `# HELP apiserver_resource_size_estimate_bytes [ALPHA] Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.
# TYPE apiserver_resource_size_estimate_bytes gauge
apiserver_resource_size_estimate_bytes{group="foo",resource="bar"} 0
# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
apiserver_storage_objects{resource="bar.foo"} 0
`,
},
{
desc: "failed fetch",
resource: schema.GroupResource{Group: "foo", Resource: "bar"},
count: -1,
err: errors.New("dummy"),
want: `# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
apiserver_storage_objects{resource="bar.foo"} -1
# HELP apiserver_resource_size_estimate_bytes [ALPHA] Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.
# TYPE apiserver_resource_size_estimate_bytes gauge
apiserver_resource_size_estimate_bytes{group="foo",resource="bar"} -1
`,
},
}
@ -260,46 +292,53 @@ apiserver_storage_objects{resource="bar.foo"} -1
for _, test := range testCases {
t.Run(test.desc, func(t *testing.T) {
defer registry.Reset()
UpdateObjectCount(test.resource, test.count)
if err := testutil.GatherAndCompare(registry, strings.NewReader(test.want), testedMetrics); err != nil {
UpdateStoreStats(test.resource, test.stats, test.err)
if err := testutil.GatherAndCompare(registry, strings.NewReader(test.want), "apiserver_storage_objects", "apiserver_resource_size_estimate_bytes"); err != nil {
t.Fatal(err)
}
})
}
}
func TestDeleteObjectCount(t *testing.T) {
func TestDeleteStoreStats(t *testing.T) {
registry := metrics.NewKubeRegistry()
registry.MustRegister(objectCounts)
testedMetrics := "apiserver_storage_objects"
registry.MustRegister(resourceSizeEstimate)
UpdateObjectCount(schema.GroupResource{Group: "foo1", Resource: "bar1"}, int64(10))
UpdateObjectCount(schema.GroupResource{Group: "foo2", Resource: "bar2"}, int64(20))
UpdateStoreStats(schema.GroupResource{Group: "foo1", Resource: "bar1"}, storage.Stats{ObjectCount: 10}, nil)
UpdateStoreStats(schema.GroupResource{Group: "foo2", Resource: "bar2"}, storage.Stats{ObjectCount: 20, EstimatedAverageObjectSizeBytes: 10}, nil)
expectedMetrics := `# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
expectedMetrics := `# HELP apiserver_resource_size_estimate_bytes [ALPHA] Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.
# TYPE apiserver_resource_size_estimate_bytes gauge
apiserver_resource_size_estimate_bytes{group="foo1",resource="bar1"} -1
apiserver_resource_size_estimate_bytes{group="foo2",resource="bar2"} 200
# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
apiserver_storage_objects{resource="bar1.foo1"} 10
apiserver_storage_objects{resource="bar2.foo2"} 20
`
if err := testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), testedMetrics); err != nil {
if err := testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), "apiserver_storage_objects", "apiserver_resource_size_estimate_bytes"); err != nil {
t.Fatal(err)
}
DeleteObjectCount(schema.GroupResource{Group: "foo1", Resource: "bar1"})
DeleteStoreStats(schema.GroupResource{Group: "foo1", Resource: "bar1"})
expectedMetrics = `# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
expectedMetrics = `# HELP apiserver_resource_size_estimate_bytes [ALPHA] Estimated size of stored objects in database. Estimate is based on sum of last observed sizes of serialized objects. In case of a fetching error, the value will be -1.
# TYPE apiserver_resource_size_estimate_bytes gauge
apiserver_resource_size_estimate_bytes{group="foo2",resource="bar2"} 200
# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
apiserver_storage_objects{resource="bar2.foo2"} 20
`
if err := testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), testedMetrics); err != nil {
if err := testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), "apiserver_storage_objects", "apiserver_resource_size_estimate_bytes"); err != nil {
t.Fatal(err)
}
DeleteObjectCount(schema.GroupResource{Group: "foo2", Resource: "bar2"})
DeleteStoreStats(schema.GroupResource{Group: "foo2", Resource: "bar2"})
expectedMetrics = `# HELP apiserver_storage_objects [STABLE] Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.
# TYPE apiserver_storage_objects gauge
`
if err := testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), testedMetrics); err != nil {
if err := testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), "apiserver_storage_objects", "apiserver_resource_size_estimate_bytes"); err != nil {
t.Fatal(err)
}
}

View File

@ -97,17 +97,6 @@ type objectCountTracker struct {
}
func (t *objectCountTracker) Set(groupResource string, stats storage.Stats) {
if stats.ObjectCount <= -1 {
// a value of -1 indicates that the 'Count' call failed to contact
// the storage layer, in most cases this error can be transient.
// we will continue to work with the count that is in the cache
// up to a certain threshold defined by staleTolerationThreshold.
// in case this becomes a non transient error then the count for
// the given resource will eventually be removed from
// the cache by the pruner.
return
}
now := t.clock.Now()
// lock for writing

View File

@ -30,18 +30,14 @@ func TestStorageObjectCountTracker(t *testing.T) {
tests := []struct {
name string
lastUpdated time.Duration
skipSetting bool
count int64
errExpected error
countExpected int64
}{
{
name: "object count not tracked for given resource",
count: -2,
errExpected: ObjectCountNotFoundErr,
},
{
name: "transient failure",
count: -1,
skipSetting: true,
errExpected: ObjectCountNotFoundErr,
},
{
@ -76,7 +72,9 @@ func TestStorageObjectCountTracker(t *testing.T) {
key := "foo.bar.resource"
now := time.Now()
fakeClock.SetTime(now.Add(-test.lastUpdated))
tracker.Set(key, storage.Stats{ObjectCount: test.count})
if !test.skipSetting {
tracker.Set(key, storage.Stats{ObjectCount: test.count})
}
fakeClock.SetTime(now)
stats, err := tracker.Get(key)