grpc-go/xds/internal/xdsclient/metrics_test.go

331 lines
12 KiB
Go

/*
*
* Copyright 2025 gRPC authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package xdsclient
import (
"context"
"encoding/json"
"fmt"
"testing"
"github.com/google/uuid"
"google.golang.org/grpc/internal/testutils"
"google.golang.org/grpc/internal/testutils/stats"
"google.golang.org/grpc/internal/testutils/xds/e2e"
"google.golang.org/grpc/internal/xds/bootstrap"
"google.golang.org/grpc/xds/internal/xdsclient/xdsresource"
v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
)
type noopListenerWatcher struct{}
func (noopListenerWatcher) ResourceChanged(_ *xdsresource.ListenerResourceData, onDone func()) {
onDone()
}
func (noopListenerWatcher) ResourceError(_ error, onDone func()) {
onDone()
}
func (noopListenerWatcher) AmbientError(_ error, onDone func()) {
onDone()
}
// TestResourceUpdateMetrics configures an xDS client, and a management server
// to send valid and invalid LDS updates, and verifies that the expected metrics
// for both good and bad updates are emitted.
func (s) TestResourceUpdateMetrics(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel()
tmr := stats.NewTestMetricsRecorder()
l, err := testutils.LocalTCPListener()
if err != nil {
t.Fatalf("net.Listen() failed: %v", err)
}
mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: l})
const listenerResourceName = "test-listener-resource"
const routeConfigurationName = "test-route-configuration-resource"
nodeID := uuid.New().String()
resources := e2e.UpdateOptions{
NodeID: nodeID,
Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
SkipValidation: true,
}
if err := mgmtServer.Update(ctx, resources); err != nil {
t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
}
bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
Servers: []byte(fmt.Sprintf(`[{
"server_uri": %q,
"channel_creds": [{"type": "insecure"}]
}]`, mgmtServer.Address)),
Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
Authorities: map[string]json.RawMessage{
"authority": []byte("{}"),
},
})
if err != nil {
t.Fatalf("Failed to create bootstrap configuration: %v", err)
}
config, err := bootstrap.NewConfigFromContents(bootstrapContents)
if err != nil {
t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
}
pool := NewPool(config)
client, close, err := pool.NewClientForTesting(OptionsForTesting{
Name: t.Name(),
WatchExpiryTimeout: defaultTestWatchExpiryTimeout,
MetricsRecorder: tmr,
})
if err != nil {
t.Fatalf("Failed to create an xDS client: %v", err)
}
defer close()
// Watch the valid listener configured on the management server. This should
// cause a resource updates valid count to emit eventually.
xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{})
mdWant := stats.MetricsData{
Handle: xdsClientResourceUpdatesValidMetric.Descriptor(),
IntIncr: 1,
LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"},
LabelVals: []string{"Test/ResourceUpdateMetrics", mgmtServer.Address, "ListenerResource"},
}
if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
t.Fatal(err.Error())
}
// Invalid should have no recording point.
if got, _ := tmr.Metric("grpc.xds_client.resource_updates_invalid"); got != 0 {
t.Fatalf("Unexpected data for metric \"grpc.xds_client.resource_updates_invalid\", got: %v, want: %v", got, 0)
}
// Update management server with a bad update. Eventually, tmr should
// receive an invalid count received metric. The successful metric should
// stay the same.
resources = e2e.UpdateOptions{
NodeID: nodeID,
Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
SkipValidation: true,
}
resources.Listeners[0].ApiListener = nil
if err := mgmtServer.Update(ctx, resources); err != nil {
t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
}
mdWant = stats.MetricsData{
Handle: xdsClientResourceUpdatesInvalidMetric.Descriptor(),
IntIncr: 1,
LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"},
LabelVals: []string{"Test/ResourceUpdateMetrics", mgmtServer.Address, "ListenerResource"},
}
if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
t.Fatal(err.Error())
}
// Valid should stay the same at 1.
if got, _ := tmr.Metric("grpc.xds_client.resource_updates_valid"); got != 1 {
t.Fatalf("Unexpected data for metric \"grpc.xds_client.resource_updates_invalid\", got: %v, want: %v", got, 1)
}
}
// TestServerFailureMetrics_BeforeResponseRecv configures an xDS client, and a
// management server. It then register a watcher and stops the management
// server before sending a resource update, and verifies that the expected
// metrics for server failure are emitted.
func (s) TestServerFailureMetrics_BeforeResponseRecv(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel()
tmr := stats.NewTestMetricsRecorder()
l, err := testutils.LocalTCPListener()
if err != nil {
t.Fatalf("net.Listen() failed: %v", err)
}
lis := testutils.NewRestartableListener(l)
streamOpened := make(chan struct{}, 1)
mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
Listener: lis,
OnStreamOpen: func(context.Context, int64, string) error {
select {
case streamOpened <- struct{}{}:
default:
}
return nil
},
})
nodeID := uuid.New().String()
bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
Servers: []byte(fmt.Sprintf(`[{
"server_uri": %q,
"channel_creds": [{"type": "insecure"}]
}]`, mgmtServer.Address)),
Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
Authorities: map[string]json.RawMessage{
"authority": []byte("{}"),
},
})
if err != nil {
t.Fatalf("Failed to create bootstrap configuration: %v", err)
}
config, err := bootstrap.NewConfigFromContents(bootstrapContents)
if err != nil {
t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
}
pool := NewPool(config)
client, close, err := pool.NewClientForTesting(OptionsForTesting{
Name: t.Name(),
WatchExpiryTimeout: defaultTestWatchExpiryTimeout,
MetricsRecorder: tmr,
})
if err != nil {
t.Fatalf("Failed to create an xDS client: %v", err)
}
defer close()
const listenerResourceName = "test-listener-resource"
// Watch for the listener on the above management server.
xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{})
// Verify that an ADS stream is opened and an LDS request with the above
// resource name is sent.
select {
case <-streamOpened:
case <-ctx.Done():
t.Fatal("Timeout when waiting for ADS stream to open")
}
// Close the listener and ensure that the ADS stream breaks. This should
// cause a server failure count to emit eventually.
lis.Stop()
// Restart to prevent the attempt to create a new ADS stream after back off.
lis.Restart()
mdWant := stats.MetricsData{
Handle: xdsClientServerFailureMetric.Descriptor(),
IntIncr: 1,
LabelKeys: []string{"grpc.target", "grpc.xds.server"},
LabelVals: []string{"Test/ServerFailureMetrics_BeforeResponseRecv", mgmtServer.Address},
}
if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
t.Fatal(err.Error())
}
}
// TestServerFailureMetrics_AfterResponseRecv configures an xDS client, and a
// management server to send a valid LDS updates, and verifies that the
// server failure metric is not emitted. It then closes the management server
// listener to close the ADS stream and verifies that the server failure metric
// is still not emitted because the the ADS stream was closed after having
// received a response on the stream.
func (s) TestServerFailureMetrics_AfterResponseRecv(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel()
tmr := stats.NewTestMetricsRecorder()
l, err := testutils.LocalTCPListener()
if err != nil {
t.Fatalf("net.Listen() failed: %v", err)
}
lis := testutils.NewRestartableListener(l)
mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: lis})
const listenerResourceName = "test-listener-resource"
const routeConfigurationName = "test-route-configuration-resource"
nodeID := uuid.New().String()
resources := e2e.UpdateOptions{
NodeID: nodeID,
Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
SkipValidation: true,
}
if err := mgmtServer.Update(ctx, resources); err != nil {
t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
}
bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
Servers: []byte(fmt.Sprintf(`[{
"server_uri": %q,
"channel_creds": [{"type": "insecure"}]
}]`, mgmtServer.Address)),
Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
Authorities: map[string]json.RawMessage{
"authority": []byte("{}"),
},
})
if err != nil {
t.Fatalf("Failed to create bootstrap configuration: %v", err)
}
config, err := bootstrap.NewConfigFromContents(bootstrapContents)
if err != nil {
t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
}
pool := NewPool(config)
client, close, err := pool.NewClientForTesting(OptionsForTesting{
Name: t.Name(),
MetricsRecorder: tmr,
})
if err != nil {
t.Fatalf("Failed to create an xDS client: %v", err)
}
defer close()
// Watch the valid listener configured on the management server. This should
// cause a resource updates valid count to emit eventually.
xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{})
mdWant := stats.MetricsData{
Handle: xdsClientResourceUpdatesValidMetric.Descriptor(),
IntIncr: 1,
LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"},
LabelVals: []string{"Test/ServerFailureMetrics_AfterResponseRecv", mgmtServer.Address, "ListenerResource"},
}
if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
t.Fatal(err.Error())
}
// Server failure should have no recording point.
if got, _ := tmr.Metric("grpc.xds_client.server_failure"); got != 0 {
t.Fatalf("Unexpected data for metric \"grpc.xds_client.server_failure\", got: %v, want: %v", got, 0)
}
// Close the listener and ensure that the ADS stream breaks. This should
// cause a server failure count to emit eventually.
lis.Stop()
if ctx.Err() != nil {
t.Fatalf("Timeout when waiting for ADS stream to close")
}
// Restart to prevent the attempt to create a new ADS stream after back off.
lis.Restart()
mdWant = stats.MetricsData{
Handle: xdsClientServerFailureMetric.Descriptor(),
IntIncr: 1,
LabelKeys: []string{"grpc.target", "grpc.xds.server"},
LabelVals: []string{"Test/ServerFailureMetrics_AfterResponseRecv", mgmtServer.Address},
}
// Server failure should still have no recording point.
if err := tmr.WaitForInt64Count(ctx, mdWant); err == nil {
t.Fatal("tmr.WaitForInt64Count(ctx, mdWant) succeeded when expected to timeout.")
}
}