From 9adb3ee3c0a64f1c75ac2b89a21877cfb47f3e01 Mon Sep 17 00:00:00 2001 From: Jordan Liggitt Date: Sat, 2 Mar 2024 01:44:28 -0500 Subject: [PATCH] Add authorization webhook duration/count/failopen metrics Kubernetes-commit: 79b344d85e3e2f8f3192a3dcabb384cfe87136a6 --- .../authorizerfactory/metrics.go | 2 + .../pkg/authorizer/webhook/metrics/metrics.go | 117 ++++++++++++++++++ .../webhook/metrics/metrics_test.go | 86 +++++++++++++ plugin/pkg/authorizer/webhook/metrics_test.go | 79 +++++++++++- plugin/pkg/authorizer/webhook/webhook.go | 23 ++++ .../pkg/authorizer/webhook/webhook_v1_test.go | 8 ++ 6 files changed, 314 insertions(+), 1 deletion(-) create mode 100644 plugin/pkg/authorizer/webhook/metrics/metrics_test.go diff --git a/pkg/authorization/authorizerfactory/metrics.go b/pkg/authorization/authorizerfactory/metrics.go index df30479b7..3f72a25b7 100644 --- a/pkg/authorization/authorizerfactory/metrics.go +++ b/pkg/authorization/authorizerfactory/metrics.go @@ -60,6 +60,8 @@ var ( var _ = webhookmetrics.AuthorizerMetrics(delegatingAuthorizerMetrics{}) type delegatingAuthorizerMetrics struct { + // no-op for webhook metrics for now, delegating authorization reports original total/latency metrics + webhookmetrics.NoopWebhookMetrics // no-op for matchCondition metrics for now, delegating authorization doesn't configure match conditions celmetrics.NoopMatcherMetrics } diff --git a/plugin/pkg/authorizer/webhook/metrics/metrics.go b/plugin/pkg/authorizer/webhook/metrics/metrics.go index 312f6ed89..23f82cc65 100644 --- a/plugin/pkg/authorizer/webhook/metrics/metrics.go +++ b/plugin/pkg/authorizer/webhook/metrics/metrics.go @@ -18,20 +18,26 @@ package metrics import ( "context" + "sync" "k8s.io/apiserver/pkg/authorization/cel" + compbasemetrics "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" ) // AuthorizerMetrics specifies a set of methods that are used to register various metrics for the webhook authorizer type AuthorizerMetrics interface { // Request total and latency metrics RequestMetrics + // Webhook count, latency, and fail open metrics + WebhookMetrics // match condition metrics cel.MatcherMetrics } type NoopAuthorizerMetrics struct { NoopRequestMetrics + NoopWebhookMetrics cel.NoopMatcherMetrics } @@ -47,3 +53,114 @@ type NoopRequestMetrics struct{} func (NoopRequestMetrics) RecordRequestTotal(context.Context, string) {} func (NoopRequestMetrics) RecordRequestLatency(context.Context, string, float64) {} + +type WebhookMetrics interface { + // RecordWebhookEvaluation increments with each round-trip of a webhook authorizer. + // result is one of: + // - canceled: the call invoking the webhook request was canceled + // - timeout: the webhook request timed out + // - error: the webhook response completed and was invalid + // - success: the webhook response completed and was well-formed + RecordWebhookEvaluation(ctx context.Context, name, result string) + // RecordWebhookDuration records latency for each round-trip of a webhook authorizer. + // result is one of: + // - canceled: the call invoking the webhook request was canceled + // - timeout: the webhook request timed out + // - error: the webhook response completed and was invalid + // - success: the webhook response completed and was well-formed + RecordWebhookDuration(ctx context.Context, name, result string, duration float64) + // RecordWebhookFailOpen increments when a webhook timeout or error results in a fail open + // of a request which has not been canceled. + // result is one of: + // - timeout: the webhook request timed out + // - error: the webhook response completed and was invalid + RecordWebhookFailOpen(ctx context.Context, name, result string) +} + +type NoopWebhookMetrics struct{} + +func (NoopWebhookMetrics) RecordWebhookEvaluation(ctx context.Context, name, result string) {} +func (NoopWebhookMetrics) RecordWebhookDuration(ctx context.Context, name, result string, duration float64) { +} +func (NoopWebhookMetrics) RecordWebhookFailOpen(ctx context.Context, name, result string) {} + +var registerWebhookMetrics sync.Once + +// RegisterMetrics registers authorizer metrics. +func RegisterWebhookMetrics() { + registerWebhookMetrics.Do(func() { + legacyregistry.MustRegister(webhookEvaluations) + legacyregistry.MustRegister(webhookDuration) + legacyregistry.MustRegister(webhookFailOpen) + }) +} + +func ResetMetricsForTest() { + webhookEvaluations.Reset() + webhookDuration.Reset() + webhookFailOpen.Reset() +} + +const ( + namespace = "apiserver" + subsystem = "authorization" +) + +var ( + webhookEvaluations = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "webhook_evaluations_total", + Help: "Round-trips to authorization webhooks.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name", "result"}, + ) + + webhookDuration = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "webhook_duration_seconds", + Help: "Request latency in seconds.", + Buckets: compbasemetrics.DefBuckets, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name", "result"}, + ) + + webhookFailOpen = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "webhook_evaluations_fail_open_total", + Help: "NoOpinion results due to webhook timeout or error.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name", "result"}, + ) +) + +type webhookMetrics struct{} + +func NewWebhookMetrics() WebhookMetrics { + RegisterWebhookMetrics() + return webhookMetrics{} +} + +func ResetWebhookMetricsForTest() { + webhookEvaluations.Reset() + webhookDuration.Reset() + webhookFailOpen.Reset() +} + +func (webhookMetrics) RecordWebhookEvaluation(ctx context.Context, name, result string) { + webhookEvaluations.WithContext(ctx).WithLabelValues(name, result).Inc() +} +func (webhookMetrics) RecordWebhookDuration(ctx context.Context, name, result string, duration float64) { + webhookDuration.WithContext(ctx).WithLabelValues(name, result).Observe(duration) +} +func (webhookMetrics) RecordWebhookFailOpen(ctx context.Context, name, result string) { + webhookFailOpen.WithContext(ctx).WithLabelValues(name, result).Inc() +} diff --git a/plugin/pkg/authorizer/webhook/metrics/metrics_test.go b/plugin/pkg/authorizer/webhook/metrics/metrics_test.go new file mode 100644 index 000000000..1af0435aa --- /dev/null +++ b/plugin/pkg/authorizer/webhook/metrics/metrics_test.go @@ -0,0 +1,86 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "strings" + "testing" + + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" +) + +func TestRecordWebhookMetrics(t *testing.T) { + testCases := []struct { + desc string + metrics []string + name string + result string + duration float64 + want string + }{ + { + desc: "evaluation failure total", + metrics: []string{ + "apiserver_authorization_webhook_duration_seconds", + "apiserver_authorization_webhook_evaluations_total", + "apiserver_authorization_webhook_evaluations_fail_open_total", + }, + name: "wh1.example.com", + result: "timeout", + duration: 1.5, + want: ` + # HELP apiserver_authorization_webhook_duration_seconds [ALPHA] Request latency in seconds. + # TYPE apiserver_authorization_webhook_duration_seconds histogram + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.005"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.01"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.025"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.05"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.1"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.25"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.5"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="1"} 0 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="2.5"} 1 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="5"} 1 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="10"} 1 + apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="+Inf"} 1 + apiserver_authorization_webhook_duration_seconds_sum{name="wh1.example.com",result="timeout"} 1.5 + apiserver_authorization_webhook_duration_seconds_count{name="wh1.example.com",result="timeout"} 1 + # HELP apiserver_authorization_webhook_evaluations_fail_open_total [ALPHA] NoOpinion results due to webhook timeout or error. + # TYPE apiserver_authorization_webhook_evaluations_fail_open_total counter + apiserver_authorization_webhook_evaluations_fail_open_total{name="wh1.example.com",result="timeout"} 1 + # HELP apiserver_authorization_webhook_evaluations_total [ALPHA] Round-trips to authorization webhooks. + # TYPE apiserver_authorization_webhook_evaluations_total counter + apiserver_authorization_webhook_evaluations_total{name="wh1.example.com",result="timeout"} 1 + `, + }, + } + + for _, tt := range testCases { + t.Run(tt.desc, func(t *testing.T) { + ResetWebhookMetricsForTest() + m := NewWebhookMetrics() + m.RecordWebhookDuration(context.Background(), tt.name, tt.result, tt.duration) + m.RecordWebhookEvaluation(context.Background(), tt.name, tt.result) + m.RecordWebhookFailOpen(context.Background(), tt.name, tt.result) + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(tt.want), tt.metrics...); err != nil { + t.Fatal(err) + } + }) + } +} diff --git a/plugin/pkg/authorizer/webhook/metrics_test.go b/plugin/pkg/authorizer/webhook/metrics_test.go index d8385bb0a..422358f14 100644 --- a/plugin/pkg/authorizer/webhook/metrics_test.go +++ b/plugin/pkg/authorizer/webhook/metrics_test.go @@ -18,8 +18,10 @@ package webhook import ( "context" + "net/http" "testing" + authorizationv1 "k8s.io/api/authorization/v1" "k8s.io/apiserver/pkg/apis/apiserver" "k8s.io/apiserver/pkg/authentication/user" "k8s.io/apiserver/pkg/authorization/authorizer" @@ -29,11 +31,15 @@ import ( func TestAuthorizerMetrics(t *testing.T) { scenarios := []struct { name string + canceledRequest bool clientCert, clientKey, clientCA []byte serverCert, serverKey, serverCA []byte authzFakeServiceStatusCode int authFakeServiceDeny bool expectedRegisteredStatusCode string + expectEvalutionResult string + expectDurationResult string + expectFailOpenResult string wantErr bool }{ { @@ -41,6 +47,31 @@ func TestAuthorizerMetrics(t *testing.T) { clientCert: clientCert, clientKey: clientKey, clientCA: caCert, serverCert: serverCert, serverKey: serverKey, serverCA: caCert, expectedRegisteredStatusCode: "200", + expectEvalutionResult: "success", + expectDurationResult: "success", + expectFailOpenResult: "", + }, + + { + name: "timed out request", + clientCert: clientCert, clientKey: clientKey, clientCA: caCert, + serverCert: serverCert, serverKey: serverKey, serverCA: caCert, + authzFakeServiceStatusCode: http.StatusGatewayTimeout, + expectedRegisteredStatusCode: "504", + expectEvalutionResult: "timeout", + expectDurationResult: "timeout", + expectFailOpenResult: "timeout", + }, + + { + name: "canceled request", + clientCert: clientCert, clientKey: clientKey, clientCA: caCert, + serverCert: serverCert, serverKey: serverKey, serverCA: caCert, + canceledRequest: true, + expectedRegisteredStatusCode: "", + expectEvalutionResult: "canceled", + expectDurationResult: "canceled", + expectFailOpenResult: "", }, { @@ -49,6 +80,9 @@ func TestAuthorizerMetrics(t *testing.T) { serverCert: serverCert, serverKey: serverKey, serverCA: caCert, authzFakeServiceStatusCode: 500, expectedRegisteredStatusCode: "500", + expectEvalutionResult: "error", + expectDurationResult: "error", + expectFailOpenResult: "error", }, { @@ -56,17 +90,28 @@ func TestAuthorizerMetrics(t *testing.T) { clientCert: clientCert, clientKey: clientKey, clientCA: caCert, serverCert: serverCert, serverKey: serverKey, serverCA: badCACert, expectedRegisteredStatusCode: "", + expectEvalutionResult: "error", + expectDurationResult: "error", + expectFailOpenResult: "error", wantErr: true, }, } for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + service := new(mockV1Service) service.statusCode = scenario.authzFakeServiceStatusCode if service.statusCode == 0 { service.statusCode = 200 } + service.reviewHook = func(*authorizationv1.SubjectAccessReview) { + if scenario.canceledRequest { + cancel() + } + } service.allow = !scenario.authFakeServiceDeny server, err := NewV1TestServer(service, scenario.serverCert, scenario.serverKey, scenario.serverCA) @@ -84,7 +129,7 @@ func TestAuthorizerMetrics(t *testing.T) { } attr := authorizer.AttributesRecord{User: &user.DefaultInfo{}} - _, _, err = wh.Authorize(context.Background(), attr) + _, _, err = wh.Authorize(ctx, attr) if scenario.wantErr { if err == nil { t.Errorf("expected error making authorization request: %v", err) @@ -98,6 +143,16 @@ func TestAuthorizerMetrics(t *testing.T) { if fakeAuthzMetrics.latencyCode != scenario.expectedRegisteredStatusCode { t.Errorf("incorrect status code recorded for RecordRequestLatency method, expected = %v, got %v", scenario.expectedRegisteredStatusCode, fakeAuthzMetrics.latencyCode) } + + if fakeAuthzMetrics.evaluationsResult != scenario.expectEvalutionResult { + t.Errorf("expected evaluationsResult %q, got %q", scenario.expectEvalutionResult, fakeAuthzMetrics.evaluationsResult) + } + if fakeAuthzMetrics.durationResult != scenario.expectDurationResult { + t.Errorf("expected durationResult %q, got %q", scenario.expectDurationResult, fakeAuthzMetrics.durationResult) + } + if fakeAuthzMetrics.failOpenResult != scenario.expectFailOpenResult { + t.Errorf("expected failOpenResult %q, got %q", scenario.expectFailOpenResult, fakeAuthzMetrics.failOpenResult) + } }) } } @@ -108,6 +163,15 @@ type fakeAuthorizerMetrics struct { latency float64 latencyCode string + evaluations int + evaluationsResult string + + duration float64 + durationResult string + + failOpen int + failOpenResult string + cel.NoopMatcherMetrics } @@ -119,3 +183,16 @@ func (f *fakeAuthorizerMetrics) RecordRequestLatency(_ context.Context, code str f.latency = latency f.latencyCode = code } + +func (f *fakeAuthorizerMetrics) RecordWebhookEvaluation(ctx context.Context, name, result string) { + f.evaluations += 1 + f.evaluationsResult = result +} +func (f *fakeAuthorizerMetrics) RecordWebhookDuration(ctx context.Context, name, result string, duration float64) { + f.duration = duration + f.durationResult = result +} +func (f *fakeAuthorizerMetrics) RecordWebhookFailOpen(ctx context.Context, name, result string) { + f.failOpen += 1 + f.failOpenResult = result +} diff --git a/plugin/pkg/authorizer/webhook/webhook.go b/plugin/pkg/authorizer/webhook/webhook.go index 65c320f6d..d97b12145 100644 --- a/plugin/pkg/authorizer/webhook/webhook.go +++ b/plugin/pkg/authorizer/webhook/webhook.go @@ -20,12 +20,15 @@ package webhook import ( "context" "encoding/json" + "errors" "fmt" + "net/http" "strconv" "time" authorizationv1 "k8s.io/api/authorization/v1" authorizationv1beta1 "k8s.io/api/authorization/v1beta1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -233,6 +236,7 @@ func (w *WebhookAuthorizer) Authorize(ctx context.Context, attr authorizer.Attri r.Status = entry.(authorizationv1.SubjectAccessReviewStatus) } else { var result *authorizationv1.SubjectAccessReview + var metricsResult string // WithExponentialBackoff will return SAR create error (sarErr) if any. if err := webhook.WithExponentialBackoff(ctx, w.retryBackoff, func() error { var sarErr error @@ -242,6 +246,19 @@ func (w *WebhookAuthorizer) Authorize(ctx context.Context, attr authorizer.Attri result, statusCode, sarErr = w.subjectAccessReview.Create(ctx, r, metav1.CreateOptions{}) latency := time.Since(start) + switch { + case sarErr == nil: + metricsResult = "success" + case ctx.Err() != nil: + metricsResult = "canceled" + case errors.Is(sarErr, context.DeadlineExceeded) || apierrors.IsTimeout(sarErr) || statusCode == http.StatusGatewayTimeout: + metricsResult = "timeout" + default: + metricsResult = "error" + } + w.metrics.RecordWebhookEvaluation(ctx, w.name, metricsResult) + w.metrics.RecordWebhookDuration(ctx, w.name, metricsResult, latency.Seconds()) + if statusCode != 0 { w.metrics.RecordRequestTotal(ctx, strconv.Itoa(statusCode)) w.metrics.RecordRequestLatency(ctx, strconv.Itoa(statusCode), latency.Seconds()) @@ -256,6 +273,12 @@ func (w *WebhookAuthorizer) Authorize(ctx context.Context, attr authorizer.Attri return sarErr }, webhook.DefaultShouldRetry); err != nil { klog.Errorf("Failed to make webhook authorizer request: %v", err) + + // we're returning NoOpinion, and the parent context has not timed out or been canceled + if w.decisionOnError == authorizer.DecisionNoOpinion && ctx.Err() == nil { + w.metrics.RecordWebhookFailOpen(ctx, w.name, metricsResult) + } + return w.decisionOnError, "", err } diff --git a/plugin/pkg/authorizer/webhook/webhook_v1_test.go b/plugin/pkg/authorizer/webhook/webhook_v1_test.go index a130a2f15..2f5125e37 100644 --- a/plugin/pkg/authorizer/webhook/webhook_v1_test.go +++ b/plugin/pkg/authorizer/webhook/webhook_v1_test.go @@ -315,11 +315,18 @@ type mockV1Service struct { allow bool statusCode int called int + + // reviewHook is called just before returning from the Review() method + reviewHook func(*authorizationv1.SubjectAccessReview) } func (m *mockV1Service) Review(r *authorizationv1.SubjectAccessReview) { m.called++ r.Status.Allowed = m.allow + + if m.reviewHook != nil { + m.reviewHook(r) + } } func (m *mockV1Service) Allow() { m.allow = true } func (m *mockV1Service) Deny() { m.allow = false } @@ -1414,5 +1421,6 @@ func celAuthorizerMetrics() metrics.AuthorizerMetrics { type celAuthorizerMetricsType struct { metrics.NoopRequestMetrics + metrics.NoopWebhookMetrics celmetrics.MatcherMetrics }