Add authorization webhook duration/count/failopen metrics

Kubernetes-commit: 79b344d85e3e2f8f3192a3dcabb384cfe87136a6
This commit is contained in:
Jordan Liggitt 2024-03-02 01:44:28 -05:00 committed by Kubernetes Publisher
parent 250f19d55f
commit 9adb3ee3c0
6 changed files with 314 additions and 1 deletions

View File

@ -60,6 +60,8 @@ var (
var _ = webhookmetrics.AuthorizerMetrics(delegatingAuthorizerMetrics{})
type delegatingAuthorizerMetrics struct {
// no-op for webhook metrics for now, delegating authorization reports original total/latency metrics
webhookmetrics.NoopWebhookMetrics
// no-op for matchCondition metrics for now, delegating authorization doesn't configure match conditions
celmetrics.NoopMatcherMetrics
}

View File

@ -18,20 +18,26 @@ package metrics
import (
"context"
"sync"
"k8s.io/apiserver/pkg/authorization/cel"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
// AuthorizerMetrics specifies a set of methods that are used to register various metrics for the webhook authorizer
type AuthorizerMetrics interface {
// Request total and latency metrics
RequestMetrics
// Webhook count, latency, and fail open metrics
WebhookMetrics
// match condition metrics
cel.MatcherMetrics
}
type NoopAuthorizerMetrics struct {
NoopRequestMetrics
NoopWebhookMetrics
cel.NoopMatcherMetrics
}
@ -47,3 +53,114 @@ type NoopRequestMetrics struct{}
func (NoopRequestMetrics) RecordRequestTotal(context.Context, string) {}
func (NoopRequestMetrics) RecordRequestLatency(context.Context, string, float64) {}
type WebhookMetrics interface {
// RecordWebhookEvaluation increments with each round-trip of a webhook authorizer.
// result is one of:
// - canceled: the call invoking the webhook request was canceled
// - timeout: the webhook request timed out
// - error: the webhook response completed and was invalid
// - success: the webhook response completed and was well-formed
RecordWebhookEvaluation(ctx context.Context, name, result string)
// RecordWebhookDuration records latency for each round-trip of a webhook authorizer.
// result is one of:
// - canceled: the call invoking the webhook request was canceled
// - timeout: the webhook request timed out
// - error: the webhook response completed and was invalid
// - success: the webhook response completed and was well-formed
RecordWebhookDuration(ctx context.Context, name, result string, duration float64)
// RecordWebhookFailOpen increments when a webhook timeout or error results in a fail open
// of a request which has not been canceled.
// result is one of:
// - timeout: the webhook request timed out
// - error: the webhook response completed and was invalid
RecordWebhookFailOpen(ctx context.Context, name, result string)
}
type NoopWebhookMetrics struct{}
func (NoopWebhookMetrics) RecordWebhookEvaluation(ctx context.Context, name, result string) {}
func (NoopWebhookMetrics) RecordWebhookDuration(ctx context.Context, name, result string, duration float64) {
}
func (NoopWebhookMetrics) RecordWebhookFailOpen(ctx context.Context, name, result string) {}
var registerWebhookMetrics sync.Once
// RegisterMetrics registers authorizer metrics.
func RegisterWebhookMetrics() {
registerWebhookMetrics.Do(func() {
legacyregistry.MustRegister(webhookEvaluations)
legacyregistry.MustRegister(webhookDuration)
legacyregistry.MustRegister(webhookFailOpen)
})
}
func ResetMetricsForTest() {
webhookEvaluations.Reset()
webhookDuration.Reset()
webhookFailOpen.Reset()
}
const (
namespace = "apiserver"
subsystem = "authorization"
)
var (
webhookEvaluations = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "webhook_evaluations_total",
Help: "Round-trips to authorization webhooks.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"name", "result"},
)
webhookDuration = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "webhook_duration_seconds",
Help: "Request latency in seconds.",
Buckets: compbasemetrics.DefBuckets,
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"name", "result"},
)
webhookFailOpen = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "webhook_evaluations_fail_open_total",
Help: "NoOpinion results due to webhook timeout or error.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"name", "result"},
)
)
type webhookMetrics struct{}
func NewWebhookMetrics() WebhookMetrics {
RegisterWebhookMetrics()
return webhookMetrics{}
}
func ResetWebhookMetricsForTest() {
webhookEvaluations.Reset()
webhookDuration.Reset()
webhookFailOpen.Reset()
}
func (webhookMetrics) RecordWebhookEvaluation(ctx context.Context, name, result string) {
webhookEvaluations.WithContext(ctx).WithLabelValues(name, result).Inc()
}
func (webhookMetrics) RecordWebhookDuration(ctx context.Context, name, result string, duration float64) {
webhookDuration.WithContext(ctx).WithLabelValues(name, result).Observe(duration)
}
func (webhookMetrics) RecordWebhookFailOpen(ctx context.Context, name, result string) {
webhookFailOpen.WithContext(ctx).WithLabelValues(name, result).Inc()
}

View File

@ -0,0 +1,86 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"context"
"strings"
"testing"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/component-base/metrics/testutil"
)
func TestRecordWebhookMetrics(t *testing.T) {
testCases := []struct {
desc string
metrics []string
name string
result string
duration float64
want string
}{
{
desc: "evaluation failure total",
metrics: []string{
"apiserver_authorization_webhook_duration_seconds",
"apiserver_authorization_webhook_evaluations_total",
"apiserver_authorization_webhook_evaluations_fail_open_total",
},
name: "wh1.example.com",
result: "timeout",
duration: 1.5,
want: `
# HELP apiserver_authorization_webhook_duration_seconds [ALPHA] Request latency in seconds.
# TYPE apiserver_authorization_webhook_duration_seconds histogram
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.005"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.01"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.025"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.05"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.1"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.25"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="0.5"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="1"} 0
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="2.5"} 1
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="5"} 1
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="10"} 1
apiserver_authorization_webhook_duration_seconds_bucket{name="wh1.example.com",result="timeout",le="+Inf"} 1
apiserver_authorization_webhook_duration_seconds_sum{name="wh1.example.com",result="timeout"} 1.5
apiserver_authorization_webhook_duration_seconds_count{name="wh1.example.com",result="timeout"} 1
# HELP apiserver_authorization_webhook_evaluations_fail_open_total [ALPHA] NoOpinion results due to webhook timeout or error.
# TYPE apiserver_authorization_webhook_evaluations_fail_open_total counter
apiserver_authorization_webhook_evaluations_fail_open_total{name="wh1.example.com",result="timeout"} 1
# HELP apiserver_authorization_webhook_evaluations_total [ALPHA] Round-trips to authorization webhooks.
# TYPE apiserver_authorization_webhook_evaluations_total counter
apiserver_authorization_webhook_evaluations_total{name="wh1.example.com",result="timeout"} 1
`,
},
}
for _, tt := range testCases {
t.Run(tt.desc, func(t *testing.T) {
ResetWebhookMetricsForTest()
m := NewWebhookMetrics()
m.RecordWebhookDuration(context.Background(), tt.name, tt.result, tt.duration)
m.RecordWebhookEvaluation(context.Background(), tt.name, tt.result)
m.RecordWebhookFailOpen(context.Background(), tt.name, tt.result)
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(tt.want), tt.metrics...); err != nil {
t.Fatal(err)
}
})
}
}

View File

@ -18,8 +18,10 @@ package webhook
import (
"context"
"net/http"
"testing"
authorizationv1 "k8s.io/api/authorization/v1"
"k8s.io/apiserver/pkg/apis/apiserver"
"k8s.io/apiserver/pkg/authentication/user"
"k8s.io/apiserver/pkg/authorization/authorizer"
@ -29,11 +31,15 @@ import (
func TestAuthorizerMetrics(t *testing.T) {
scenarios := []struct {
name string
canceledRequest bool
clientCert, clientKey, clientCA []byte
serverCert, serverKey, serverCA []byte
authzFakeServiceStatusCode int
authFakeServiceDeny bool
expectedRegisteredStatusCode string
expectEvalutionResult string
expectDurationResult string
expectFailOpenResult string
wantErr bool
}{
{
@ -41,6 +47,31 @@ func TestAuthorizerMetrics(t *testing.T) {
clientCert: clientCert, clientKey: clientKey, clientCA: caCert,
serverCert: serverCert, serverKey: serverKey, serverCA: caCert,
expectedRegisteredStatusCode: "200",
expectEvalutionResult: "success",
expectDurationResult: "success",
expectFailOpenResult: "",
},
{
name: "timed out request",
clientCert: clientCert, clientKey: clientKey, clientCA: caCert,
serverCert: serverCert, serverKey: serverKey, serverCA: caCert,
authzFakeServiceStatusCode: http.StatusGatewayTimeout,
expectedRegisteredStatusCode: "504",
expectEvalutionResult: "timeout",
expectDurationResult: "timeout",
expectFailOpenResult: "timeout",
},
{
name: "canceled request",
clientCert: clientCert, clientKey: clientKey, clientCA: caCert,
serverCert: serverCert, serverKey: serverKey, serverCA: caCert,
canceledRequest: true,
expectedRegisteredStatusCode: "<error>",
expectEvalutionResult: "canceled",
expectDurationResult: "canceled",
expectFailOpenResult: "",
},
{
@ -49,6 +80,9 @@ func TestAuthorizerMetrics(t *testing.T) {
serverCert: serverCert, serverKey: serverKey, serverCA: caCert,
authzFakeServiceStatusCode: 500,
expectedRegisteredStatusCode: "500",
expectEvalutionResult: "error",
expectDurationResult: "error",
expectFailOpenResult: "error",
},
{
@ -56,17 +90,28 @@ func TestAuthorizerMetrics(t *testing.T) {
clientCert: clientCert, clientKey: clientKey, clientCA: caCert,
serverCert: serverCert, serverKey: serverKey, serverCA: badCACert,
expectedRegisteredStatusCode: "<error>",
expectEvalutionResult: "error",
expectDurationResult: "error",
expectFailOpenResult: "error",
wantErr: true,
},
}
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := new(mockV1Service)
service.statusCode = scenario.authzFakeServiceStatusCode
if service.statusCode == 0 {
service.statusCode = 200
}
service.reviewHook = func(*authorizationv1.SubjectAccessReview) {
if scenario.canceledRequest {
cancel()
}
}
service.allow = !scenario.authFakeServiceDeny
server, err := NewV1TestServer(service, scenario.serverCert, scenario.serverKey, scenario.serverCA)
@ -84,7 +129,7 @@ func TestAuthorizerMetrics(t *testing.T) {
}
attr := authorizer.AttributesRecord{User: &user.DefaultInfo{}}
_, _, err = wh.Authorize(context.Background(), attr)
_, _, err = wh.Authorize(ctx, attr)
if scenario.wantErr {
if err == nil {
t.Errorf("expected error making authorization request: %v", err)
@ -98,6 +143,16 @@ func TestAuthorizerMetrics(t *testing.T) {
if fakeAuthzMetrics.latencyCode != scenario.expectedRegisteredStatusCode {
t.Errorf("incorrect status code recorded for RecordRequestLatency method, expected = %v, got %v", scenario.expectedRegisteredStatusCode, fakeAuthzMetrics.latencyCode)
}
if fakeAuthzMetrics.evaluationsResult != scenario.expectEvalutionResult {
t.Errorf("expected evaluationsResult %q, got %q", scenario.expectEvalutionResult, fakeAuthzMetrics.evaluationsResult)
}
if fakeAuthzMetrics.durationResult != scenario.expectDurationResult {
t.Errorf("expected durationResult %q, got %q", scenario.expectDurationResult, fakeAuthzMetrics.durationResult)
}
if fakeAuthzMetrics.failOpenResult != scenario.expectFailOpenResult {
t.Errorf("expected failOpenResult %q, got %q", scenario.expectFailOpenResult, fakeAuthzMetrics.failOpenResult)
}
})
}
}
@ -108,6 +163,15 @@ type fakeAuthorizerMetrics struct {
latency float64
latencyCode string
evaluations int
evaluationsResult string
duration float64
durationResult string
failOpen int
failOpenResult string
cel.NoopMatcherMetrics
}
@ -119,3 +183,16 @@ func (f *fakeAuthorizerMetrics) RecordRequestLatency(_ context.Context, code str
f.latency = latency
f.latencyCode = code
}
func (f *fakeAuthorizerMetrics) RecordWebhookEvaluation(ctx context.Context, name, result string) {
f.evaluations += 1
f.evaluationsResult = result
}
func (f *fakeAuthorizerMetrics) RecordWebhookDuration(ctx context.Context, name, result string, duration float64) {
f.duration = duration
f.durationResult = result
}
func (f *fakeAuthorizerMetrics) RecordWebhookFailOpen(ctx context.Context, name, result string) {
f.failOpen += 1
f.failOpenResult = result
}

View File

@ -20,12 +20,15 @@ package webhook
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"strconv"
"time"
authorizationv1 "k8s.io/api/authorization/v1"
authorizationv1beta1 "k8s.io/api/authorization/v1beta1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
@ -233,6 +236,7 @@ func (w *WebhookAuthorizer) Authorize(ctx context.Context, attr authorizer.Attri
r.Status = entry.(authorizationv1.SubjectAccessReviewStatus)
} else {
var result *authorizationv1.SubjectAccessReview
var metricsResult string
// WithExponentialBackoff will return SAR create error (sarErr) if any.
if err := webhook.WithExponentialBackoff(ctx, w.retryBackoff, func() error {
var sarErr error
@ -242,6 +246,19 @@ func (w *WebhookAuthorizer) Authorize(ctx context.Context, attr authorizer.Attri
result, statusCode, sarErr = w.subjectAccessReview.Create(ctx, r, metav1.CreateOptions{})
latency := time.Since(start)
switch {
case sarErr == nil:
metricsResult = "success"
case ctx.Err() != nil:
metricsResult = "canceled"
case errors.Is(sarErr, context.DeadlineExceeded) || apierrors.IsTimeout(sarErr) || statusCode == http.StatusGatewayTimeout:
metricsResult = "timeout"
default:
metricsResult = "error"
}
w.metrics.RecordWebhookEvaluation(ctx, w.name, metricsResult)
w.metrics.RecordWebhookDuration(ctx, w.name, metricsResult, latency.Seconds())
if statusCode != 0 {
w.metrics.RecordRequestTotal(ctx, strconv.Itoa(statusCode))
w.metrics.RecordRequestLatency(ctx, strconv.Itoa(statusCode), latency.Seconds())
@ -256,6 +273,12 @@ func (w *WebhookAuthorizer) Authorize(ctx context.Context, attr authorizer.Attri
return sarErr
}, webhook.DefaultShouldRetry); err != nil {
klog.Errorf("Failed to make webhook authorizer request: %v", err)
// we're returning NoOpinion, and the parent context has not timed out or been canceled
if w.decisionOnError == authorizer.DecisionNoOpinion && ctx.Err() == nil {
w.metrics.RecordWebhookFailOpen(ctx, w.name, metricsResult)
}
return w.decisionOnError, "", err
}

View File

@ -315,11 +315,18 @@ type mockV1Service struct {
allow bool
statusCode int
called int
// reviewHook is called just before returning from the Review() method
reviewHook func(*authorizationv1.SubjectAccessReview)
}
func (m *mockV1Service) Review(r *authorizationv1.SubjectAccessReview) {
m.called++
r.Status.Allowed = m.allow
if m.reviewHook != nil {
m.reviewHook(r)
}
}
func (m *mockV1Service) Allow() { m.allow = true }
func (m *mockV1Service) Deny() { m.allow = false }
@ -1414,5 +1421,6 @@ func celAuthorizerMetrics() metrics.AuthorizerMetrics {
type celAuthorizerMetricsType struct {
metrics.NoopRequestMetrics
metrics.NoopWebhookMetrics
celmetrics.MatcherMetrics
}