apiserver/pkg/storage/etcd3/store.go

1099 lines
36 KiB
Go

/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"bytes"
"context"
"errors"
"fmt"
"path"
"reflect"
"strings"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"go.opentelemetry.io/otel/attribute"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/conversion"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/audit"
endpointsrequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
etcdfeature "k8s.io/apiserver/pkg/storage/feature"
"k8s.io/apiserver/pkg/storage/value"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/tracing"
"k8s.io/klog/v2"
)
const (
// maxLimit is a maximum page limit increase used when fetching objects from etcd.
// This limit is used only for increasing page size by kube-apiserver. If request
// specifies larger limit initially, it won't be changed.
maxLimit = 10000
)
// authenticatedDataString satisfies the value.Context interface. It uses the key to
// authenticate the stored data. This does not defend against reuse of previously
// encrypted values under the same key, but will prevent an attacker from using an
// encrypted value from a different key. A stronger authenticated data segment would
// include the etcd3 Version field (which is incremented on each write to a key and
// reset when the key is deleted), but an attacker with write access to etcd can
// force deletion and recreation of keys to weaken that angle.
type authenticatedDataString string
// AuthenticatedData implements the value.Context interface.
func (d authenticatedDataString) AuthenticatedData() []byte {
return []byte(string(d))
}
var _ value.Context = authenticatedDataString("")
type store struct {
client *clientv3.Client
codec runtime.Codec
versioner storage.Versioner
transformer value.Transformer
pathPrefix string
groupResource schema.GroupResource
groupResourceString string
watcher *watcher
leaseManager *leaseManager
}
func (s *store) RequestWatchProgress(ctx context.Context) error {
// Use watchContext to match ctx metadata provided when creating the watch.
// In best case scenario we would use the same context that watch was created, but there is no way access it from watchCache.
return s.client.RequestProgress(s.watchContext(ctx))
}
type objState struct {
obj runtime.Object
meta *storage.ResponseMeta
rev int64
data []byte
stale bool
}
// New returns an etcd3 implementation of storage.Interface.
func New(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) storage.Interface {
return newStore(c, codec, newFunc, newListFunc, prefix, resourcePrefix, groupResource, transformer, leaseManagerConfig)
}
func newStore(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) *store {
versioner := storage.APIObjectVersioner{}
// for compatibility with etcd2 impl.
// no-op for default prefix of '/registry'.
// keeps compatibility with etcd2 impl for custom prefixes that don't start with '/'
pathPrefix := path.Join("/", prefix)
if !strings.HasSuffix(pathPrefix, "/") {
// Ensure the pathPrefix ends in "/" here to simplify key concatenation later.
pathPrefix += "/"
}
w := &watcher{
client: c,
codec: codec,
newFunc: newFunc,
groupResource: groupResource,
versioner: versioner,
transformer: transformer,
}
if newFunc == nil {
w.objectType = "<unknown>"
} else {
w.objectType = reflect.TypeOf(newFunc()).String()
}
s := &store{
client: c,
codec: codec,
versioner: versioner,
transformer: transformer,
pathPrefix: pathPrefix,
groupResource: groupResource,
groupResourceString: groupResource.String(),
watcher: w,
leaseManager: newDefaultLeaseManager(c, leaseManagerConfig),
}
w.getCurrentStorageRV = func(ctx context.Context) (uint64, error) {
return storage.GetCurrentResourceVersionFromStorage(ctx, s, newListFunc, resourcePrefix, w.objectType)
}
if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache) || utilfeature.DefaultFeatureGate.Enabled(features.WatchList) {
etcdfeature.DefaultFeatureSupportChecker.CheckClient(c.Ctx(), c, storage.RequestWatchProgress)
}
return s
}
// Versioner implements storage.Interface.Versioner.
func (s *store) Versioner() storage.Versioner {
return s.versioner
}
// Get implements storage.Interface.Get.
func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, out runtime.Object) error {
preparedKey, err := s.prepareKey(key)
if err != nil {
return err
}
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return err
}
if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
return err
}
if len(getResp.Kvs) == 0 {
if opts.IgnoreNotFound {
return runtime.SetZeroValue(out)
}
return storage.NewKeyNotFoundError(preparedKey, 0)
}
kv := getResp.Kvs[0]
data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(preparedKey))
if err != nil {
return storage.NewInternalError(err.Error())
}
err = decode(s.codec, s.versioner, data, out, kv.ModRevision)
if err != nil {
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
return nil
}
// Create implements storage.Interface.Create.
func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error {
preparedKey, err := s.prepareKey(key)
if err != nil {
return err
}
ctx, span := tracing.Start(ctx, "Create etcd3",
attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
attribute.String("key", key),
attribute.String("type", getTypeName(obj)),
attribute.String("resource", s.groupResourceString),
)
defer span.End(500 * time.Millisecond)
if version, err := s.versioner.ObjectResourceVersion(obj); err == nil && version != 0 {
return storage.ErrResourceVersionSetOnCreate
}
if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
return fmt.Errorf("PrepareObjectForStorage failed: %v", err)
}
span.AddEvent("About to Encode")
data, err := runtime.Encode(s.codec, obj)
if err != nil {
span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
return err
}
span.AddEvent("Encode succeeded", attribute.Int("len", len(data)))
opts, err := s.ttlOpts(ctx, int64(ttl))
if err != nil {
return err
}
newData, err := s.transformer.TransformToStorage(ctx, data, authenticatedDataString(preparedKey))
if err != nil {
span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error()))
return storage.NewInternalError(err.Error())
}
span.AddEvent("TransformToStorage succeeded")
startTime := time.Now()
txnResp, err := s.client.KV.Txn(ctx).If(
notFound(preparedKey),
).Then(
clientv3.OpPut(preparedKey, string(newData), opts...),
).Commit()
metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
}
span.AddEvent("Txn call succeeded")
if !txnResp.Succeeded {
return storage.NewKeyExistsError(preparedKey, 0)
}
if out != nil {
putResp := txnResp.Responses[0].GetResponsePut()
err = decode(s.codec, s.versioner, data, out, putResp.Header.Revision)
if err != nil {
span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
}
return nil
}
// Delete implements storage.Interface.Delete.
func (s *store) Delete(
ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions,
validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object, opts storage.DeleteOptions) error {
preparedKey, err := s.prepareKey(key)
if err != nil {
return err
}
v, err := conversion.EnforcePtr(out)
if err != nil {
return fmt.Errorf("unable to convert output object to pointer: %v", err)
}
skipTransformDecode := false
return s.conditionalDelete(ctx, preparedKey, out, v, preconditions, validateDeletion, cachedExistingObject, skipTransformDecode)
}
func (s *store) conditionalDelete(
ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions,
validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object, skipTransformDecode bool) error {
getCurrentState := s.getCurrentState(ctx, key, v, false, skipTransformDecode)
var origState *objState
var err error
var origStateIsCurrent bool
if cachedExistingObject != nil {
origState, err = s.getStateFromObject(cachedExistingObject)
} else {
origState, err = getCurrentState()
origStateIsCurrent = true
}
if err != nil {
return err
}
for {
if preconditions != nil {
if err := preconditions.Check(key, origState.obj); err != nil {
if origStateIsCurrent {
return err
}
// It's possible we're working with stale data.
// Remember the revision of the potentially stale data and the resulting update error
cachedRev := origState.rev
cachedUpdateErr := err
// Actually fetch
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
// it turns out our cached data was not stale, return the error
if cachedRev == origState.rev {
return cachedUpdateErr
}
// Retry
continue
}
}
if err := validateDeletion(ctx, origState.obj); err != nil {
if origStateIsCurrent {
return err
}
// It's possible we're working with stale data.
// Remember the revision of the potentially stale data and the resulting update error
cachedRev := origState.rev
cachedUpdateErr := err
// Actually fetch
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
// it turns out our cached data was not stale, return the error
if cachedRev == origState.rev {
return cachedUpdateErr
}
// Retry
continue
}
startTime := time.Now()
txnResp, err := s.client.KV.Txn(ctx).If(
clientv3.Compare(clientv3.ModRevision(key), "=", origState.rev),
).Then(
clientv3.OpDelete(key),
).Else(
clientv3.OpGet(key),
).Commit()
metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
if err != nil {
return err
}
if !txnResp.Succeeded {
getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
klog.V(4).Infof("deletion of %s failed because of a conflict, going to retry", key)
origState, err = s.getState(ctx, getResp, key, v, false, skipTransformDecode)
if err != nil {
return err
}
origStateIsCurrent = true
continue
}
if len(txnResp.Responses) == 0 || txnResp.Responses[0].GetResponseDeleteRange() == nil {
return errors.New(fmt.Sprintf("invalid DeleteRange response: %v", txnResp.Responses))
}
deleteResp := txnResp.Responses[0].GetResponseDeleteRange()
if deleteResp.Header == nil {
return errors.New("invalid DeleteRange response - nil header")
}
if !skipTransformDecode {
err = decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision)
if err != nil {
recordDecodeError(s.groupResourceString, key)
return err
}
}
return nil
}
}
// GuaranteedUpdate implements storage.Interface.GuaranteedUpdate.
func (s *store) GuaranteedUpdate(
ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool,
preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error {
preparedKey, err := s.prepareKey(key)
if err != nil {
return err
}
ctx, span := tracing.Start(ctx, "GuaranteedUpdate etcd3",
attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
attribute.String("key", key),
attribute.String("type", getTypeName(destination)),
attribute.String("resource", s.groupResourceString))
defer span.End(500 * time.Millisecond)
v, err := conversion.EnforcePtr(destination)
if err != nil {
return fmt.Errorf("unable to convert output object to pointer: %v", err)
}
skipTransformDecode := false
getCurrentState := s.getCurrentState(ctx, preparedKey, v, ignoreNotFound, skipTransformDecode)
var origState *objState
var origStateIsCurrent bool
if cachedExistingObject != nil {
origState, err = s.getStateFromObject(cachedExistingObject)
} else {
origState, err = getCurrentState()
origStateIsCurrent = true
}
if err != nil {
return err
}
span.AddEvent("initial value restored")
transformContext := authenticatedDataString(preparedKey)
for {
if err := preconditions.Check(preparedKey, origState.obj); err != nil {
// If our data is already up to date, return the error
if origStateIsCurrent {
return err
}
// It's possible we were working with stale data
// Actually fetch
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
// Retry
continue
}
ret, ttl, err := s.updateState(origState, tryUpdate)
if err != nil {
// If our data is already up to date, return the error
if origStateIsCurrent {
return err
}
// It's possible we were working with stale data
// Remember the revision of the potentially stale data and the resulting update error
cachedRev := origState.rev
cachedUpdateErr := err
// Actually fetch
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
// it turns out our cached data was not stale, return the error
if cachedRev == origState.rev {
return cachedUpdateErr
}
// Retry
continue
}
span.AddEvent("About to Encode")
data, err := runtime.Encode(s.codec, ret)
if err != nil {
span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
return err
}
span.AddEvent("Encode succeeded", attribute.Int("len", len(data)))
if !origState.stale && bytes.Equal(data, origState.data) {
// if we skipped the original Get in this loop, we must refresh from
// etcd in order to be sure the data in the store is equivalent to
// our desired serialization
if !origStateIsCurrent {
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
if !bytes.Equal(data, origState.data) {
// original data changed, restart loop
continue
}
}
// recheck that the data from etcd is not stale before short-circuiting a write
if !origState.stale {
err = decode(s.codec, s.versioner, origState.data, destination, origState.rev)
if err != nil {
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
return nil
}
}
newData, err := s.transformer.TransformToStorage(ctx, data, transformContext)
if err != nil {
span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error()))
return storage.NewInternalError(err.Error())
}
span.AddEvent("TransformToStorage succeeded")
opts, err := s.ttlOpts(ctx, int64(ttl))
if err != nil {
return err
}
span.AddEvent("Transaction prepared")
startTime := time.Now()
txnResp, err := s.client.KV.Txn(ctx).If(
clientv3.Compare(clientv3.ModRevision(preparedKey), "=", origState.rev),
).Then(
clientv3.OpPut(preparedKey, string(newData), opts...),
).Else(
clientv3.OpGet(preparedKey),
).Commit()
metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
}
span.AddEvent("Txn call completed")
span.AddEvent("Transaction committed")
if !txnResp.Succeeded {
getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
klog.V(4).Infof("GuaranteedUpdate of %s failed because of a conflict, going to retry", preparedKey)
skipTransformDecode := false
origState, err = s.getState(ctx, getResp, preparedKey, v, ignoreNotFound, skipTransformDecode)
if err != nil {
return err
}
span.AddEvent("Retry value restored")
origStateIsCurrent = true
continue
}
putResp := txnResp.Responses[0].GetResponsePut()
err = decode(s.codec, s.versioner, data, destination, putResp.Header.Revision)
if err != nil {
span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
return nil
}
}
func getNewItemFunc(listObj runtime.Object, v reflect.Value) func() runtime.Object {
// For unstructured lists with a target group/version, preserve the group/version in the instantiated list items
if unstructuredList, isUnstructured := listObj.(*unstructured.UnstructuredList); isUnstructured {
if apiVersion := unstructuredList.GetAPIVersion(); len(apiVersion) > 0 {
return func() runtime.Object {
return &unstructured.Unstructured{Object: map[string]interface{}{"apiVersion": apiVersion}}
}
}
}
// Otherwise just instantiate an empty item
elem := v.Type().Elem()
return func() runtime.Object {
return reflect.New(elem).Interface().(runtime.Object)
}
}
func (s *store) Count(key string) (int64, error) {
preparedKey, err := s.prepareKey(key)
if err != nil {
return 0, err
}
// We need to make sure the key ended with "/" so that we only get children "directories".
// e.g. if we have key "/a", "/a/b", "/ab", getting keys with prefix "/a" will return all three,
// while with prefix "/a/" will return only "/a/b" which is the correct answer.
if !strings.HasSuffix(preparedKey, "/") {
preparedKey += "/"
}
startTime := time.Now()
getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
if err != nil {
return 0, err
}
return getResp.Count, nil
}
// ReadinessCheck implements storage.Interface.
func (s *store) ReadinessCheck() error {
return nil
}
// resolveGetListRev is used by GetList to resolve the rev to use in the client.KV.Get request.
func (s *store) resolveGetListRev(continueKey string, continueRV int64, opts storage.ListOptions) (int64, error) {
var withRev int64
// Uses continueRV if this is a continuation request.
if len(continueKey) > 0 {
if len(opts.ResourceVersion) > 0 && opts.ResourceVersion != "0" {
return withRev, apierrors.NewBadRequest("specifying resource version is not allowed when using continue")
}
// If continueRV > 0, the LIST request needs a specific resource version.
// continueRV==0 is invalid.
// If continueRV < 0, the request is for the latest resource version.
if continueRV > 0 {
withRev = continueRV
}
return withRev, nil
}
// Returns 0 if ResourceVersion is not specified.
if len(opts.ResourceVersion) == 0 {
return withRev, nil
}
parsedRV, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
if err != nil {
return withRev, apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
}
switch opts.ResourceVersionMatch {
case metav1.ResourceVersionMatchNotOlderThan:
// The not older than constraint is checked after we get a response from etcd,
// and returnedRV is then set to the revision we get from the etcd response.
case metav1.ResourceVersionMatchExact:
withRev = int64(parsedRV)
case "": // legacy case
if opts.Recursive && opts.Predicate.Limit > 0 && parsedRV > 0 {
withRev = int64(parsedRV)
}
default:
return withRev, fmt.Errorf("unknown ResourceVersionMatch value: %v", opts.ResourceVersionMatch)
}
return withRev, nil
}
// GetList implements storage.Interface.
func (s *store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
preparedKey, err := s.prepareKey(key)
if err != nil {
return err
}
ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", opts.Recursive),
attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
attribute.String("key", key),
attribute.String("resourceVersion", opts.ResourceVersion),
attribute.String("resourceVersionMatch", string(opts.ResourceVersionMatch)),
attribute.Int("limit", int(opts.Predicate.Limit)),
attribute.String("continue", opts.Predicate.Continue))
defer span.End(500 * time.Millisecond)
listPtr, err := meta.GetItemsPtr(listObj)
if err != nil {
return err
}
v, err := conversion.EnforcePtr(listPtr)
if err != nil || v.Kind() != reflect.Slice {
return fmt.Errorf("need ptr to slice: %v", err)
}
// For recursive lists, we need to make sure the key ended with "/" so that we only
// get children "directories". e.g. if we have key "/a", "/a/b", "/ab", getting keys
// with prefix "/a" will return all three, while with prefix "/a/" will return only
// "/a/b" which is the correct answer.
if opts.Recursive && !strings.HasSuffix(preparedKey, "/") {
preparedKey += "/"
}
keyPrefix := preparedKey
// set the appropriate clientv3 options to filter the returned data set
var limitOption *clientv3.OpOption
limit := opts.Predicate.Limit
var paging bool
options := make([]clientv3.OpOption, 0, 4)
if opts.Predicate.Limit > 0 {
paging = true
options = append(options, clientv3.WithLimit(limit))
limitOption = &options[len(options)-1]
}
if opts.Recursive {
rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
options = append(options, clientv3.WithRange(rangeEnd))
}
newItemFunc := getNewItemFunc(listObj, v)
var continueRV, withRev int64
var continueKey string
if opts.Recursive && len(opts.Predicate.Continue) > 0 {
continueKey, continueRV, err = storage.DecodeContinue(opts.Predicate.Continue, keyPrefix)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid continue token: %v", err))
}
preparedKey = continueKey
}
if withRev, err = s.resolveGetListRev(continueKey, continueRV, opts); err != nil {
return err
}
if withRev != 0 {
options = append(options, clientv3.WithRev(withRev))
}
// loop until we have filled the requested limit from etcd or there are no more results
var lastKey []byte
var hasMore bool
var getResp *clientv3.GetResponse
var numFetched int
var numEvald int
// Because these metrics are for understanding the costs of handling LIST requests,
// get them recorded even in error cases.
defer func() {
numReturn := v.Len()
metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
}()
metricsOp := "get"
if opts.Recursive {
metricsOp = "list"
}
for {
startTime := time.Now()
getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
if err != nil {
return interpretListError(err, len(opts.Predicate.Continue) > 0, continueKey, keyPrefix)
}
numFetched += len(getResp.Kvs)
if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
return err
}
hasMore = getResp.More
if len(getResp.Kvs) == 0 && getResp.More {
return fmt.Errorf("no results were found, but etcd indicated there were more values remaining")
}
// indicate to the client which resource version was returned, and use the same resource version for subsequent requests.
if withRev == 0 {
withRev = getResp.Header.Revision
options = append(options, clientv3.WithRev(withRev))
}
// avoid small allocations for the result slice, since this can be called in many
// different contexts and we don't know how significantly the result will be filtered
if opts.Predicate.Empty() {
growSlice(v, len(getResp.Kvs))
} else {
growSlice(v, 2048, len(getResp.Kvs))
}
// take items from the response until the bucket is full, filtering as we go
for i, kv := range getResp.Kvs {
if paging && int64(v.Len()) >= opts.Predicate.Limit {
hasMore = true
break
}
lastKey = kv.Key
data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(kv.Key))
if err != nil {
return storage.NewInternalErrorf("unable to transform key %q: %v", kv.Key, err)
}
// Check if the request has already timed out before decode object
select {
case <-ctx.Done():
// parent context is canceled or timed out, no point in continuing
return storage.NewTimeoutError(string(kv.Key), "request did not complete within requested timeout")
default:
}
obj, err := decodeListItem(ctx, data, uint64(kv.ModRevision), s.codec, s.versioner, newItemFunc)
if err != nil {
recordDecodeError(s.groupResourceString, string(kv.Key))
return err
}
// being unable to set the version does not prevent the object from being extracted
if matched, err := opts.Predicate.Matches(obj); err == nil && matched {
v.Set(reflect.Append(v, reflect.ValueOf(obj).Elem()))
}
numEvald++
// free kv early. Long lists can take O(seconds) to decode.
getResp.Kvs[i] = nil
}
// no more results remain or we didn't request paging
if !hasMore || !paging {
break
}
// we're paging but we have filled our bucket
if int64(v.Len()) >= opts.Predicate.Limit {
break
}
if limit < maxLimit {
// We got incomplete result due to field/label selector dropping the object.
// Double page size to reduce total number of calls to etcd.
limit *= 2
if limit > maxLimit {
limit = maxLimit
}
*limitOption = clientv3.WithLimit(limit)
}
preparedKey = string(lastKey) + "\x00"
}
if v.IsNil() {
// Ensure that we never return a nil Items pointer in the result for consistency.
v.Set(reflect.MakeSlice(v.Type(), 0, 0))
}
continueValue, remainingItemCount, err := storage.PrepareContinueToken(string(lastKey), keyPrefix, withRev, getResp.Count, hasMore, opts)
if err != nil {
return err
}
return s.versioner.UpdateList(listObj, uint64(withRev), continueValue, remainingItemCount)
}
// growSlice takes a slice value and grows its capacity up
// to the maximum of the passed sizes or maxCapacity, whichever
// is smaller. Above maxCapacity decisions about allocation are left
// to the Go runtime on append. This allows a caller to make an
// educated guess about the potential size of the total list while
// still avoiding overly aggressive initial allocation. If sizes
// is empty maxCapacity will be used as the size to grow.
func growSlice(v reflect.Value, maxCapacity int, sizes ...int) {
cap := v.Cap()
max := cap
for _, size := range sizes {
if size > max {
max = size
}
}
if len(sizes) == 0 || max > maxCapacity {
max = maxCapacity
}
if max <= cap {
return
}
if v.Len() > 0 {
extra := reflect.MakeSlice(v.Type(), v.Len(), max)
reflect.Copy(extra, v)
v.Set(extra)
} else {
extra := reflect.MakeSlice(v.Type(), 0, max)
v.Set(extra)
}
}
// Watch implements storage.Interface.Watch.
func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
preparedKey, err := s.prepareKey(key)
if err != nil {
return nil, err
}
rev, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
if err != nil {
return nil, err
}
return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts)
}
func (s *store) watchContext(ctx context.Context) context.Context {
// The etcd server waits until it cannot find a leader for 3 election
// timeouts to cancel existing streams. 3 is currently a hard coded
// constant. The election timeout defaults to 1000ms. If the cluster is
// healthy, when the leader is stopped, the leadership transfer should be
// smooth. (leader transfers its leadership before stopping). If leader is
// hard killed, other servers will take an election timeout to realize
// leader lost and start campaign.
return clientv3.WithRequireLeader(ctx)
}
func (s *store) getCurrentState(ctx context.Context, key string, v reflect.Value, ignoreNotFound bool, skipTransformDecode bool) func() (*objState, error) {
return func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return nil, err
}
return s.getState(ctx, getResp, key, v, ignoreNotFound, skipTransformDecode)
}
}
// getState constructs a new objState from the given response from the storage.
// skipTransformDecode: if true, the function will neither transform the data
// from the storage nor decode it into an object; otherwise, data from the
// storage will be transformed and decoded.
// NOTE: when skipTransformDecode is true, the 'data', and the 'obj' fields
// of the objState will be nil, and 'stale' will be set to true.
func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool, skipTransformDecode bool) (*objState, error) {
state := &objState{
meta: &storage.ResponseMeta{},
}
if u, ok := v.Addr().Interface().(runtime.Unstructured); ok {
state.obj = u.NewEmptyInstance()
} else {
state.obj = reflect.New(v.Type()).Interface().(runtime.Object)
}
if len(getResp.Kvs) == 0 {
if !ignoreNotFound {
return nil, storage.NewKeyNotFoundError(key, 0)
}
if err := runtime.SetZeroValue(state.obj); err != nil {
return nil, err
}
} else {
state.rev = getResp.Kvs[0].ModRevision
state.meta.ResourceVersion = uint64(state.rev)
if skipTransformDecode {
// be explicit that we don't have the object
state.obj = nil
state.stale = true // this seems a more sane value here
return state, nil
}
data, stale, err := s.transformer.TransformFromStorage(ctx, getResp.Kvs[0].Value, authenticatedDataString(key))
if err != nil {
return nil, storage.NewInternalError(err.Error())
}
state.data = data
state.stale = stale
if err := decode(s.codec, s.versioner, state.data, state.obj, state.rev); err != nil {
recordDecodeError(s.groupResourceString, key)
return nil, err
}
}
return state, nil
}
func (s *store) getStateFromObject(obj runtime.Object) (*objState, error) {
state := &objState{
obj: obj,
meta: &storage.ResponseMeta{},
}
rv, err := s.versioner.ObjectResourceVersion(obj)
if err != nil {
return nil, fmt.Errorf("couldn't get resource version: %v", err)
}
state.rev = int64(rv)
state.meta.ResourceVersion = uint64(state.rev)
// Compute the serialized form - for that we need to temporarily clean
// its resource version field (those are not stored in etcd).
if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
return nil, fmt.Errorf("PrepareObjectForStorage failed: %v", err)
}
state.data, err = runtime.Encode(s.codec, obj)
if err != nil {
return nil, err
}
if err := s.versioner.UpdateObject(state.obj, uint64(rv)); err != nil {
klog.Errorf("failed to update object version: %v", err)
}
return state, nil
}
func (s *store) updateState(st *objState, userUpdate storage.UpdateFunc) (runtime.Object, uint64, error) {
ret, ttlPtr, err := userUpdate(st.obj, *st.meta)
if err != nil {
return nil, 0, err
}
if err := s.versioner.PrepareObjectForStorage(ret); err != nil {
return nil, 0, fmt.Errorf("PrepareObjectForStorage failed: %v", err)
}
var ttl uint64
if ttlPtr != nil {
ttl = *ttlPtr
}
return ret, ttl, nil
}
// ttlOpts returns client options based on given ttl.
// ttl: if ttl is non-zero, it will attach the key to a lease with ttl of roughly the same length
func (s *store) ttlOpts(ctx context.Context, ttl int64) ([]clientv3.OpOption, error) {
if ttl == 0 {
return nil, nil
}
id, err := s.leaseManager.GetLease(ctx, ttl)
if err != nil {
return nil, err
}
return []clientv3.OpOption{clientv3.WithLease(id)}, nil
}
// validateMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is
// greater than the most recent actualRevision available from storage.
func (s *store) validateMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error {
if minimumResourceVersion == "" {
return nil
}
minimumRV, err := s.versioner.ParseResourceVersion(minimumResourceVersion)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
}
// Enforce the storage.Interface guarantee that the resource version of the returned data
// "will be at least 'resourceVersion'".
if minimumRV > actualRevision {
return storage.NewTooLargeResourceVersionError(minimumRV, actualRevision, 0)
}
return nil
}
func (s *store) prepareKey(key string) (string, error) {
if key == ".." ||
strings.HasPrefix(key, "../") ||
strings.HasSuffix(key, "/..") ||
strings.Contains(key, "/../") {
return "", fmt.Errorf("invalid key: %q", key)
}
if key == "." ||
strings.HasPrefix(key, "./") ||
strings.HasSuffix(key, "/.") ||
strings.Contains(key, "/./") {
return "", fmt.Errorf("invalid key: %q", key)
}
if key == "" || key == "/" {
return "", fmt.Errorf("empty key: %q", key)
}
// We ensured that pathPrefix ends in '/' in construction, so skip any leading '/' in the key now.
startIndex := 0
if key[0] == '/' {
startIndex = 1
}
return s.pathPrefix + key[startIndex:], nil
}
// decode decodes value of bytes into object. It will also set the object resource version to rev.
// On success, objPtr would be set to the object.
func decode(codec runtime.Codec, versioner storage.Versioner, value []byte, objPtr runtime.Object, rev int64) error {
if _, err := conversion.EnforcePtr(objPtr); err != nil {
return fmt.Errorf("unable to convert output object to pointer: %v", err)
}
_, _, err := codec.Decode(value, nil, objPtr)
if err != nil {
return err
}
// being unable to set the version does not prevent the object from being extracted
if err := versioner.UpdateObject(objPtr, uint64(rev)); err != nil {
klog.Errorf("failed to update object version: %v", err)
}
return nil
}
// decodeListItem decodes bytes value in array into object.
func decodeListItem(ctx context.Context, data []byte, rev uint64, codec runtime.Codec, versioner storage.Versioner, newItemFunc func() runtime.Object) (runtime.Object, error) {
startedAt := time.Now()
defer func() {
endpointsrequest.TrackDecodeLatency(ctx, time.Since(startedAt))
}()
obj, _, err := codec.Decode(data, nil, newItemFunc())
if err != nil {
return nil, err
}
if err := versioner.UpdateObject(obj, rev); err != nil {
klog.Errorf("failed to update object version: %v", err)
}
return obj, nil
}
// recordDecodeError record decode error split by object type.
func recordDecodeError(resource string, key string) {
metrics.RecordDecodeError(resource)
klog.V(4).Infof("Decoding %s \"%s\" failed", resource, key)
}
func notFound(key string) clientv3.Cmp {
return clientv3.Compare(clientv3.ModRevision(key), "=", 0)
}
// getTypeName returns type name of an object for reporting purposes.
func getTypeName(obj interface{}) string {
return reflect.TypeOf(obj).String()
}