fix using stale pod when evict failed and retry (#133461)
* fix using stale pod when evict failed and retry * simplify pod refresh process * use activePod at getPodFn * fix lint check * add ut * introduce EvictErrorRetryDelay Kubernetes-commit: 66fdbe105831e08b588dd01039a7e3130fd2d36f
This commit is contained in:
parent
4ce6135c24
commit
9e0a26615e
4
go.mod
4
go.mod
|
@ -30,10 +30,10 @@ require (
|
|||
golang.org/x/sys v0.31.0
|
||||
gopkg.in/evanphx/json-patch.v4 v4.12.0
|
||||
k8s.io/api v0.0.0-20250816062245-fa01e40890d0
|
||||
k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84
|
||||
k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8
|
||||
k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67
|
||||
k8s.io/client-go v0.0.0-20250828035311-d07f455e6554
|
||||
k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c
|
||||
k8s.io/component-base v0.0.0-20250828040528-7068757664be
|
||||
k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99
|
||||
k8s.io/klog/v2 v2.130.1
|
||||
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b
|
||||
|
|
8
go.sum
8
go.sum
|
@ -200,14 +200,14 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
|||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
k8s.io/api v0.0.0-20250816062245-fa01e40890d0 h1:WddRlAJwdWiTmGknuGqNHLxJ7RaF3bqjd933VhVCUes=
|
||||
k8s.io/api v0.0.0-20250816062245-fa01e40890d0/go.mod h1:PyEssxRzobRLFX/lEYzx5NDkS4JYE20SOKUZjTH0nvI=
|
||||
k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84 h1:rMqDsUPA2nfIdTtHQT7BPacZ0SfH/oPG6zCKqy38wfQ=
|
||||
k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
|
||||
k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8 h1:10c4JlI2tLfqaloDfee5QCraT3rUTchWawsjf2fOQvw=
|
||||
k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
|
||||
k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67 h1:p6C/SYa7jZ4HpdNRIU/xJbY4lkMwl95iS6WkYmL3jE8=
|
||||
k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67/go.mod h1:HIyOYCeQzm69fJySABxiMZ3Jw+tMJfIKesAiuPd+3No=
|
||||
k8s.io/client-go v0.0.0-20250828035311-d07f455e6554 h1:nO7QX5+X24G4/feoMOeY98KqpHIxcntPLuhDeoacti8=
|
||||
k8s.io/client-go v0.0.0-20250828035311-d07f455e6554/go.mod h1:xDExkHfWa76StdAOQuvSdZCD42iJau7+a60Bhd/QGL8=
|
||||
k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c h1:FNLYAa5FW9WUcwMWWyi1eE9zU73X3buvsrAiAsJ7XMI=
|
||||
k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c/go.mod h1:q5RKMm+VwuOM+tYS+fe3xrb+vsowL6MIE5peUlpLQPc=
|
||||
k8s.io/component-base v0.0.0-20250828040528-7068757664be h1:XsM2H6GUNYpLJKXZCfqsY4SFp4u9k/zsZv/xGXsn0mE=
|
||||
k8s.io/component-base v0.0.0-20250828040528-7068757664be/go.mod h1:wUK0PCYJm7+fH9q/BZoPjS6aEzhPqAB8C/8eGGKbVIc=
|
||||
k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99 h1:RntyireBPwOrXJ89wKt6kNrB/yVT4eCWuv/il/czsoM=
|
||||
k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99/go.mod h1:2PiD4/9sXcLwnpwFutLSOdjunPFkRZZD8D6+rWb8amI=
|
||||
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
|
||||
|
|
|
@ -19,6 +19,7 @@ package drain
|
|||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
|
@ -151,10 +152,11 @@ func NewDrainCmdOptions(f cmdutil.Factory, ioStreams genericiooptions.IOStreams)
|
|||
PrintFlags: genericclioptions.NewPrintFlags("drained").WithTypeSetter(scheme.Scheme),
|
||||
IOStreams: ioStreams,
|
||||
drainer: &drain.Helper{
|
||||
GracePeriodSeconds: -1,
|
||||
Out: ioStreams.Out,
|
||||
ErrOut: ioStreams.ErrOut,
|
||||
ChunkSize: cmdutil.DefaultChunkSize,
|
||||
GracePeriodSeconds: -1,
|
||||
EvictErrorRetryDelay: 5 * time.Second,
|
||||
Out: ioStreams.Out,
|
||||
ErrOut: ioStreams.ErrOut,
|
||||
ChunkSize: cmdutil.DefaultChunkSize,
|
||||
},
|
||||
}
|
||||
o.drainer.OnPodDeletionOrEvictionFinished = o.onPodDeletionOrEvictionFinished
|
||||
|
|
|
@ -74,6 +74,9 @@ type Helper struct {
|
|||
// won't drain otherwise
|
||||
SkipWaitForDeleteTimeoutSeconds int
|
||||
|
||||
// EvictErrorRetryDelay is used to control the retry delay after a pod eviction error
|
||||
EvictErrorRetryDelay time.Duration
|
||||
|
||||
// AdditionalFilters are applied sequentially after base drain filters to
|
||||
// exclude pods using custom logic. Any filter that returns PodDeleteStatus
|
||||
// with Delete == false will immediately stop execution of further filters.
|
||||
|
@ -278,37 +281,27 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
|
|||
defer cancel()
|
||||
for _, pod := range pods {
|
||||
go func(pod corev1.Pod, returnCh chan error) {
|
||||
refreshPod := false
|
||||
activePod := pod
|
||||
for {
|
||||
switch d.DryRunStrategy {
|
||||
case cmdutil.DryRunServer:
|
||||
fmt.Fprintf(d.Out, "evicting pod %s/%s (server dry run)\n", pod.Namespace, pod.Name)
|
||||
//nolint:errcheck
|
||||
fmt.Fprintf(d.Out, "evicting pod %s/%s (server dry run)\n", activePod.Namespace, activePod.Name)
|
||||
default:
|
||||
if d.OnPodDeletionOrEvictionStarted != nil {
|
||||
d.OnPodDeletionOrEvictionStarted(&pod, true)
|
||||
d.OnPodDeletionOrEvictionStarted(&activePod, true)
|
||||
}
|
||||
fmt.Fprintf(d.Out, "evicting pod %s/%s\n", pod.Namespace, pod.Name)
|
||||
//nolint:errcheck
|
||||
fmt.Fprintf(d.Out, "evicting pod %s/%s\n", activePod.Namespace, activePod.Name)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// return here or we'll leak a goroutine.
|
||||
returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", pod.Name, pod.Namespace, globalTimeout)
|
||||
returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", activePod.Name, activePod.Namespace, globalTimeout)
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
// Create a temporary pod so we don't mutate the pod in the loop.
|
||||
activePod := pod
|
||||
if refreshPod {
|
||||
freshPod, err := getPodFn(pod.Namespace, pod.Name)
|
||||
// We ignore errors and let eviction sort it out with
|
||||
// the original pod.
|
||||
if err == nil {
|
||||
activePod = *freshPod
|
||||
}
|
||||
refreshPod = false
|
||||
}
|
||||
|
||||
err := d.EvictPod(activePod, evictionGroupVersion)
|
||||
if err == nil {
|
||||
break
|
||||
|
@ -316,8 +309,9 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
|
|||
returnCh <- nil
|
||||
return
|
||||
} else if apierrors.IsTooManyRequests(err) {
|
||||
fmt.Fprintf(d.ErrOut, "error when evicting pods/%q -n %q (will retry after 5s): %v\n", activePod.Name, activePod.Namespace, err)
|
||||
time.Sleep(5 * time.Second)
|
||||
//nolint:errcheck
|
||||
fmt.Fprintf(d.ErrOut, "error when evicting pods/%q -n %q (will retry after %v): %v\n", activePod.Name, activePod.Namespace, d.EvictErrorRetryDelay, err)
|
||||
time.Sleep(d.EvictErrorRetryDelay)
|
||||
} else if !activePod.ObjectMeta.DeletionTimestamp.IsZero() && apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
|
||||
// an eviction request in a deleting namespace will throw a forbidden error,
|
||||
// if the pod is already marked deleted, we can ignore this error, an eviction
|
||||
|
@ -326,12 +320,19 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
|
|||
} else if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
|
||||
// an eviction request in a deleting namespace will throw a forbidden error,
|
||||
// if the pod is not marked deleted, we retry until it is.
|
||||
fmt.Fprintf(d.ErrOut, "error when evicting pod %q from terminating namespace %q (will retry after 5s): %v\n", activePod.Name, activePod.Namespace, err)
|
||||
time.Sleep(5 * time.Second)
|
||||
//nolint:errcheck
|
||||
fmt.Fprintf(d.ErrOut, "error when evicting pod %q from terminating namespace %q (will retry after %v): %v\n", activePod.Name, activePod.Namespace, d.EvictErrorRetryDelay, err)
|
||||
time.Sleep(d.EvictErrorRetryDelay)
|
||||
} else {
|
||||
returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: %v", activePod.Name, activePod.Namespace, err)
|
||||
return
|
||||
}
|
||||
|
||||
freshPod, err := getPodFn(activePod.Namespace, activePod.Name)
|
||||
// we ignore errors and let eviction sort it out with the original pod.
|
||||
if err == nil {
|
||||
activePod = *freshPod
|
||||
}
|
||||
}
|
||||
if d.DryRunStrategy == cmdutil.DryRunServer {
|
||||
returnCh <- nil
|
||||
|
@ -339,7 +340,7 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
|
|||
}
|
||||
params := waitForDeleteParams{
|
||||
ctx: ctx,
|
||||
pods: []corev1.Pod{pod},
|
||||
pods: []corev1.Pod{activePod},
|
||||
interval: 1 * time.Second,
|
||||
timeout: time.Duration(math.MaxInt64),
|
||||
usingEviction: true,
|
||||
|
|
|
@ -529,3 +529,110 @@ func TestFilterPods(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvictDuringNamespaceTerminating(t *testing.T) {
|
||||
testPodUID := types.UID("test-uid")
|
||||
testPodName := "test-pod"
|
||||
testNamespace := "default"
|
||||
|
||||
retryDelay := 5 * time.Millisecond
|
||||
globalTimeout := 2 * retryDelay
|
||||
|
||||
tests := []struct {
|
||||
description string
|
||||
refresh bool
|
||||
err error
|
||||
}{
|
||||
{
|
||||
description: "Pod refreshed after NamespaceTerminating error",
|
||||
refresh: true,
|
||||
err: nil,
|
||||
},
|
||||
{
|
||||
description: "Pod not refreshed after NamespaceTerminating error",
|
||||
refresh: false,
|
||||
err: fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", testPodName, testNamespace, globalTimeout),
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.description, func(t *testing.T) {
|
||||
var retry bool
|
||||
|
||||
initialPod := &corev1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: testPodName,
|
||||
Namespace: testNamespace,
|
||||
UID: testPodUID,
|
||||
},
|
||||
}
|
||||
|
||||
// pod with DeletionTimestamp, indicating deletion in progress
|
||||
deletedPod := &corev1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: testPodName,
|
||||
Namespace: testNamespace,
|
||||
UID: testPodUID,
|
||||
DeletionTimestamp: &metav1.Time{Time: time.Now()},
|
||||
},
|
||||
}
|
||||
|
||||
evictPods := []corev1.Pod{*initialPod}
|
||||
|
||||
k := fake.NewClientset(initialPod)
|
||||
addEvictionSupport(t, k, "v1")
|
||||
|
||||
// mock eviction to return NamespaceTerminating error
|
||||
k.PrependReactor("create", "pods", func(action ktest.Action) (bool, runtime.Object, error) {
|
||||
if action.GetSubresource() != "eviction" {
|
||||
return false, nil, nil
|
||||
}
|
||||
|
||||
err := apierrors.NewForbidden(
|
||||
schema.GroupResource{Resource: "pods"},
|
||||
testPodName,
|
||||
errors.New("namespace is terminating"),
|
||||
)
|
||||
|
||||
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{
|
||||
Type: corev1.NamespaceTerminatingCause,
|
||||
})
|
||||
|
||||
return true, nil, err
|
||||
})
|
||||
|
||||
k.PrependReactor("get", "pods", func(action ktest.Action) (bool, runtime.Object, error) {
|
||||
if !test.refresh {
|
||||
// for non-refresh test, always return the initial pod
|
||||
return true, initialPod, nil
|
||||
}
|
||||
|
||||
if retry {
|
||||
// second call, pod is deleted
|
||||
return true, nil, apierrors.NewNotFound(schema.GroupResource{Resource: "pods"}, testPodName)
|
||||
}
|
||||
|
||||
// first call, pod is being deleted
|
||||
retry = true
|
||||
|
||||
return true, deletedPod, nil
|
||||
})
|
||||
|
||||
h := &Helper{
|
||||
Client: k,
|
||||
DisableEviction: false,
|
||||
Out: os.Stdout,
|
||||
ErrOut: os.Stderr,
|
||||
Timeout: globalTimeout,
|
||||
EvictErrorRetryDelay: retryDelay,
|
||||
}
|
||||
|
||||
err := h.DeleteOrEvictPods(evictPods)
|
||||
if test.err == nil && err != nil {
|
||||
t.Errorf("expected no error, got: %v", err)
|
||||
} else if test.err != nil && (err == nil || err.Error() != test.err.Error()) {
|
||||
t.Errorf("%s: unexpected eviction; actual %v; expected %v", test.description, err, test.err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue