Fix use of a stale pod when eviction fails and is retried (#133461)

* fix use of a stale pod when eviction fails and is retried

* simplify pod refresh process

* use activePod at getPodFn

* fix lint check

* add unit test

* introduce EvictErrorRetryDelay

Kubernetes-commit: 66fdbe105831e08b588dd01039a7e3130fd2d36f
This commit is contained in:
Lorain 2025-08-28 09:29:39 +08:00 committed by Kubernetes Publisher
parent 4ce6135c24
commit 9e0a26615e
5 changed files with 142 additions and 32 deletions

4
go.mod
View File

@ -30,10 +30,10 @@ require (
golang.org/x/sys v0.31.0
gopkg.in/evanphx/json-patch.v4 v4.12.0
k8s.io/api v0.0.0-20250816062245-fa01e40890d0
k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84
k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8
k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67
k8s.io/client-go v0.0.0-20250828035311-d07f455e6554
k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c
k8s.io/component-base v0.0.0-20250828040528-7068757664be
k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99
k8s.io/klog/v2 v2.130.1
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b

8
go.sum
View File

@ -200,14 +200,14 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.0.0-20250816062245-fa01e40890d0 h1:WddRlAJwdWiTmGknuGqNHLxJ7RaF3bqjd933VhVCUes=
k8s.io/api v0.0.0-20250816062245-fa01e40890d0/go.mod h1:PyEssxRzobRLFX/lEYzx5NDkS4JYE20SOKUZjTH0nvI=
k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84 h1:rMqDsUPA2nfIdTtHQT7BPacZ0SfH/oPG6zCKqy38wfQ=
k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8 h1:10c4JlI2tLfqaloDfee5QCraT3rUTchWawsjf2fOQvw=
k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67 h1:p6C/SYa7jZ4HpdNRIU/xJbY4lkMwl95iS6WkYmL3jE8=
k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67/go.mod h1:HIyOYCeQzm69fJySABxiMZ3Jw+tMJfIKesAiuPd+3No=
k8s.io/client-go v0.0.0-20250828035311-d07f455e6554 h1:nO7QX5+X24G4/feoMOeY98KqpHIxcntPLuhDeoacti8=
k8s.io/client-go v0.0.0-20250828035311-d07f455e6554/go.mod h1:xDExkHfWa76StdAOQuvSdZCD42iJau7+a60Bhd/QGL8=
k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c h1:FNLYAa5FW9WUcwMWWyi1eE9zU73X3buvsrAiAsJ7XMI=
k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c/go.mod h1:q5RKMm+VwuOM+tYS+fe3xrb+vsowL6MIE5peUlpLQPc=
k8s.io/component-base v0.0.0-20250828040528-7068757664be h1:XsM2H6GUNYpLJKXZCfqsY4SFp4u9k/zsZv/xGXsn0mE=
k8s.io/component-base v0.0.0-20250828040528-7068757664be/go.mod h1:wUK0PCYJm7+fH9q/BZoPjS6aEzhPqAB8C/8eGGKbVIc=
k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99 h1:RntyireBPwOrXJ89wKt6kNrB/yVT4eCWuv/il/czsoM=
k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99/go.mod h1:2PiD4/9sXcLwnpwFutLSOdjunPFkRZZD8D6+rWb8amI=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=

View File

@ -19,6 +19,7 @@ package drain
import (
"errors"
"fmt"
"time"
"github.com/spf13/cobra"
@ -151,10 +152,11 @@ func NewDrainCmdOptions(f cmdutil.Factory, ioStreams genericiooptions.IOStreams)
PrintFlags: genericclioptions.NewPrintFlags("drained").WithTypeSetter(scheme.Scheme),
IOStreams: ioStreams,
drainer: &drain.Helper{
GracePeriodSeconds: -1,
Out: ioStreams.Out,
ErrOut: ioStreams.ErrOut,
ChunkSize: cmdutil.DefaultChunkSize,
GracePeriodSeconds: -1,
EvictErrorRetryDelay: 5 * time.Second,
Out: ioStreams.Out,
ErrOut: ioStreams.ErrOut,
ChunkSize: cmdutil.DefaultChunkSize,
},
}
o.drainer.OnPodDeletionOrEvictionFinished = o.onPodDeletionOrEvictionFinished

View File

@ -74,6 +74,9 @@ type Helper struct {
// won't drain otherwise
SkipWaitForDeleteTimeoutSeconds int
// EvictErrorRetryDelay is used to control the retry delay after a pod eviction error
EvictErrorRetryDelay time.Duration
// AdditionalFilters are applied sequentially after base drain filters to
// exclude pods using custom logic. Any filter that returns PodDeleteStatus
// with Delete == false will immediately stop execution of further filters.
@ -278,37 +281,27 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
defer cancel()
for _, pod := range pods {
go func(pod corev1.Pod, returnCh chan error) {
refreshPod := false
activePod := pod
for {
switch d.DryRunStrategy {
case cmdutil.DryRunServer:
fmt.Fprintf(d.Out, "evicting pod %s/%s (server dry run)\n", pod.Namespace, pod.Name)
//nolint:errcheck
fmt.Fprintf(d.Out, "evicting pod %s/%s (server dry run)\n", activePod.Namespace, activePod.Name)
default:
if d.OnPodDeletionOrEvictionStarted != nil {
d.OnPodDeletionOrEvictionStarted(&pod, true)
d.OnPodDeletionOrEvictionStarted(&activePod, true)
}
fmt.Fprintf(d.Out, "evicting pod %s/%s\n", pod.Namespace, pod.Name)
//nolint:errcheck
fmt.Fprintf(d.Out, "evicting pod %s/%s\n", activePod.Namespace, activePod.Name)
}
select {
case <-ctx.Done():
// return here or we'll leak a goroutine.
returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", pod.Name, pod.Namespace, globalTimeout)
returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", activePod.Name, activePod.Namespace, globalTimeout)
return
default:
}
// Create a temporary pod so we don't mutate the pod in the loop.
activePod := pod
if refreshPod {
freshPod, err := getPodFn(pod.Namespace, pod.Name)
// We ignore errors and let eviction sort it out with
// the original pod.
if err == nil {
activePod = *freshPod
}
refreshPod = false
}
err := d.EvictPod(activePod, evictionGroupVersion)
if err == nil {
break
@ -316,8 +309,9 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
returnCh <- nil
return
} else if apierrors.IsTooManyRequests(err) {
fmt.Fprintf(d.ErrOut, "error when evicting pods/%q -n %q (will retry after 5s): %v\n", activePod.Name, activePod.Namespace, err)
time.Sleep(5 * time.Second)
//nolint:errcheck
fmt.Fprintf(d.ErrOut, "error when evicting pods/%q -n %q (will retry after %v): %v\n", activePod.Name, activePod.Namespace, d.EvictErrorRetryDelay, err)
time.Sleep(d.EvictErrorRetryDelay)
} else if !activePod.ObjectMeta.DeletionTimestamp.IsZero() && apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
// an eviction request in a deleting namespace will throw a forbidden error,
// if the pod is already marked deleted, we can ignore this error, an eviction
@ -326,12 +320,19 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
} else if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
// an eviction request in a deleting namespace will throw a forbidden error,
// if the pod is not marked deleted, we retry until it is.
fmt.Fprintf(d.ErrOut, "error when evicting pod %q from terminating namespace %q (will retry after 5s): %v\n", activePod.Name, activePod.Namespace, err)
time.Sleep(5 * time.Second)
//nolint:errcheck
fmt.Fprintf(d.ErrOut, "error when evicting pod %q from terminating namespace %q (will retry after %v): %v\n", activePod.Name, activePod.Namespace, d.EvictErrorRetryDelay, err)
time.Sleep(d.EvictErrorRetryDelay)
} else {
returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: %v", activePod.Name, activePod.Namespace, err)
return
}
freshPod, err := getPodFn(activePod.Namespace, activePod.Name)
// we ignore errors and let eviction sort it out with the original pod.
if err == nil {
activePod = *freshPod
}
}
if d.DryRunStrategy == cmdutil.DryRunServer {
returnCh <- nil
@ -339,7 +340,7 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupV
}
params := waitForDeleteParams{
ctx: ctx,
pods: []corev1.Pod{pod},
pods: []corev1.Pod{activePod},
interval: 1 * time.Second,
timeout: time.Duration(math.MaxInt64),
usingEviction: true,

View File

@ -529,3 +529,110 @@ func TestFilterPods(t *testing.T) {
})
}
}
// TestEvictDuringNamespaceTerminating exercises DeleteOrEvictPods when every
// eviction request is rejected with a Forbidden error that carries the
// NamespaceTerminating cause.
//
// Two scenarios are covered:
//   - the pod returned by the "get" reactor gains a DeletionTimestamp after the
//     first failed eviction and is reported as NotFound afterwards, in which
//     case no error is expected;
//   - the "get" reactor keeps returning the initial pod (no DeletionTimestamp),
//     in which case the global-timeout error is expected.
func TestEvictDuringNamespaceTerminating(t *testing.T) {
	const (
		podName       = "test-pod"
		podNamespace  = "default"
		retryDelay    = 5 * time.Millisecond
		globalTimeout = 2 * retryDelay
	)
	podUID := types.UID("test-uid")

	// newPod builds the pod under test; a non-nil deletionTimestamp marks the
	// pod as having its deletion in progress.
	newPod := func(deletionTimestamp *metav1.Time) *corev1.Pod {
		return &corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:              podName,
				Namespace:         podNamespace,
				UID:               podUID,
				DeletionTimestamp: deletionTimestamp,
			},
		}
	}

	cases := []struct {
		name       string
		refreshPod bool
		wantErr    error
	}{
		{
			name:       "Pod refreshed after NamespaceTerminating error",
			refreshPod: true,
			wantErr:    nil,
		},
		{
			name:       "Pod not refreshed after NamespaceTerminating error",
			refreshPod: false,
			wantErr:    fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", podName, podNamespace, globalTimeout),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// alreadyFetched records whether the "get" reactor has served the
			// refreshed (deleting) pod once already.
			var alreadyFetched bool

			initialPod := newPod(nil)
			deletedPod := newPod(&metav1.Time{Time: time.Now()})
			podsToEvict := []corev1.Pod{*initialPod}

			clientset := fake.NewClientset(initialPod)
			addEvictionSupport(t, clientset, "v1")

			// Every eviction is rejected with a Forbidden error whose cause is
			// NamespaceTerminating; other "create pods" actions fall through.
			clientset.PrependReactor("create", "pods", func(action ktest.Action) (bool, runtime.Object, error) {
				if action.GetSubresource() != "eviction" {
					return false, nil, nil
				}
				forbidden := apierrors.NewForbidden(
					schema.GroupResource{Resource: "pods"},
					podName,
					errors.New("namespace is terminating"),
				)
				terminating := metav1.StatusCause{Type: corev1.NamespaceTerminatingCause}
				forbidden.ErrStatus.Details.Causes = append(forbidden.ErrStatus.Details.Causes, terminating)
				return true, nil, forbidden
			})

			clientset.PrependReactor("get", "pods", func(action ktest.Action) (bool, runtime.Object, error) {
				switch {
				case !tc.refreshPod:
					// Non-refresh scenario: the pod never changes.
					return true, initialPod, nil
				case alreadyFetched:
					// Second and later calls: the pod is gone.
					return true, nil, apierrors.NewNotFound(schema.GroupResource{Resource: "pods"}, podName)
				default:
					// First call: the pod is now being deleted.
					alreadyFetched = true
					return true, deletedPod, nil
				}
			})

			helper := &Helper{
				Client:               clientset,
				DisableEviction:      false,
				Out:                  os.Stdout,
				ErrOut:               os.Stderr,
				Timeout:              globalTimeout,
				EvictErrorRetryDelay: retryDelay,
			}

			gotErr := helper.DeleteOrEvictPods(podsToEvict)
			switch {
			case tc.wantErr == nil:
				if gotErr != nil {
					t.Errorf("expected no error, got: %v", gotErr)
				}
			case gotErr == nil || gotErr.Error() != tc.wantErr.Error():
				t.Errorf("%s: unexpected eviction; actual %v; expected %v", tc.name, gotErr, tc.wantErr)
			}
		})
	}
}