fix using stale pod when evict failed and retry (#133461)
* fix using stale pod when evict failed and retry
* simplify pod refresh process
* use activePod at getPodFn
* fix lint check
* add ut
* introduce EvictErrorRetryDelay

Kubernetes-commit: 66fdbe105831e08b588dd01039a7e3130fd2d36f
parent 4ce6135c24
commit 9e0a26615e
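The bug this fixes: the eviction retry loop kept using the pod object captured when the loop started, so state that changed between attempts (for example, a DeletionTimestamp set while the namespace was terminating) was only seen when a separate refreshPod flag happened to be set. The change below instead tracks an activePod that is re-fetched after every failed attempt. A distilled sketch of the fixed control flow, not the verbatim kubectl code (evict and getPod stand in for d.EvictPod and getPodFn):

	import (
		"time"

		corev1 "k8s.io/api/core/v1"
		apierrors "k8s.io/apimachinery/pkg/api/errors"
	)

	// evictWithRetry keeps retrying retriable eviction errors, refreshing the
	// pod after each failure so later checks see its current state.
	func evictWithRetry(
		evict func(corev1.Pod) error,
		getPod func(namespace, name string) (*corev1.Pod, error),
		pod corev1.Pod,
		delay time.Duration,
	) error {
		activePod := pod
		for {
			err := evict(activePod)
			if err == nil {
				return nil
			}
			if !apierrors.IsTooManyRequests(err) {
				// non-retriable; TooManyRequests usually means a PDB blocked it
				return err
			}
			time.Sleep(delay)
			// Refresh after every failed attempt; on error, keep the pod we have.
			if fresh, err := getPod(activePod.Namespace, activePod.Name); err == nil {
				activePod = *fresh
			}
		}
	}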
go.mod
@@ -30,10 +30,10 @@ require (
 	golang.org/x/sys v0.31.0
 	gopkg.in/evanphx/json-patch.v4 v4.12.0
 	k8s.io/api v0.0.0-20250816062245-fa01e40890d0
-	k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84
+	k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8
 	k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67
 	k8s.io/client-go v0.0.0-20250828035311-d07f455e6554
-	k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c
+	k8s.io/component-base v0.0.0-20250828040528-7068757664be
 	k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99
 	k8s.io/klog/v2 v2.130.1
 	k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b
go.sum
@@ -200,14 +200,14 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 k8s.io/api v0.0.0-20250816062245-fa01e40890d0 h1:WddRlAJwdWiTmGknuGqNHLxJ7RaF3bqjd933VhVCUes=
 k8s.io/api v0.0.0-20250816062245-fa01e40890d0/go.mod h1:PyEssxRzobRLFX/lEYzx5NDkS4JYE20SOKUZjTH0nvI=
-k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84 h1:rMqDsUPA2nfIdTtHQT7BPacZ0SfH/oPG6zCKqy38wfQ=
-k8s.io/apimachinery v0.0.0-20250827234502-7a24dae0db84/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
+k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8 h1:10c4JlI2tLfqaloDfee5QCraT3rUTchWawsjf2fOQvw=
+k8s.io/apimachinery v0.0.0-20250828034517-5992c1df72b8/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
 k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67 h1:p6C/SYa7jZ4HpdNRIU/xJbY4lkMwl95iS6WkYmL3jE8=
 k8s.io/cli-runtime v0.0.0-20250816070916-f536649dab67/go.mod h1:HIyOYCeQzm69fJySABxiMZ3Jw+tMJfIKesAiuPd+3No=
 k8s.io/client-go v0.0.0-20250828035311-d07f455e6554 h1:nO7QX5+X24G4/feoMOeY98KqpHIxcntPLuhDeoacti8=
 k8s.io/client-go v0.0.0-20250828035311-d07f455e6554/go.mod h1:xDExkHfWa76StdAOQuvSdZCD42iJau7+a60Bhd/QGL8=
-k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c h1:FNLYAa5FW9WUcwMWWyi1eE9zU73X3buvsrAiAsJ7XMI=
-k8s.io/component-base v0.0.0-20250828000729-ee2825ba449c/go.mod h1:q5RKMm+VwuOM+tYS+fe3xrb+vsowL6MIE5peUlpLQPc=
+k8s.io/component-base v0.0.0-20250828040528-7068757664be h1:XsM2H6GUNYpLJKXZCfqsY4SFp4u9k/zsZv/xGXsn0mE=
+k8s.io/component-base v0.0.0-20250828040528-7068757664be/go.mod h1:wUK0PCYJm7+fH9q/BZoPjS6aEzhPqAB8C/8eGGKbVIc=
 k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99 h1:RntyireBPwOrXJ89wKt6kNrB/yVT4eCWuv/il/czsoM=
 k8s.io/component-helpers v0.0.0-20250816064315-d154920f3a99/go.mod h1:2PiD4/9sXcLwnpwFutLSOdjunPFkRZZD8D6+rWb8amI=
 k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
@@ -19,6 +19,7 @@ package drain
 import (
 	"errors"
 	"fmt"
+	"time"
 
 	"github.com/spf13/cobra"
 
@@ -151,10 +152,11 @@ func NewDrainCmdOptions(f cmdutil.Factory, ioStreams genericiooptions.IOStreams)
 		PrintFlags: genericclioptions.NewPrintFlags("drained").WithTypeSetter(scheme.Scheme),
 		IOStreams:  ioStreams,
 		drainer: &drain.Helper{
 			GracePeriodSeconds:   -1,
+			EvictErrorRetryDelay: 5 * time.Second,
 			Out:                  ioStreams.Out,
 			ErrOut:               ioStreams.ErrOut,
 			ChunkSize:            cmdutil.DefaultChunkSize,
 		},
 	}
 	o.drainer.OnPodDeletionOrEvictionFinished = o.onPodDeletionOrEvictionFinished
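Note that this CLI wiring preserves kubectl drain's previous behavior: the retry delay that used to be hard-coded to 5s inside the helper is now injected here as EvictErrorRetryDelay: 5 * time.Second.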
@@ -74,6 +74,9 @@ type Helper struct {
 	// won't drain otherwise
 	SkipWaitForDeleteTimeoutSeconds int
 
+	// EvictErrorRetryDelay is used to control the retry delay after a pod eviction error
+	EvictErrorRetryDelay time.Duration
+
 	// AdditionalFilters are applied sequentially after base drain filters to
 	// exclude pods using custom logic. Any filter that returns PodDeleteStatus
 	// with Delete == false will immediately stop execution of further filters.
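For library consumers of the drain helper, the new field is just another knob on drain.Helper. A minimal configuration sketch, assuming the package at k8s.io/kubectl/pkg/drain and an already-constructed clientset (the values here are illustrative, not defaults):

	import (
		"os"
		"time"

		"k8s.io/client-go/kubernetes"
		"k8s.io/kubectl/pkg/drain"
	)

	func newDrainHelper(client kubernetes.Interface) *drain.Helper {
		return &drain.Helper{
			Client:               client,
			GracePeriodSeconds:   -1,
			Timeout:              2 * time.Minute,
			EvictErrorRetryDelay: 5 * time.Second, // matches the old hard-coded delay
			Out:                  os.Stdout,
			ErrOut:               os.Stderr,
		}
	}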
@@ -278,37 +281,27 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupVersion
 	defer cancel()
 	for _, pod := range pods {
 		go func(pod corev1.Pod, returnCh chan error) {
-			refreshPod := false
+			activePod := pod
 			for {
 				switch d.DryRunStrategy {
 				case cmdutil.DryRunServer:
-					fmt.Fprintf(d.Out, "evicting pod %s/%s (server dry run)\n", pod.Namespace, pod.Name)
+					//nolint:errcheck
+					fmt.Fprintf(d.Out, "evicting pod %s/%s (server dry run)\n", activePod.Namespace, activePod.Name)
 				default:
 					if d.OnPodDeletionOrEvictionStarted != nil {
-						d.OnPodDeletionOrEvictionStarted(&pod, true)
+						d.OnPodDeletionOrEvictionStarted(&activePod, true)
 					}
-					fmt.Fprintf(d.Out, "evicting pod %s/%s\n", pod.Namespace, pod.Name)
+					//nolint:errcheck
+					fmt.Fprintf(d.Out, "evicting pod %s/%s\n", activePod.Namespace, activePod.Name)
 				}
 				select {
 				case <-ctx.Done():
 					// return here or we'll leak a goroutine.
-					returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", pod.Name, pod.Namespace, globalTimeout)
+					returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", activePod.Name, activePod.Namespace, globalTimeout)
 					return
 				default:
 				}
 
-				// Create a temporary pod so we don't mutate the pod in the loop.
-				activePod := pod
-				if refreshPod {
-					freshPod, err := getPodFn(pod.Namespace, pod.Name)
-					// We ignore errors and let eviction sort it out with
-					// the original pod.
-					if err == nil {
-						activePod = *freshPod
-					}
-					refreshPod = false
-				}
-
 				err := d.EvictPod(activePod, evictionGroupVersion)
 				if err == nil {
 					break
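For context, d.EvictPod posts to the pod's eviction subresource. A minimal sketch of that call, assuming the policy/v1 eviction API (the helper also negotiates older eviction group versions):

	import (
		"context"

		policyv1 "k8s.io/api/policy/v1"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
		"k8s.io/client-go/kubernetes"
	)

	func evictOnce(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
		// A 429 (TooManyRequests) from this call typically means a
		// PodDisruptionBudget currently forbids the eviction; the loop above
		// retries it after EvictErrorRetryDelay.
		return client.PolicyV1().Evictions(namespace).Evict(ctx, &policyv1.Eviction{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		})
	}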
@@ -316,8 +309,9 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupVersion
 					returnCh <- nil
 					return
 				} else if apierrors.IsTooManyRequests(err) {
-					fmt.Fprintf(d.ErrOut, "error when evicting pods/%q -n %q (will retry after 5s): %v\n", activePod.Name, activePod.Namespace, err)
-					time.Sleep(5 * time.Second)
+					//nolint:errcheck
+					fmt.Fprintf(d.ErrOut, "error when evicting pods/%q -n %q (will retry after %v): %v\n", activePod.Name, activePod.Namespace, d.EvictErrorRetryDelay, err)
+					time.Sleep(d.EvictErrorRetryDelay)
 				} else if !activePod.ObjectMeta.DeletionTimestamp.IsZero() && apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
 					// an eviction request in a deleting namespace will throw a forbidden error,
 					// if the pod is already marked deleted, we can ignore this error, an eviction
@@ -326,12 +320,19 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupVersion
 				} else if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
 					// an eviction request in a deleting namespace will throw a forbidden error,
 					// if the pod is not marked deleted, we retry until it is.
-					fmt.Fprintf(d.ErrOut, "error when evicting pod %q from terminating namespace %q (will retry after 5s): %v\n", activePod.Name, activePod.Namespace, err)
-					time.Sleep(5 * time.Second)
+					//nolint:errcheck
+					fmt.Fprintf(d.ErrOut, "error when evicting pod %q from terminating namespace %q (will retry after %v): %v\n", activePod.Name, activePod.Namespace, d.EvictErrorRetryDelay, err)
+					time.Sleep(d.EvictErrorRetryDelay)
 				} else {
 					returnCh <- fmt.Errorf("error when evicting pods/%q -n %q: %v", activePod.Name, activePod.Namespace, err)
 					return
 				}
+
+				freshPod, err := getPodFn(activePod.Namespace, activePod.Name)
+				// we ignore errors and let eviction sort it out with the original pod.
+				if err == nil {
+					activePod = *freshPod
+				}
 			}
 			if d.DryRunStrategy == cmdutil.DryRunServer {
 				returnCh <- nil
@@ -339,7 +340,7 @@ func (d *Helper) evictPods(pods []corev1.Pod, evictionGroupVersion schema.GroupVersion
 			}
 			params := waitForDeleteParams{
 				ctx:           ctx,
-				pods:          []corev1.Pod{pod},
+				pods:          []corev1.Pod{activePod},
 				interval:      1 * time.Second,
 				timeout:       time.Duration(math.MaxInt64),
 				usingEviction: true,
@@ -529,3 +529,110 @@ func TestFilterPods(t *testing.T) {
 		})
 	}
 }
+
+func TestEvictDuringNamespaceTerminating(t *testing.T) {
+	testPodUID := types.UID("test-uid")
+	testPodName := "test-pod"
+	testNamespace := "default"
+
+	retryDelay := 5 * time.Millisecond
+	globalTimeout := 2 * retryDelay
+
+	tests := []struct {
+		description string
+		refresh     bool
+		err         error
+	}{
+		{
+			description: "Pod refreshed after NamespaceTerminating error",
+			refresh:     true,
+			err:         nil,
+		},
+		{
+			description: "Pod not refreshed after NamespaceTerminating error",
+			refresh:     false,
+			err:         fmt.Errorf("error when evicting pods/%q -n %q: global timeout reached: %v", testPodName, testNamespace, globalTimeout),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.description, func(t *testing.T) {
+			var retry bool
+
+			initialPod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      testPodName,
+					Namespace: testNamespace,
+					UID:       testPodUID,
+				},
+			}
+
+			// pod with DeletionTimestamp, indicating deletion in progress
+			deletedPod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              testPodName,
+					Namespace:         testNamespace,
+					UID:               testPodUID,
+					DeletionTimestamp: &metav1.Time{Time: time.Now()},
+				},
+			}
+
+			evictPods := []corev1.Pod{*initialPod}
+
+			k := fake.NewClientset(initialPod)
+			addEvictionSupport(t, k, "v1")
+
+			// mock eviction to return NamespaceTerminating error
+			k.PrependReactor("create", "pods", func(action ktest.Action) (bool, runtime.Object, error) {
+				if action.GetSubresource() != "eviction" {
+					return false, nil, nil
+				}
+
+				err := apierrors.NewForbidden(
+					schema.GroupResource{Resource: "pods"},
+					testPodName,
+					errors.New("namespace is terminating"),
+				)
+				err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{
+					Type: corev1.NamespaceTerminatingCause,
+				})
+
+				return true, nil, err
+			})
+
+			k.PrependReactor("get", "pods", func(action ktest.Action) (bool, runtime.Object, error) {
+				if !test.refresh {
+					// for non-refresh test, always return the initial pod
+					return true, initialPod, nil
+				}
+
+				if retry {
+					// second call, pod is deleted
+					return true, nil, apierrors.NewNotFound(schema.GroupResource{Resource: "pods"}, testPodName)
+				}
+
+				// first call, pod is being deleted
+				retry = true
+
+				return true, deletedPod, nil
+			})
+
+			h := &Helper{
+				Client:               k,
+				DisableEviction:      false,
+				Out:                  os.Stdout,
+				ErrOut:               os.Stderr,
+				Timeout:              globalTimeout,
+				EvictErrorRetryDelay: retryDelay,
+			}
+
+			err := h.DeleteOrEvictPods(evictPods)
+			if test.err == nil && err != nil {
+				t.Errorf("expected no error, got: %v", err)
+			} else if test.err != nil && (err == nil || err.Error() != test.err.Error()) {
+				t.Errorf("%s: unexpected eviction; actual %v; expected %v", test.description, err, test.err)
+			}
+		})
+	}
+}
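A note on the test's design: shrinking the time constants keeps it fast and deterministic. With retryDelay at 5ms and the helper's Timeout set to 2*retryDelay, the refresh case succeeds within a single retry (the refetched pod carries a DeletionTimestamp, so the NamespaceTerminating error is ignored), while the non-refresh case hits the global timeout almost immediately, exercising both exits of the retry loop.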