Merge pull request #848 from fluxcd/remediation-stall

Early stall condition detection after remediation
This commit is contained in:
Hidde Beydals 2023-12-14 10:28:28 +01:00 committed by GitHub
commit 603d3219da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 1 deletions

View File

@ -262,10 +262,18 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error {
"instructed to stop after running %s action reconciler %s", next.Type(), next.Name()),
)
if remediation := req.Object.GetActiveRemediation(); remediation == nil || !remediation.RetriesExhausted(req.Object) {
remediation := req.Object.GetActiveRemediation()
if remediation == nil || !remediation.RetriesExhausted(req.Object) {
conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, conditions.GetMessage(req.Object, meta.ReadyCondition))
return ErrMustRequeue
}
// Check if retries have been exhausted after remediation, so that the
// stalled condition is detected early.
if remediation != nil && remediation.RetriesExhausted(req.Object) {
conditions.MarkStalled(req.Object, "RetriesExceeded", "Failed to %s after %d attempt(s)",
req.Object.Status.LastAttemptedReleaseAction, req.Object.GetActiveRemediation().GetFailureCount(req.Object))
return ErrExceededMaxRetries
}
conditions.Delete(req.Object, meta.ReconcilingCondition)
return nil

View File

@ -629,6 +629,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
}
},
wantErr: ErrExceededMaxRetries,
},
{
name: "install test failure with remediation",
@ -654,6 +655,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
snap,
}
},
wantErr: ErrExceededMaxRetries,
},
{
name: "install test failure with test ignore",
@ -759,6 +761,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
}
},
wantErr: ErrExceededMaxRetries,
},
{
name: "upgrade failure with uninstall remediation",
@ -799,6 +802,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
}
},
wantErr: ErrExceededMaxRetries,
},
{
name: "upgrade test failure with remediation",
@ -841,6 +845,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
}
},
wantErr: ErrExceededMaxRetries,
},
{
name: "upgrade test failure with test ignore",
@ -928,6 +933,55 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) {
},
wantErr: ErrExceededMaxRetries,
},
{
name: "upgrade remediation results in exhausted retries",
releases: func(namespace string) []*helmrelease.Release {
return []*helmrelease.Release{
testutil.BuildRelease(&helmrelease.MockReleaseOptions{
Name: mockReleaseName,
Namespace: namespace,
Version: 1,
Chart: testutil.BuildChart(),
Status: helmrelease.StatusSuperseded,
}),
testutil.BuildRelease(&helmrelease.MockReleaseOptions{
Name: mockReleaseName,
Namespace: namespace,
Version: 2,
Chart: testutil.BuildChart(),
Status: helmrelease.StatusSuperseded,
}),
testutil.BuildRelease(&helmrelease.MockReleaseOptions{
Name: mockReleaseName,
Namespace: namespace,
Version: 3,
Chart: testutil.BuildChart(),
Status: helmrelease.StatusFailed,
}),
}
},
spec: func(spec *v2.HelmReleaseSpec) {
spec.Upgrade = &v2.Upgrade{
Remediation: &v2.UpgradeRemediation{
Retries: 1,
},
}
},
status: func(namespace string, releases []*helmrelease.Release) v2.HelmReleaseStatus {
return v2.HelmReleaseStatus{
History: v2.Snapshots{
release.ObservedToSnapshot(release.ObserveRelease(releases[2])),
release.ObservedToSnapshot(release.ObserveRelease(releases[1])),
release.ObservedToSnapshot(release.ObserveRelease(releases[0])),
},
LastAttemptedReleaseAction: v2.ReleaseActionUpgrade,
Failures: 2,
UpgradeFailures: 2,
}
},
chart: testutil.BuildChart(),
wantErr: ErrExceededMaxRetries,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {