Implement conditional remediation

This commit adds support for conditional remediation, enabling the user to:

* configure if test failures should be ignored
* configure what action should be taken when a Helm install or upgrade action fails (e.g. rollback, uninstall)
* configure if a failed Helm action should be retried
* configure if a failed release should be kept for debugging purposes

The previous behaviour, where failed Helm tests did not mark the `HelmRelease` as not `Ready`, has changed: failed tests now mark it as not `Ready` by default.

Co-authored-by: Hidde Beydals <hello@hidde.co>
2020-08-25 17:26:53 +02:00 · 2020-08-25 17:26:53 +02:00 · b8853ad7a5
parent 69a6f3c463
commit b8853ad7a5
7 changed files with 784 additions and 187 deletions
--- a/api/v2alpha1/condition_types.go
+++ b/api/v2alpha1/condition_types.go
@ -110,6 +110,9 @@ const (
 	// InitFailedReason represents the fact that the initialization of the Helm configuration failed.
 	InitFailedReason string = "InitFailed"

+	// GetLastReleaseFailedReason represents the fact that observing the last release failed.
+	GetLastReleaseFailedReason string = "GetLastReleaseFailed"
+
 	// ProgressingReason represents the fact that the reconciliation for the resource is underway.
 	ProgressingReason string = "Progressing"

--- a/api/v2alpha1/helmrelease_types.go
+++ b/api/v2alpha1/helmrelease_types.go
@ -176,6 +176,25 @@ func (in HelmChartTemplate) GetNamespace(defaultNamespace string) string {
 	return in.SourceRef.Namespace
 }

+// DeploymentAction defines a consistent interface for Install and Upgrade.
+// +kubebuilder:object:generate=false
+type DeploymentAction interface {
+	GetDescription() string
+	GetRemediation() Remediation
+}
+
+// Remediation defines a consistent interface for InstallRemediation and UpgradeRemediation.
+// +kubebuilder:object:generate=false
+type Remediation interface {
+	GetRetries() int
+	MustIgnoreTestFailures(bool) bool
+	MustRemediateLastFailure() bool
+	GetStrategy() RemediationStrategy
+	GetFailureCount(hr HelmRelease) int64
+	IncrementFailureCount(hr *HelmRelease)
+	RetriesExhausted(hr HelmRelease) bool
+}
+
 // Install holds the configuration for Helm install actions performed for this HelmRelease.
 type Install struct {
 	// Timeout is the time to wait for any individual Kubernetes operation (like Jobs
@ -184,6 +203,12 @@ type Install struct {
 	// +optional
 	Timeout *metav1.Duration `json:"timeout,omitempty"`

+	// Remediation holds the remediation configuration for when the
+	// Helm install action for the HelmRelease fails. The default
+	// is to not perform any action.
+	// +optional
+	Remediation *InstallRemediation `json:"remediation,omitempty"`
+
 	// DisableWait disables the waiting for resources to be ready after a
 	// Helm install has been performed.
 	// +optional
@ -218,6 +243,80 @@ func (in Install) GetTimeout(defaultTimeout metav1.Duration) metav1.Duration {
 	return *in.Timeout
 }

+// GetDescription returns a description for the Helm install action.
+func (in Install) GetDescription() string {
+	return "install"
+}
+
+// GetRemediation returns the configured Remediation for the Helm install action.
+func (in Install) GetRemediation() Remediation {
+	if in.Remediation == nil {
+		return InstallRemediation{}
+	}
+	return *in.Remediation
+}
+
+// InstallRemediation holds the configuration for Helm install remediation.
+type InstallRemediation struct {
+	// Retries is the number of retries that should be attempted on failures before
+	// bailing. Remediation, using an uninstall, is performed between each attempt.
+	// Defaults to '0'; a negative integer means unlimited retries.
+	// +optional
+	Retries int `json:"retries,omitempty"`
+
+	// IgnoreTestFailures tells the controller to skip remediation when
+	// the Helm tests are run after an install action but fail.
+	// Defaults to 'Test.IgnoreFailures'.
+	// +optional
+	IgnoreTestFailures *bool `json:"ignoreTestFailures,omitempty"`
+
+	// RemediateLastFailure tells the controller to remediate the last
+	// failure, when no retries remain. Defaults to 'false'.
+	// +optional
+	RemediateLastFailure *bool `json:"remediateLastFailure,omitempty"`
+}
+
+// GetRetries returns the number of retries that should be attempted on failures.
+func (in InstallRemediation) GetRetries() int {
+	return in.Retries
+}
+
+// MustIgnoreTestFailures returns the configured IgnoreTestFailures or the given default.
+func (in InstallRemediation) MustIgnoreTestFailures(def bool) bool {
+	if in.IgnoreTestFailures == nil {
+		return def
+	}
+	return *in.IgnoreTestFailures
+}
+
+// MustRemediateLastFailure returns whether to remediate the last failure when no retries remain.
+func (in InstallRemediation) MustRemediateLastFailure() bool {
+	if in.RemediateLastFailure == nil {
+		return false
+	}
+	return *in.RemediateLastFailure
+}
+
+// GetStrategy returns the strategy to use for failure remediation.
+func (in InstallRemediation) GetStrategy() RemediationStrategy {
+	return UninstallRemediationStrategy
+}
+
+// GetFailureCount gets the failure count.
+func (in InstallRemediation) GetFailureCount(hr HelmRelease) int64 {
+	return hr.Status.InstallFailures
+}
+
+// IncrementFailureCount increments the failure count.
+func (in InstallRemediation) IncrementFailureCount(hr *HelmRelease) {
+	hr.Status.InstallFailures++
+}
+
+// RetriesExhausted returns true if there are no remaining retries.
+func (in InstallRemediation) RetriesExhausted(hr HelmRelease) bool {
+	return in.Retries >= 0 && in.GetFailureCount(hr) > int64(in.Retries)
+}
+
 // Upgrade holds the configuration for Helm upgrade actions for this HelmRelease.
 type Upgrade struct {
 	// Timeout is the time to wait for any individual Kubernetes operation (like Jobs
@ -226,10 +325,11 @@ type Upgrade struct {
 	// +optional
 	Timeout *metav1.Duration `json:"timeout,omitempty"`

-	// MaxRetries is the number of retries that should be attempted on failures before
-	// bailing. Defaults to '0', a negative integer equals to unlimited retries.
+	// Remediation holds the remediation configuration for when the
+	// Helm upgrade action for the HelmRelease fails. The default
+	// is to not perform any action.
 	// +optional
-	MaxRetries int `json:"maxRetries,omitempty"`
+	Remediation *UpgradeRemediation `json:"remediation,omitempty"`

 	// DisableWait disables the waiting for resources to be ready after a
 	// Helm upgrade has been performed.
@ -270,6 +370,100 @@ func (in Upgrade) GetTimeout(defaultTimeout metav1.Duration) metav1.Duration {
 	return *in.Timeout
 }

+// GetDescription returns a description for the Helm upgrade action.
+func (in Upgrade) GetDescription() string {
+	return "upgrade"
+}
+
+// GetRemediation returns the configured Remediation for the Helm upgrade action.
+func (in Upgrade) GetRemediation() Remediation {
+	if in.Remediation == nil {
+		return UpgradeRemediation{}
+	}
+	return *in.Remediation
+}
+
+// UpgradeRemediation holds the configuration for Helm upgrade remediation.
+type UpgradeRemediation struct {
+	// Retries is the number of retries that should be attempted on failures before
+	// bailing. Remediation, using 'Strategy', is performed between each attempt.
+	// Defaults to '0'; a negative integer means unlimited retries.
+	// +optional
+	Retries int `json:"retries,omitempty"`
+
+	// IgnoreTestFailures tells the controller to skip remediation when
+	// the Helm tests are run after an upgrade action but fail.
+	// Defaults to 'Test.IgnoreFailures'.
+	// +optional
+	IgnoreTestFailures *bool `json:"ignoreTestFailures,omitempty"`
+
+	// RemediateLastFailure tells the controller to remediate the last
+	// failure, when no retries remain. Defaults to 'false' unless 'Retries'
+	// is greater than 0.
+	// +optional
+	RemediateLastFailure *bool `json:"remediateLastFailure,omitempty"`
+
+	// Strategy to use for failure remediation.
+	// Defaults to 'rollback'.
+	// +kubebuilder:validation:Enum=rollback;uninstall
+	// +optional
+	Strategy *RemediationStrategy `json:"strategy,omitempty"`
+}
+
+// GetRetries returns the number of retries that should be attempted on failures.
+func (in UpgradeRemediation) GetRetries() int {
+	return in.Retries
+}
+
+// MustIgnoreTestFailures returns the configured IgnoreTestFailures or the given default.
+func (in UpgradeRemediation) MustIgnoreTestFailures(def bool) bool {
+	if in.IgnoreTestFailures == nil {
+		return def
+	}
+	return *in.IgnoreTestFailures
+}
+
+// MustRemediateLastFailure returns whether to remediate the last failure when no retries remain.
+func (in UpgradeRemediation) MustRemediateLastFailure() bool {
+	if in.RemediateLastFailure == nil {
+		return in.Retries > 0
+	}
+	return *in.RemediateLastFailure
+}
+
+// GetStrategy returns the strategy to use for failure remediation.
+func (in UpgradeRemediation) GetStrategy() RemediationStrategy {
+	if in.Strategy == nil {
+		return RollbackRemediationStrategy
+	}
+	return *in.Strategy
+}
+
+// GetFailureCount gets the failure count.
+func (in UpgradeRemediation) GetFailureCount(hr HelmRelease) int64 {
+	return hr.Status.UpgradeFailures
+}
+
+// IncrementFailureCount increments the failure count.
+func (in UpgradeRemediation) IncrementFailureCount(hr *HelmRelease) {
+	hr.Status.UpgradeFailures++
+}
+
+// RetriesExhausted returns true if there are no remaining retries.
+func (in UpgradeRemediation) RetriesExhausted(hr HelmRelease) bool {
+	return in.Retries >= 0 && in.GetFailureCount(hr) > int64(in.Retries)
+}
+
+// RemediationStrategy defines the strategy used to remediate a failed install or upgrade.
+type RemediationStrategy string
+
+const (
+	// RollbackRemediationStrategy represents a Helm remediation strategy of Helm rollback.
+	RollbackRemediationStrategy RemediationStrategy = "rollback"
+	// UninstallRemediationStrategy represents a Helm remediation strategy of Helm uninstall.
+	UninstallRemediationStrategy RemediationStrategy = "uninstall"
+)
+
 // Test holds the configuration for Helm test actions for this HelmRelease.
 type Test struct {
 	// Enable enables Helm test actions for this HelmRelease after an
@ -282,6 +476,13 @@ type Test struct {
 	// 'HelmReleaseSpec.Timeout'.
 	// +optional
 	Timeout *metav1.Duration `json:"timeout,omitempty"`
+
+	// IgnoreFailures tells the controller to skip remediation when
+	// the Helm tests are run but fail.
+	// Can be overridden for tests run after install or upgrade actions
+	// in 'Install.Remediation.IgnoreTestFailures' and 'Upgrade.Remediation.IgnoreTestFailures'.
+	// +optional
+	IgnoreFailures bool `json:"ignoreFailures,omitempty"`
 }

 // GetTimeout returns the configured timeout for the Helm test action,
@ -295,11 +496,6 @@ func (in Test) GetTimeout(defaultTimeout metav1.Duration) metav1.Duration {

 // Rollback holds the configuration for Helm rollback actions for this HelmRelease.
 type Rollback struct {
-	// Enable enables Helm rollback actions for this HelmRelease after an
-	// Helm install or upgrade action failure.
-	// +optional
-	Enable bool `json:"enable,omitempty"`
-
 	// Timeout is the time to wait for any individual Kubernetes operation (like Jobs
 	// for hooks) during the performance of a Helm rollback action. Defaults to
 	// 'HelmReleaseSpec.Timeout'.
@ -396,10 +592,17 @@ type HelmReleaseStatus struct {
 	// +optional
 	HelmChart string `json:"helmChart,omitempty"`

-	// Failures is the reconciliation failure count. It is reset after a successful
-	// reconciliation.
+	// Failures is the reconciliation failure count.
 	// +optional
 	Failures int64 `json:"failures,omitempty"`
+
+	// InstallFailures is the install failure count.
+	// +optional
+	InstallFailures int64 `json:"installFailures,omitempty"`
+
+	// UpgradeFailures is the upgrade failure count.
+	// +optional
+	UpgradeFailures int64 `json:"upgradeFailures,omitempty"`
 }

 // GetHelmChart returns the namespace and name of the HelmChart.
@ -411,18 +614,14 @@ func (in HelmReleaseStatus) GetHelmChart() (string, string) {
 	return split[0], split[1]
 }

-// HelmReleaseProgressing resets the conditions of the given HelmRelease to a single
-// ReadyCondition with status ConditionUnknown.
+// HelmReleaseProgressing resets any failures and registers progress toward reconciling the given HelmRelease
+// by setting the ReadyCondition to ConditionUnknown for ProgressingReason.
 func HelmReleaseProgressing(hr HelmRelease) HelmRelease {
-	hr.Status.Conditions = []Condition{
-		{
-			Type:               ReadyCondition,
-			Status:             corev1.ConditionUnknown,
-			LastTransitionTime: metav1.Now(),
-			Reason:             ProgressingReason,
-			Message:            "reconciliation in progress",
-		},
-	}
+	hr.Status.Failures = 0
+	hr.Status.InstallFailures = 0
+	hr.Status.UpgradeFailures = 0
+	hr.Status.Conditions = []Condition{}
+	SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionUnknown, ProgressingReason, "reconciliation in progress")
 	return hr
 }

@ -439,88 +638,31 @@ func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.Co
 	})
 }

-// SetHelmReleaseReadiness sets the ReadyCondition, ObservedGeneration, LastAttemptedRevision,
-// and LastReleaseRevision, on the HelmRelease.
-func SetHelmReleaseReadiness(hr *HelmRelease, status corev1.ConditionStatus, reason, message string, revision string, releaseRevision int, valuesChecksum string) {
-	SetHelmReleaseCondition(hr, ReadyCondition, status, reason, message)
-	hr.Status.ObservedGeneration = hr.Generation
-	hr.Status.LastAttemptedRevision = revision
-	hr.Status.LastReleaseRevision = releaseRevision
-	hr.Status.LastAttemptedValuesChecksum = valuesChecksum
-}
-
 // HelmReleaseNotReady registers a failed release attempt of the given HelmRelease.
-func HelmReleaseNotReady(hr HelmRelease, revision string, releaseRevision int, valuesChecksum, reason, message string) HelmRelease {
-	SetHelmReleaseReadiness(&hr, corev1.ConditionFalse, reason, message, revision, releaseRevision, valuesChecksum)
-	hr.Status.Failures = hr.Status.Failures + 1
+func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
+	SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionFalse, reason, message)
+	hr.Status.Failures++
 	return hr
 }

 // HelmReleaseReady registers a successful release attempt of the given HelmRelease.
-func HelmReleaseReady(hr HelmRelease, revision string, releaseRevision int, valuesChecksum, reason, message string) HelmRelease {
-	SetHelmReleaseReadiness(&hr, corev1.ConditionTrue, reason, message, revision, releaseRevision, valuesChecksum)
-	hr.Status.LastAppliedRevision = revision
-	hr.Status.Failures = 0
+func HelmReleaseReady(hr HelmRelease, reason, message string) HelmRelease {
+	SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, reason, message)
+	hr.Status.LastAppliedRevision = hr.Status.LastAttemptedRevision
 	return hr
 }

-// ShouldUpgrade determines if an Helm upgrade action needs to be performed for the given HelmRelease.
-func ShouldUpgrade(hr HelmRelease, revision string, releaseRevision int, valuesChecksum string) bool {
-	switch {
-	case hr.Status.LastAttemptedRevision != revision:
-		return true
-	case hr.Status.LastReleaseRevision != releaseRevision:
-		return true
-	case hr.Generation != hr.Status.ObservedGeneration:
-		return true
-	case hr.Status.LastAttemptedValuesChecksum != valuesChecksum:
-		return true
-	case hr.Status.Failures > 0 &&
-		(hr.Spec.GetUpgrade().MaxRetries < 0 || hr.Status.Failures < int64(hr.Spec.GetUpgrade().MaxRetries)):
-		return true
-	default:
-		return false
-	}
-}
+// HelmReleaseAttempted registers an attempt of the given HelmRelease with the given state,
+// and returns the modified HelmRelease and a boolean indicating a state change.
+func HelmReleaseAttempted(hr HelmRelease, revision string, releaseRevision int, valuesChecksum string) (HelmRelease, bool) {
+	changed := hr.Status.LastAttemptedRevision != revision ||
+		hr.Status.LastReleaseRevision != releaseRevision ||
+		hr.Status.LastAttemptedValuesChecksum != valuesChecksum
+	hr.Status.LastAttemptedRevision = revision
+	hr.Status.LastReleaseRevision = releaseRevision
+	hr.Status.LastAttemptedValuesChecksum = valuesChecksum

-// ShouldTest determines if a Helm test actions needs to be performed for the given HelmRelease.
-func ShouldTest(hr HelmRelease) bool {
-	if hr.Spec.Test.Enable {
-		for _, c := range hr.Status.Conditions {
-			if c.Status == corev1.ConditionTrue && (c.Type == InstalledCondition || c.Type == UpgradedCondition) {
-				return true
-			}
-		}
-	}
-	return false
-}
-
-// ShouldRollback determines if a Helm rollback action needs to be performed for the given HelmRelease.
-func ShouldRollback(hr HelmRelease, releaseRevision int) bool {
-	if hr.Spec.GetRollback().Enable {
-		if hr.Status.LastReleaseRevision <= releaseRevision {
-			return false
-		}
-		for _, c := range hr.Status.Conditions {
-			if c.Type == UpgradedCondition && c.Status == corev1.ConditionFalse {
-				return true
-			}
-		}
-	}
-	return false
-}
-
-// ShouldUninstall determines if a Helm uninstall action needs to be performed for the given HelmRelease.
-func ShouldUninstall(hr HelmRelease, releaseRevision int) bool {
-	if releaseRevision <= 0 {
-		return false
-	}
-	for _, c := range hr.Status.Conditions {
-		if c.Type == InstalledCondition && c.Status == corev1.ConditionFalse {
-			return true
-		}
-	}
-	return false
+	return hr, changed
 }

 const (
--- a/api/v2alpha1/zz_generated.deepcopy.go
+++ b/api/v2alpha1/zz_generated.deepcopy.go
@ -234,6 +234,11 @@ func (in *Install) DeepCopyInto(out *Install) {
 		*out = new(v1.Duration)
 		**out = **in
 	}
+	if in.Remediation != nil {
+		in, out := &in.Remediation, &out.Remediation
+		*out = new(InstallRemediation)
+		(*in).DeepCopyInto(*out)
+	}
 }

 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Install.
@ -246,6 +251,31 @@ func (in *Install) DeepCopy() *Install {
 	return out
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *InstallRemediation) DeepCopyInto(out *InstallRemediation) {
+	*out = *in
+	if in.IgnoreTestFailures != nil {
+		in, out := &in.IgnoreTestFailures, &out.IgnoreTestFailures
+		*out = new(bool)
+		**out = **in
+	}
+	if in.RemediateLastFailure != nil {
+		in, out := &in.RemediateLastFailure, &out.RemediateLastFailure
+		*out = new(bool)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InstallRemediation.
+func (in *InstallRemediation) DeepCopy() *InstallRemediation {
+	if in == nil {
+		return nil
+	}
+	out := new(InstallRemediation)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Rollback) DeepCopyInto(out *Rollback) {
 	*out = *in
@ -314,6 +344,11 @@ func (in *Upgrade) DeepCopyInto(out *Upgrade) {
 		*out = new(v1.Duration)
 		**out = **in
 	}
+	if in.Remediation != nil {
+		in, out := &in.Remediation, &out.Remediation
+		*out = new(UpgradeRemediation)
+		(*in).DeepCopyInto(*out)
+	}
 }

 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Upgrade.
@ -326,6 +361,36 @@ func (in *Upgrade) DeepCopy() *Upgrade {
 	return out
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *UpgradeRemediation) DeepCopyInto(out *UpgradeRemediation) {
+	*out = *in
+	if in.IgnoreTestFailures != nil {
+		in, out := &in.IgnoreTestFailures, &out.IgnoreTestFailures
+		*out = new(bool)
+		**out = **in
+	}
+	if in.RemediateLastFailure != nil {
+		in, out := &in.RemediateLastFailure, &out.RemediateLastFailure
+		*out = new(bool)
+		**out = **in
+	}
+	if in.Strategy != nil {
+		in, out := &in.Strategy, &out.Strategy
+		*out = new(RemediationStrategy)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UpgradeRemediation.
+func (in *UpgradeRemediation) DeepCopy() *UpgradeRemediation {
+	if in == nil {
+		return nil
+	}
+	out := new(UpgradeRemediation)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ValuesReference) DeepCopyInto(out *ValuesReference) {
 	*out = *in
--- a/controllers/helmrelease_controller.go
+++ b/controllers/helmrelease_controller.go
@ -69,6 +69,16 @@ type HelmReleaseReconciler struct {
 	ExternalEventRecorder *recorder.EventRecorder
 }

+// ConditionError represents an error with a status condition reason attached.
+type ConditionError struct {
+	Reason string
+	Err    error
+}
+
+func (c ConditionError) Error() string {
+	return c.Err.Error()
+}
+
 // +kubebuilder:rbac:groups=helm.toolkit.fluxcd.io,resources=helmreleases,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=helm.toolkit.fluxcd.io,resources=helmreleases/status,verbs=get;update;patch

@ -117,7 +127,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)

 	if hr.Spec.Suspend {
 		msg := "HelmRelease is suspended, skipping reconciliation"
-		hr = v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.SuspendedReason, msg)
+		hr = v2.HelmReleaseNotReady(hr, v2.SuspendedReason, msg)
 		if err := r.Status().Update(ctx, &hr); err != nil {
 			log.Error(err, "unable to update status")
 			return ctrl.Result{Requeue: true}, err
@ -126,7 +136,13 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
 		return ctrl.Result{}, nil
 	}

-	hr = v2.HelmReleaseProgressing(hr)
+	// Observe the HelmRelease generation.
+	hasNewGeneration := hr.Status.ObservedGeneration != hr.Generation
+	if hasNewGeneration {
+		hr.Status.ObservedGeneration = hr.Generation
+		hr = v2.HelmReleaseProgressing(hr)
+	}
+
 	if err := r.Status().Update(ctx, &hr); err != nil {
 		log.Error(err, "unable to update status")
 		return ctrl.Result{Requeue: true}, err
@ -143,7 +159,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
 			msg = "HelmChart is not ready"
 			r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityInfo, msg)
 		}
-		hr = v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.ArtifactFailedReason, msg)
+		hr = v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg)
 		if err := r.Status().Update(ctx, &hr); err != nil {
 			log.Error(err, "unable to update status")
 			return ctrl.Result{Requeue: true}, err
@ -154,7 +170,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
 	// Check chart artifact readiness
 	if hc.GetArtifact() == nil {
 		msg := "HelmChart is not ready"
-		hr = v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.ArtifactFailedReason, msg)
+		hr = v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg)
 		r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityInfo, msg)
 		log.Info(msg)
 		if err := r.Status().Update(ctx, &hr); err != nil {
@ -171,7 +187,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
 			r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityInfo, msg)
 			log.Info(msg)

-			hr = v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.DependencyNotReadyReason, err.Error())
+			hr = v2.HelmReleaseNotReady(hr, v2.DependencyNotReadyReason, err.Error())
 			if err := r.Status().Update(ctx, &hr); err != nil {
 				log.Error(err, "unable to update status")
 				return ctrl.Result{Requeue: true}, err
@ -186,7 +202,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
 	// Compose values
 	values, err := r.composeValues(ctx, hr)
 	if err != nil {
-		hr = v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.InitFailedReason, err.Error())
+		hr = v2.HelmReleaseNotReady(hr, v2.InitFailedReason, err.Error())
 		r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityError, err.Error())
 		if err := r.Status().Update(ctx, &hr); err != nil {
 			log.Error(err, "unable to update status")
@ -195,7 +211,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
 		return ctrl.Result{}, nil
 	}

-	reconciledHr, reconcileErr := r.release(log, *hr.DeepCopy(), hc, values)
+	reconciledHr, reconcileErr := r.release(log, *hr.DeepCopy(), hc, values, hasNewGeneration)
 	if reconcileErr != nil {
 		r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityError, fmt.Sprintf("reconciliation failed: %s", reconcileErr.Error()))
 	}
@ -279,12 +295,12 @@ func (r *HelmReleaseReconciler) reconcileChart(ctx context.Context, hr *v2.HelmR
 	return &helmChart, true, nil
 }

-func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values) (v2.HelmRelease, error) {
+func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values, hasNewGeneration bool) (v2.HelmRelease, error) {
 	// Acquire lock
 	unlock, err := lock(fmt.Sprintf("%s-%s", hr.GetName(), hr.GetNamespace()))
 	if err != nil {
 		err = fmt.Errorf("lockfile error: %w", err)
-		return v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, sourcev1.StorageOperationFailedReason, err.Error()), err
+		return v2.HelmReleaseNotReady(hr, sourcev1.StorageOperationFailedReason, err.Error()), err
 	}
 	defer unlock()

@ -298,74 +314,119 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
 	// Download artifact
 	artifactPath, err := download(source.GetArtifact().URL, tmpDir)
 	if err != nil {
-		return v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.ArtifactFailedReason, "artifact acquisition failed"), err
+		return v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, "artifact acquisition failed"), err
 	}

 	// Load chart
 	loadedChart, err := loader.Load(artifactPath)
 	if err != nil {
-		return v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.ArtifactFailedReason, "failed to load chart"), err
+		return v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, "failed to load chart"), err
 	}

 	// Initialize config
 	cfg, err := newActionCfg(log, r.Config, hr)
 	if err != nil {
-		return v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.InitFailedReason, "failed to initialize Helm action configuration"), err
+		return v2.HelmReleaseNotReady(hr, v2.InitFailedReason, "failed to initialize Helm action configuration"), err
 	}

-	// Get the current release
-	rel, err := cfg.Releases.Deployed(hr.GetReleaseName())
-	if err != nil && !errors.Is(err, driver.ErrNoDeployedReleases) {
-		return v2.HelmReleaseNotReady(hr, hr.Status.LastAttemptedRevision, hr.Status.LastReleaseRevision, hr.Status.LastAttemptedValuesChecksum, v2.InitFailedReason, "failed to determine if release exists"), err
+	// Determine last release revision. Return the observation error itself: err is nil at this point.
+	rel, observeLastReleaseErr := observeLastRelease(cfg, hr)
+	if observeLastReleaseErr != nil {
+		return v2.HelmReleaseNotReady(hr, v2.GetLastReleaseFailedReason, "failed to get last release revision"), observeLastReleaseErr
+	}

+	// Register the current release attempt.
+	revision := source.GetArtifact().Revision
+	releaseRevision := getReleaseRevision(rel)
 	valuesChecksum := calculateValuesChecksum(values)
+	hr, hasNewState := v2.HelmReleaseAttempted(hr, revision, releaseRevision, valuesChecksum)
+	if hasNewState {
+		hr = v2.HelmReleaseProgressing(hr)
+	}

-	// Install or upgrade the release
-	success := true
-	if errors.Is(err, driver.ErrNoDeployedReleases) {
+	// Determine release deployment action.
+	var deployAction v2.DeploymentAction
+	switch {
+	// Install if there is none.
+	case rel == nil:
+		deployAction = hr.Spec.GetInstall()
+	// Upgrade if there is a new generation, new state, or this is an upgrade retry.
+	case hasNewGeneration || hasNewState || hr.Spec.GetUpgrade().GetRemediation().GetFailureCount(hr) > 0:
+		deployAction = hr.Spec.GetUpgrade()
+	// Otherwise no action needed.
+	default:
+		return hr, nil
+	}
+
+	// Check if retries exhausted.
+	remediation := deployAction.GetRemediation()
+	if remediation.RetriesExhausted(hr) {
+		return hr, fmt.Errorf("%s retries exhausted", deployAction.GetDescription())
+	}
+
+	// Deploy the release.
+	switch a := deployAction.(type) {
+	case v2.Install:
 		rel, err = install(cfg, loadedChart, hr, values)
-		r.handleHelmActionResult(hr, source, err, "install", v2.InstalledCondition, v2.InstallSucceededReason, v2.InstallFailedReason)
-		success = err == nil
-	} else if v2.ShouldUpgrade(hr, source.GetArtifact().Revision, rel.Version, valuesChecksum) {
+		err = r.handleHelmActionResult(&hr, revision, err, a.GetDescription(), v2.InstalledCondition, v2.InstallSucceededReason, v2.InstallFailedReason)
+	case v2.Upgrade:
 		rel, err = upgrade(cfg, loadedChart, hr, values)
-		r.handleHelmActionResult(hr, source, err, "upgrade", v2.UpgradedCondition, v2.UpgradeSucceededReason, v2.UpgradeFailedReason)
-		success = err == nil
+		err = r.handleHelmActionResult(&hr, revision, err, a.GetDescription(), v2.UpgradedCondition, v2.UpgradeSucceededReason, v2.UpgradeFailedReason)
 	}

-	// Run tests
-	if v2.ShouldTest(hr) {
-		rel, err = test(cfg, hr)
-		r.handleHelmActionResult(hr, source, err, "test", v2.TestedCondition, v2.TestSucceededReason, v2.TestFailedReason)
-	}
-
-	// Run rollback
-	if rel != nil && v2.ShouldRollback(hr, rel.Version) {
-		success = false
-		err = rollback(cfg, hr)
-		r.handleHelmActionResult(hr, source, err, "rollback", v2.RolledBackCondition, v2.RollbackSucceededReason, v2.RollbackFailedReason)
-	}
-
-	// Determine release number after action runs
-	var releaseRevision int
-	if curRel, err := cfg.Releases.Deployed(hr.GetReleaseName()); err == nil {
-		releaseRevision = curRel.Version
-	}
-
-	// Run uninstall
-	if v2.ShouldUninstall(hr, releaseRevision) {
-		success = false
-		err = uninstall(cfg, hr)
-		if err == nil {
-			releaseRevision = 0
+	// Run tests if enabled and there is a successful new release revision.
+	if getReleaseRevision(rel) > releaseRevision && err == nil && hr.Spec.GetTest().Enable {
+		_, testErr := test(cfg, hr)
+		testErr = r.handleHelmActionResult(&hr, revision, testErr, "test", v2.TestedCondition, v2.TestSucceededReason, v2.TestFailedReason)
+		// Propagate any test error if not marked ignored.
+		if testErr != nil && !remediation.MustIgnoreTestFailures(hr.Spec.GetTest().IgnoreFailures) {
+			err = testErr
 		}
-		r.handleHelmActionResult(hr, source, err, "uninstall", v2.UninstalledCondition, v2.UninstallSucceededReason, v2.UninstallFailedReason)
 	}

-	if !success {
-		return v2.HelmReleaseNotReady(hr, source.GetArtifact().Revision, releaseRevision, valuesChecksum, v2.ReconciliationFailedReason, "release reconciliation failed"), err
+	if err != nil {
+		// Increment failure count for deployment action.
+		remediation.IncrementFailureCount(&hr)
+		// Remediate deployment failure if necessary: either retries remain, or the
+		// last failure must be remediated as well.
+		if !remediation.RetriesExhausted(hr) || remediation.MustRemediateLastFailure() {
+			switch {
+			// Nothing to roll back or uninstall when the action did not create a new revision.
+			case getReleaseRevision(rel) <= releaseRevision:
+				log.Info("skipping remediation, no new release revision created")
+			case remediation.GetStrategy() == v2.RollbackRemediationStrategy:
+				rollbackErr := rollback(cfg, hr)
+				rollbackConditionErr := r.handleHelmActionResult(&hr, revision, rollbackErr, "rollback", v2.RolledBackCondition, v2.RollbackSucceededReason, v2.RollbackFailedReason)
+				if rollbackConditionErr != nil {
+					err = rollbackConditionErr
+				}
+			case remediation.GetStrategy() == v2.UninstallRemediationStrategy:
+				uninstallErr := uninstall(cfg, hr)
+				uninstallConditionErr := r.handleHelmActionResult(&hr, revision, uninstallErr, "uninstall", v2.UninstalledCondition, v2.UninstallSucceededReason, v2.UninstallFailedReason)
+				if uninstallConditionErr != nil {
+					err = uninstallConditionErr
+				}
+			}
+		}
 	}
-	return v2.HelmReleaseReady(hr, source.GetArtifact().Revision, releaseRevision, valuesChecksum, v2.ReconciliationSucceededReason, "release reconciliation succeeded"), nil
+
+	// Determine release revision after deployment/remediation.
+	rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
+	if observeLastReleaseErr != nil {
+		// Wrap the underlying error so its cause ends up in the status condition and event.
+		err = &ConditionError{
+			Reason: v2.GetLastReleaseFailedReason,
+			Err:    fmt.Errorf("failed to get last release revision after deployment/remediation: %w", observeLastReleaseErr),
+		}
+	}
+	hr.Status.LastReleaseRevision = getReleaseRevision(rel)
+
+	if err != nil {
+		reason := v2.ReconciliationFailedReason
+		var cerr *ConditionError
+		if errors.As(err, &cerr) {
+			reason = cerr.Reason
+		}
+		return v2.HelmReleaseNotReady(hr, reason, err.Error()), err
+	}
+	return v2.HelmReleaseReady(hr, v2.ReconciliationSucceededReason, "release reconciliation succeeded"), nil
 }

 func (r *HelmReleaseReconciler) checkDependencies(hr v2.HelmRelease) error {
@ -493,14 +554,17 @@ func (r *HelmReleaseReconciler) composeValues(ctx context.Context, hr v2.HelmRel
 	return mergeMaps(result, hr.GetValues()), nil
 }

-func (r *HelmReleaseReconciler) handleHelmActionResult(hr v2.HelmRelease, source sourcev1.Source, err error, action string, condition string, succeededReason string, failedReason string) {
+func (r *HelmReleaseReconciler) handleHelmActionResult(hr *v2.HelmRelease, revision string, err error, action string, condition string, succeededReason string, failedReason string) error {
 	if err != nil {
-		v2.SetHelmReleaseCondition(&hr, condition, corev1.ConditionFalse, failedReason, err.Error())
-		r.event(hr, source.GetArtifact().Revision, recorder.EventSeverityError, fmt.Sprintf("Helm %s failed: %s", action, err.Error()))
+		msg := fmt.Sprintf("Helm %s failed: %s", action, err.Error())
+		v2.SetHelmReleaseCondition(hr, condition, corev1.ConditionFalse, failedReason, msg)
+		r.event(*hr, revision, recorder.EventSeverityError, msg)
+		return &ConditionError{Reason: failedReason, Err: errors.New(msg)}
 	} else {
 		msg := fmt.Sprintf("Helm %s succeeded", action)
-		v2.SetHelmReleaseCondition(&hr, condition, corev1.ConditionTrue, succeededReason, msg)
-		r.event(hr, source.GetArtifact().Revision, recorder.EventSeverityInfo, msg)
+		v2.SetHelmReleaseCondition(hr, condition, corev1.ConditionTrue, succeededReason, msg)
+		r.event(*hr, revision, recorder.EventSeverityInfo, msg)
+		return nil
 	}
 }

@ -565,6 +629,23 @@ func helmChartRequiresUpdate(hr v2.HelmRelease, chart sourcev1.HelmChart) bool {
 	}
 }

+// observeLastRelease observes the last revision, if there is one, for the actual Helm release associated with the given HelmRelease. A driver.ErrReleaseNotFound error is cleared, so a missing release is not reported as an error.
+func observeLastRelease(cfg *action.Configuration, hr v2.HelmRelease) (*release.Release, error) {
+	rel, err := cfg.Releases.Last(hr.GetReleaseName())
+	if err != nil && errors.Is(err, driver.ErrReleaseNotFound) {
+		err = nil // a release that does not exist yet is not a failure for callers
+	}
+	return rel, err
+}
+
+// getReleaseRevision returns the revision (rel.Version) of the given release.Release, or 0 when rel is nil.
+func getReleaseRevision(rel *release.Release) int {
+	if rel == nil {
+		return 0
+	}
+	return rel.Version
+}
+
 func install(cfg *action.Configuration, chart *chart.Chart, hr v2.HelmRelease, values chartutil.Values) (*release.Release, error) {
 	install := action.NewInstall(cfg)
 	install.ReleaseName = hr.GetReleaseName()
@ -597,7 +678,7 @@ func upgrade(cfg *action.Configuration, chart *chart.Chart, hr v2.HelmRelease, v
 func test(cfg *action.Configuration, hr v2.HelmRelease) (*release.Release, error) {
 	test := action.NewReleaseTesting(cfg)
 	test.Namespace = hr.GetReleaseNamespace()
-	test.Timeout = hr.Spec.Test.GetTimeout(hr.GetTimeout()).Duration
+	test.Timeout = hr.Spec.GetTest().GetTimeout(hr.GetTimeout()).Duration

 	return test.Run(hr.GetReleaseName())
 }
--- a/docs/api/helmrelease.md
+++ b/docs/api/helmrelease.md
@ -445,6 +445,9 @@ string
 </table>
 </div>
 </div>
+<h3 id="helm.toolkit.fluxcd.io/v2alpha1.DeploymentAction">DeploymentAction
+</h3>
+<p>DeploymentAction defines a consistent interface for Install and Upgrade.</p>
 <h3 id="helm.toolkit.fluxcd.io/v2alpha1.HelmChartTemplate">HelmChartTemplate
 </h3>
 <p>
@ -854,8 +857,31 @@ int64
 </td>
 <td>
 <em>(Optional)</em>
-<p>Failures is the reconciliation failure count. It is reset after a successful
-reconciliation.</p>
+<p>Failures is the reconciliation failure count.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>installFailures</code><br>
+<em>
+int64
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>InstallFailures is the install failure count.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>upgradeFailures</code><br>
+<em>
+int64
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>UpgradeFailures is the upgrade failure count.</p>
 </td>
 </tr>
 </tbody>
@ -897,6 +923,22 @@ for hooks) during the performance of a Helm install action. Defaults to
 </tr>
 <tr>
 <td>
+<code>remediation</code><br>
+<em>
+<a href="#helm.toolkit.fluxcd.io/v2alpha1.InstallRemediation">
+InstallRemediation
+</a>
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>Remediation holds the remediation configuration for when the
+Helm install action for the HelmRelease fails. The default
+is to not perform any action.</p>
+</td>
+</tr>
+<tr>
+<td>
 <code>disableWait</code><br>
 <em>
 bool
@ -963,6 +1005,78 @@ CRDs are installed if not already present.</p>
 </table>
 </div>
 </div>
+<h3 id="helm.toolkit.fluxcd.io/v2alpha1.InstallRemediation">InstallRemediation
+</h3>
+<p>
+(<em>Appears on:</em>
+<a href="#helm.toolkit.fluxcd.io/v2alpha1.Install">Install</a>)
+</p>
+<p>InstallRemediation holds the configuration for Helm install remediation.</p>
+<div class="md-typeset__scrollwrap">
+<div class="md-typeset__table">
+<table>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<code>retries</code><br>
+<em>
+int
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>Retries is the number of retries that should be attempted on failures before
+bailing. Remediation, using an uninstall, is performed between each attempt.
+Defaults to &lsquo;0&rsquo;, a negative integer equals to unlimited retries.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>ignoreTestFailures</code><br>
+<em>
+bool
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>IgnoreTestFailures tells the controller to skip remediation when
+the Helm tests are run after an install action but fail.
+Defaults to &lsquo;Test.IgnoreFailures&rsquo;.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>remediateLastFailure</code><br>
+<em>
+bool
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>RemediateLastFailure tells the controller to remediate the last
+failure, when no retries remain. Defaults to &lsquo;false&rsquo;.</p>
+</td>
+</tr>
+</tbody>
+</table>
+</div>
+</div>
+<h3 id="helm.toolkit.fluxcd.io/v2alpha1.Remediation">Remediation
+</h3>
+<p>Remediation defines a consistent interface for InstallRemediation and UpgradeRemediation.</p>
+<h3 id="helm.toolkit.fluxcd.io/v2alpha1.RemediationStrategy">RemediationStrategy
+(<code>string</code> alias)</h3>
+<p>
+(<em>Appears on:</em>
+<a href="#helm.toolkit.fluxcd.io/v2alpha1.UpgradeRemediation">UpgradeRemediation</a>)
+</p>
+<p>RemediationStrategy returns the strategy to use to remediate a failed install or upgrade.</p>
 <h3 id="helm.toolkit.fluxcd.io/v2alpha1.Rollback">Rollback
 </h3>
 <p>
@ -982,19 +1096,6 @@ CRDs are installed if not already present.</p>
 <tbody>
 <tr>
 <td>
-<code>enable</code><br>
-<em>
-bool
-</em>
-</td>
-<td>
-<em>(Optional)</em>
-<p>Enable enables Helm rollback actions for this HelmRelease after an
-Helm install or upgrade action failure.</p>
-</td>
-</tr>
-<tr>
-<td>
 <code>timeout</code><br>
 <em>
 <a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
@ -1121,6 +1222,21 @@ during the performance of a Helm test action. Defaults to
 &lsquo;HelmReleaseSpec.Timeout&rsquo;.</p>
 </td>
 </tr>
+<tr>
+<td>
+<code>ignoreFailures</code><br>
+<em>
+bool
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>IgnoreFailures tells the controller to skip remediation when
+the Helm tests are run but fail.
+Can be overwritten for tests run after install or upgrade actions
+in &lsquo;Install.IgnoreTestFailures&rsquo; and &lsquo;Upgrade.IgnoreTestFailures&rsquo;.</p>
+</td>
+</tr>
 </tbody>
 </table>
 </div>
@ -1222,15 +1338,18 @@ for hooks) during the performance of a Helm upgrade action. Defaults to
 </tr>
 <tr>
 <td>
-<code>maxRetries</code><br>
+<code>remediation</code><br>
 <em>
-int
+<a href="#helm.toolkit.fluxcd.io/v2alpha1.UpgradeRemediation">
+UpgradeRemediation
+</a>
 </em>
 </td>
 <td>
 <em>(Optional)</em>
-<p>MaxRetries is the number of retries that should be attempted on failures before
-bailing. Defaults to &lsquo;0&rsquo;, a negative integer equals to unlimited retries.</p>
+<p>Remediation holds the remediation configuration for when the
+Helm upgrade action for the HelmRelease fails. The default
+is to not perform any action.</p>
 </td>
 </tr>
 <tr>
@ -1314,6 +1433,84 @@ upgrade action when it fails.</p>
 </table>
 </div>
 </div>
+<h3 id="helm.toolkit.fluxcd.io/v2alpha1.UpgradeRemediation">UpgradeRemediation
+</h3>
+<p>
+(<em>Appears on:</em>
+<a href="#helm.toolkit.fluxcd.io/v2alpha1.Upgrade">Upgrade</a>)
+</p>
+<p>UpgradeRemediation holds the configuration for Helm upgrade remediation.</p>
+<div class="md-typeset__scrollwrap">
+<div class="md-typeset__table">
+<table>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<code>retries</code><br>
+<em>
+int
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>Retries is the number of retries that should be attempted on failures before
+bailing. Remediation, using &lsquo;Strategy&rsquo;, is performed between each attempt.
+Defaults to &lsquo;0&rsquo;, a negative integer equals to unlimited retries.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>ignoreTestFailures</code><br>
+<em>
+bool
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>IgnoreTestFailures tells the controller to skip remediation when
+the Helm tests are run after an upgrade action but fail.
+Defaults to &lsquo;Test.IgnoreFailures&rsquo;.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>remediateLastFailure</code><br>
+<em>
+bool
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>RemediateLastFailure tells the controller to remediate the last
+failure, when no retries remain. Defaults to &lsquo;false&rsquo; unless &lsquo;Retries&rsquo;
+is greater than 0.</p>
+</td>
+</tr>
+<tr>
+<td>
+<code>strategy</code><br>
+<em>
+<a href="#helm.toolkit.fluxcd.io/v2alpha1.RemediationStrategy">
+RemediationStrategy
+</a>
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+<p>Strategy to use for failure remediation.
+Defaults to &lsquo;rollback&rsquo;.</p>
+</td>
+</tr>
+</tbody>
+</table>
+</div>
+</div>
 <h3 id="helm.toolkit.fluxcd.io/v2alpha1.ValuesReference">ValuesReference
 </h3>
 <p>
--- a/docs/spec/v2alpha1/helmreleases.md
+++ b/docs/spec/v2alpha1/helmreleases.md
@ -108,6 +108,12 @@ type Install struct {
 	// +optional
 	Timeout *metav1.Duration `json:"timeout,omitempty"`

+	// Remediation holds the remediation configuration for when the
+	// Helm install action for the HelmRelease fails. The default
+	// is to not perform any action.
+	// +optional
+	Remediation *InstallRemediation `json:"remediation,omitempty"`
+
 	// DisableWait disables the waiting for resources to be ready after a
 	// Helm install has been performed.
 	// +optional
@ -133,6 +139,26 @@ type Install struct {
 	SkipCRDs bool `json:"skipCRDs,omitempty"`
 }

+// InstallRemediation holds the configuration for Helm install remediation.
+type InstallRemediation struct {
+	// Retries is the number of retries that should be attempted on failures before
+	// bailing. Remediation, using an uninstall, is performed between each attempt.
+	// Defaults to '0', a negative integer equals to unlimited retries.
+	// +optional
+	Retries int `json:"retries,omitempty"`
+
+	// IgnoreTestFailures tells the controller to skip remediation when
+	// the Helm tests are run after an install action but fail.
+	// Defaults to 'Test.IgnoreFailures'.
+	// +optional
+	IgnoreTestFailures *bool `json:"ignoreTestFailures,omitempty"`
+
+	// RemediateLastFailure tells the controller to remediate the last
+	// failure, when no retries remain. Defaults to 'false'.
+	// +optional
+	RemediateLastFailure *bool `json:"remediateLastFailure,omitempty"`
+}
+
 // Upgrade holds the configuration for Helm upgrade actions for this HelmRelease.
 type Upgrade struct {
 	// Timeout is the time to wait for any individual Kubernetes operation (like Jobs
@ -141,10 +167,11 @@ type Upgrade struct {
 	// +optional
 	Timeout *metav1.Duration `json:"timeout,omitempty"`

-	// MaxRetries is the number of retries that should be attempted on failures before
-	// bailing. Defaults to '0', a negative integer equals to unlimited retries.
+	// Remediation holds the remediation configuration for when the
+	// Helm upgrade action for the HelmRelease fails. The default
+	// is to not perform any action.
 	// +optional
-	MaxRetries int `json:"maxRetries,omitempty"`
+	Remediation *UpgradeRemediation `json:"remediation,omitempty"`

 	// DisableWait disables the waiting for resources to be ready after a
 	// Helm upgrade has been performed.
@ -176,6 +203,33 @@ type Upgrade struct {
 	CleanupOnFail bool `json:"cleanupOnFail,omitempty"`
 }

+// UpgradeRemediation holds the configuration for Helm upgrade remediation.
+type UpgradeRemediation struct {
+	// Retries is the number of retries that should be attempted on failures before
+	// bailing. Remediation, using 'Strategy', is performed between each attempt.
+	// Defaults to '0', a negative integer equals to unlimited retries.
+	// +optional
+	Retries int `json:"retries,omitempty"`
+
+	// IgnoreTestFailures tells the controller to skip remediation when
+	// the Helm tests are run after an upgrade action but fail.
+	// Defaults to 'Test.IgnoreFailures'.
+	// +optional
+	IgnoreTestFailures *bool `json:"ignoreTestFailures,omitempty"`
+
+	// RemediateLastFailure tells the controller to remediate the last
+	// failure, when no retries remain. Defaults to 'false' unless 'Retries'
+	// is greater than 0.
+	// +optional
+	RemediateLastFailure *bool `json:"remediateLastFailure,omitempty"`
+
+	// Strategy to use for failure remediation.
+	// Defaults to 'rollback'.
+	// +kubebuilder:validation:Enum=rollback;uninstall
+	// +optional
+	Strategy *RemediationStrategy `json:"strategy,omitempty"`
+}
+
 // Test holds the configuration for Helm test actions for this HelmRelease.
 type Test struct {
 	// Enable enables Helm test actions for this HelmRelease after an
@ -188,15 +242,17 @@ type Test struct {
 	// 'HelmReleaseSpec.Timeout'.
 	// +optional
 	Timeout *metav1.Duration `json:"timeout,omitempty"`
+
+	// IgnoreFailures tells the controller to skip remediation when
+	// the Helm tests are run but fail.
+	// Can be overwritten for tests run after install or upgrade actions
+	// in 'Install.IgnoreTestFailures' and 'Upgrade.IgnoreTestFailures'.
+	// +optional
+	IgnoreFailures bool `json:"ignoreFailures,omitempty"`
 }

 // Rollback holds the configuration for Helm rollback actions for this HelmRelease.
 type Rollback struct {
-	// Enable enables Helm rollback actions for this HelmRelease after an
-	// Helm install or upgrade action failure.
-	// +optional
-	Enable bool `json:"enable,omitempty"`
-
 	// Timeout is the time to wait for any individual Kubernetes operation (like Jobs
 	// for hooks) during the performance of a Helm rollback action. Defaults to
 	// 'HelmReleaseSpec.Timeout'.
@ -365,6 +421,9 @@ const (
 	// InitFailedReason represents the fact that the initialization of the Helm configuration failed.
 	InitFailedReason string = "InitFailed"

+	// GetLastReleaseFailedReason represents the fact that observing the last release failed.
+	GetLastReleaseFailedReason string = "GetLastReleaseFailed"
+
 	// ProgressingReason represents the fact that the reconciliation for the resource is underway.
 	ProgressingReason string = "Progressing"

@ -511,14 +570,64 @@ spec:
        memory: 64Mi
 ```

-At present, rollbacks are only supported for failed upgrades. Rollback support for other failed
-actions (i.e. tests) is in the scope of the controller but awaits a proper design.
+## Configuring failure remediation

-## Enabling Helm test actions
+By default, when a Helm action (install/upgrade/test) fails, no remediation
+(uninstall/rollback/retries) is performed. However, remediation can be enabled in several ways
+using `spec.install.remediation` and `spec.upgrade.remediation`.
+
+Each of these supports `retries`, to configure the number of additional attempts after an initial
+failure. A negative integer results in infinite retries. This implicitly opts in to a remediation
+action between each attempt. The remediation action for install failures is an uninstall. The
+remediation action for upgrade failures is a rollback by default; however,
+`spec.upgrade.remediation.strategy` can be set to `uninstall`, in which case after the uninstall,
+the `spec.install` configuration takes over.
+
+One can also opt in to remediation of the last failure (when no retries remain) by:
+
+1. For installs, setting `spec.install.remediation.remediateLastFailure` to `true`.
+2. For upgrades, setting `spec.upgrade.remediation.remediateLastFailure` to `true`, or configuring
+   at least one retry.
+
+```yaml
+apiVersion: helm.toolkit.fluxcd.io/v2alpha1
+kind: HelmRelease
+metadata:
+  name: podinfo
+spec:
+  interval: 5m
+  chart:
+    name: podinfo
+    version: '^4.0.0'
+    sourceRef:
+      kind: HelmRepository
+      name: podinfo
+    interval: 1m
+  install:
+    remediation:
+      retries: 3
+  upgrade:
+    remediation:
+      remediateLastFailure: false
+  values:
+    resources:
+      requests:
+        cpu: 100m
+        memory: 64Mi
+```
+
+## Configuring Helm test actions

 To make the controller run the Helm tests available for your chart after a successful Helm install
 or upgrade, `spec.test.enable` should be set to `true`.

+By default, when tests are enabled, failures in tests are considered release failures, and thus
+are subject to the triggering Helm action's `remediation` configuration. However, test failures
+can be ignored by setting `spec.test.ignoreFailures` to `true`. In this case, no remediation will
+be taken, and the test failure will not affect the `Ready` status condition. This can be further
+configured per Helm action by setting `spec.install.remediation.ignoreTestFailures` or
+`spec.upgrade.remediation.ignoreTestFailures`, which default to `spec.test.ignoreFailures`.
+
 ```yaml
 apiVersion: helm.toolkit.fluxcd.io/v2alpha1
 kind: HelmRelease
@ -535,6 +644,7 @@ spec:
    interval: 1m
  test:
    enable: true
+    ignoreFailures: true
  values:
    resources:
      requests:
@ -542,10 +652,6 @@ spec:
        memory: 64Mi
 ```

-At present, failed tests do not mark the `HelmRelease` as not `Ready`. Making this configurable is
-in the scope of the controller but awaits a proper design, as well as running them on a schedule or
-for other actions than a successful Helm install or upgrade.
-
 ## Status

 When the controller completes a reconciliation, it reports the result in the status sub-resource.
--- a/go.sum
+++ b/go.sum
@ -79,6 +79,7 @@ github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kB
 github.com/bitly/go-simplejson v0.5.0 h1:6IH+V8/tVMab511d5bn4M7EwGXZf9Hj6i2xSwkNEM+Y=
 github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA=
 github.com/blang/semver v3.1.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
+github.com/blang/semver v3.5.0+incompatible h1:CGxCgetQ64DKk7rdZ++Vfnb1+ogGNnB17OJKJXD2Cfs=
 github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
 github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=
 github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
@ -469,6 +470,7 @@ github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceT
 github.com/mitchellh/copystructure v1.0.0 h1:Laisrj+bAB6b/yJwB5Bt3ITZhGJdqmxquMKeZ+mmkFQ=
 github.com/mitchellh/copystructure v1.0.0/go.mod h1:SNtv71yrdKgLRyLFxmLdkAbkKEFWgYaq1OVrnRcwhnw=
 github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
+github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI=
 github.com/mitchellh/go-wordwrap v1.0.0 h1:6GlHJ/LTGMrIJbwgdqdl2eEH8o+Exx/0m8ir9Gns0u4=
@ -901,6 +903,7 @@ gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo=
 gopkg.in/square/go-jose.v2 v2.2.2/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
+gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
 gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
 gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=
 gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=