/*
Copyright 2022 SUSE.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/collections"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"

	controlplanev1 "github.com/rancher/cluster-api-provider-rke2/controlplane/api/v1beta1"
	"github.com/rancher/cluster-api-provider-rke2/pkg/rke2"
)

// reconcileUnhealthyMachines tries to remediate RKE2ControlPlane unhealthy machines
// based on the process described in
// https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
//
// Adapted from kubeadm.
//
//nolint:lll
func (r *RKE2ControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *rke2.ControlPlane) (ret ctrl.Result, retErr error) { //nolint:gocyclo
	log := ctrl.LoggerFrom(ctx)
	reconciliationTime := time.Now().UTC()

	// Clean up pending remediation actions that did not complete for any reason (e.g. the number of current replicas is less than or equal to 1)
	// if the underlying machine is now back to healthy / not deleting.
	errList := []error{}

	for _, m := range controlPlane.Machines {
		if !m.DeletionTimestamp.IsZero() {
			continue
		}

		shouldCleanup := conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
			conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition)

		if !shouldCleanup {
			continue
		}

		patchHelper, err := patch.NewHelper(m, r.Client)
		if err != nil {
			errList = append(errList, err)

			continue
		}

		conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)

		if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.MachineOwnerRemediatedCondition,
		}}); err != nil {
			errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", m.Name))
		}
	}

	if len(errList) > 0 {
		return ctrl.Result{}, kerrors.NewAggregate(errList)
	}

	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
	// and `MachineOwnerRemediated` set to false, indicating that this controller is responsible for performing remediation.
	machinesToBeRemediated := controlPlane.MachinesToBeRemediatedByRCP()

	// If there are no machines to be remediated, return so RKE2ControlPlane can proceed with other operations (ctrl.Result nil).
	if len(machinesToBeRemediated) == 0 {
		return ctrl.Result{}, nil
	}

	// Select the machine to be remediated, which is the oldest machine to be remediated not yet provisioned (if any)
	// or the oldest machine to be remediated.
	//
	// NOTE: The current solution is considered acceptable for the most frequent use case (only one machine to be remediated);
	// however, in the future this could potentially be improved for the scenario where more than one machine to be remediated exists
	// by considering which machine has the lower impact on etcd quorum.
	machineToBeRemediated := getMachineToBeRemediated(machinesToBeRemediated, controlPlane.IsEtcdManaged())
	if machineToBeRemediated == nil {
		return ctrl.Result{}, errors.New("failed to find a Machine to remediate within unhealthy Machines")
	}

	// Returns if the machine is in the process of being deleted.
	if !machineToBeRemediated.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, nil
	}

	log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.RCP.Status.Initialized)

	// Returns if another remediation is in progress but the new Machine is not yet created.
	// Note: This condition is checked after we check for machines to be remediated and if machineToBeRemediated
	// is not being deleted to avoid unnecessary logs if no further remediation should be done.
	if v, ok := controlPlane.RCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
		// Check if the annotation is stale; this might happen in case there is a crash in the controller in between
		// when a new Machine is created and the annotation is eventually removed from RKE2ControlPlane via defer patch at the end
		// of RKE2ControlPlane reconcile.
		remediationData, err := RemediationDataFromAnnotation(v)
		if err != nil {
			return ctrl.Result{}, err
		}

		staleAnnotation := false

		for _, m := range controlPlane.Machines.UnsortedList() {
			if m.CreationTimestamp.After(remediationData.Timestamp.Time) {
				// Remove the annotation tracking that a remediation is in progress (the annotation is stale).
				delete(controlPlane.RCP.Annotations, controlplanev1.RemediationInProgressAnnotation)

				staleAnnotation = true

				break
			}
		}

		if !staleAnnotation {
			log.Info("Another remediation is already in progress. Skipping remediation.")

			return ctrl.Result{}, nil
		}
	}
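
	// For illustration (hypothetical timestamps): if the annotation above was written at T0 and any Machine has a
	// creationTimestamp later than T0, the replacement Machine was already created, so the annotation is stale and
	// remediation can proceed for the current unhealthy Machine.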

	patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to patch the Machine conditions after each reconcileUnhealthyMachines.
		if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.MachineOwnerRemediatedCondition,
		}}); err != nil {
			log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name)

			if retErr == nil {
				retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
			}
		}
	}()

	// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
	// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

	// Check if RKE2ControlPlane is allowed to remediate considering retry limits:
	// - Remediation cannot happen because retryPeriod has not yet expired.
	// - RKE2ControlPlane already reached the MaxRetry limit.
	remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime)
	if err != nil {
		return ctrl.Result{}, err
	}

	if !canRemediate {
		// NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits.
		return ctrl.Result{}, nil
	}

	// Executes checks that apply only if the control plane is already initialized; in this case RKE2ControlPlane can
	// remediate only if it can safely assume that the operation preserves the operational state of the
	// existing cluster (or at least doesn't make it worse).
	if controlPlane.RCP.Status.Initialized {
		// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
		if controlPlane.Machines.Len() <= 1 {
			log.Info(
				"A control plane machine needs remediation, but the number of current replicas is less than or equal to 1. Skipping remediation",
				"replicas", controlPlane.Machines.Len(),
			)
			conditions.MarkFalse(
				machineToBeRemediated,
				clusterv1.MachineOwnerRemediatedCondition,
				clusterv1.WaitingForRemediationReason,
				clusterv1.ConditionSeverityWarning,
				"RKE2ControlPlane can't remediate if current replicas are less than or equal to 1",
			)

			return ctrl.Result{}, nil
		}

		// The cluster MUST NOT have healthy machines still being provisioned.
		// This rule prevents RKE2ControlPlane from taking actions while the cluster is in a transitional state.
		if controlPlane.HasHealthyMachineStillProvisioning() {
			log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation")
			conditions.MarkFalse(
				machineToBeRemediated,
				clusterv1.MachineOwnerRemediatedCondition,
				clusterv1.WaitingForRemediationReason,
				clusterv1.ConditionSeverityWarning,
				"RKE2ControlPlane waiting for control plane machine provisioning to complete before triggering remediation",
			)

			return ctrl.Result{}, nil
		}

		// The cluster MUST have no machines with a deletion timestamp. This rule prevents RKE2ControlPlane from taking actions while the cluster is in a transitional state.
		if controlPlane.HasDeletingMachine() {
			log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
			conditions.MarkFalse(
				machineToBeRemediated,
				clusterv1.MachineOwnerRemediatedCondition,
				clusterv1.WaitingForRemediationReason,
				clusterv1.ConditionSeverityWarning,
				"RKE2ControlPlane waiting for control plane machine deletion to complete before triggering remediation",
			)

			return ctrl.Result{}, nil
		}

		// Remediation MUST preserve etcd quorum. This rule ensures that RKE2ControlPlane will not remove a member that would result in etcd
		// losing a majority of members and thus becoming unable to field new requests.
		if controlPlane.IsEtcdManaged() && controlPlane.UsesEmbeddedEtcd() {
			canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
			if err != nil {
				conditions.MarkFalse(
					machineToBeRemediated,
					clusterv1.MachineOwnerRemediatedCondition,
					clusterv1.RemediationFailedReason,
					clusterv1.ConditionSeverityError,
					"%s", err.Error(),
				)

				return ctrl.Result{}, err
			}

			if !canSafelyRemediate {
				log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
				conditions.MarkFalse(
					machineToBeRemediated,
					clusterv1.MachineOwnerRemediatedCondition,
					clusterv1.WaitingForRemediationReason,
					clusterv1.ConditionSeverityWarning,
					"RKE2ControlPlane can't remediate this machine because this could result in etcd losing quorum",
				)

				return ctrl.Result{}, nil
			}
		}

		// Start remediating the unhealthy control plane machine by deleting it.
		// A new machine will come up, completing the operation as part of the regular reconcile.

		// If the control plane is initialized, before deleting the machine:
		// - if the machine hosts the etcd leader, forward etcd leadership to another machine.
		// - delete the etcd member hosted on the machine being deleted.
		workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
		if err != nil {
			log.Error(err, "Failed to create client to workload cluster")

			return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
		}

		// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
		// NOTE: etcd member removal will be performed by the rke2-cleanup hook after the machine completes drain & all volumes are detached.
		if controlPlane.IsEtcdManaged() && controlPlane.UsesEmbeddedEtcd() {
			etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
			if etcdLeaderCandidate == nil {
				log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
				conditions.MarkFalse(
					machineToBeRemediated,
					clusterv1.MachineOwnerRemediatedCondition,
					clusterv1.RemediationFailedReason,
					clusterv1.ConditionSeverityWarning,
					"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation",
				)

				return ctrl.Result{}, nil
			}

			if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
				log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
				conditions.MarkFalse(
					machineToBeRemediated,
					clusterv1.MachineOwnerRemediatedCondition,
					clusterv1.RemediationFailedReason,
					clusterv1.ConditionSeverityError,
					"%s", err.Error(),
				)

				return ctrl.Result{}, err
			}
		}
	}

	// Delete the machine.
	if err := r.Delete(ctx, machineToBeRemediated); err != nil {
		conditions.MarkFalse(
			machineToBeRemediated,
			clusterv1.MachineOwnerRemediatedCondition,
			clusterv1.RemediationFailedReason,
			clusterv1.ConditionSeverityError,
			"%s", err.Error(),
		)

		return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
	}

	// Surface that the operation is in progress.
	// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
	// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
	log.Info("Remediating unhealthy machine")
	conditions.MarkFalse(
		machineToBeRemediated,
		clusterv1.MachineOwnerRemediatedCondition,
		clusterv1.RemediationInProgressReason,
		clusterv1.ConditionSeverityWarning,
		"",
	)

	// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
	remediationInProgressValue, err := remediationInProgressData.Marshal()
	if err != nil {
		return ctrl.Result{}, err
	}

	// Set annotations tracking remediation details so they can be picked up by the machine
	// that will be created as part of the scale up action that completes the remediation.
	annotations.AddAnnotations(controlPlane.RCP, map[string]string{
		controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue,
	})

	return ctrl.Result{Requeue: true}, nil
}

// getMachineToBeRemediated returns the machine to be remediated, which is the "most broken" among the unhealthy machines,
// determined as the machine having the highest-priority issue that the other machines do not have.
// The following issues are considered (from highest to lowest priority):
// - machine with the RemediateMachineAnnotation annotation
// - machine without .status.nodeRef
// - machine with etcd issue or etcd status unknown (etcd member, etcd pod)
// - machine with control plane component issue or status unknown (API server, controller manager, scheduler)
//
// Note: In case of more than one faulty machine, the chance to recover mostly depends on the control plane being able to
// successfully create a replacement Machine, because due to scale-up preflight checks this cannot happen if there are
// still issues on the control plane after the first remediation.
// This func tries to maximize the chances of a successful remediation by picking the "most broken" machine first.
func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdManaged bool) *clusterv1.Machine {
	if unhealthyMachines.Len() == 0 {
		return nil
	}

	machinesToBeRemediated := unhealthyMachines.UnsortedList()
	if len(machinesToBeRemediated) == 1 {
		return machinesToBeRemediated[0]
	}

	sort.Slice(machinesToBeRemediated, func(i, j int) bool {
		return pickMachineToBeRemediated(machinesToBeRemediated[i], machinesToBeRemediated[j], isEtcdManaged)
	})

	return machinesToBeRemediated[0]
}
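
// For illustration (hypothetical machines, assuming isEtcdManaged): given m1 without .status.nodeRef, m2 with
// MachineEtcdMemberHealthyCondition=False, and m3 merely the oldest, pickMachineToBeRemediated orders them
// m1, m2, m3, so m1 is returned for remediation first.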

// pickMachineToBeRemediated returns true if machine i should be remediated before machine j.
func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool {
	// If one machine has the RemediateMachineAnnotation annotation, remediate it first.
	if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) {
		return true
	}

	if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) {
		return false
	}

	// If one machine does not have a node ref, we assume that provisioning failed and there are no CP components at all,
	// so remediate it first; also, without a node it is not possible to get further info about status.
	if i.Status.NodeRef == nil && j.Status.NodeRef != nil {
		return true
	}

	if i.Status.NodeRef != nil && j.Status.NodeRef == nil {
		return false
	}

	// If one machine has an unhealthy etcd member or pod, remediate it first.
	if isEtcdManaged {
		if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineEtcdMemberHealthyCondition); p != nil {
			return *p
		}
	}

	// If one machine has an unhealthy agent, remediate it first.
	if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineAgentHealthyCondition); p != nil {
		return *p
	}

	// Use oldest (and Name) as a tie-breaker criterion.
	if i.CreationTimestamp.Equal(&j.CreationTimestamp) {
		return i.Name < j.Name
	}

	return i.CreationTimestamp.Before(&j.CreationTimestamp)
}

// pickMachineToBeRemediatedByConditionState returns true if condition t reports an issue on machine i and not on machine j,
// false if the vice versa applies, or nil if condition t doesn't provide a discriminating criterion for picking one machine
// over the other for remediation.
func pickMachineToBeRemediatedByConditionState(i, j *clusterv1.Machine, t clusterv1.ConditionType) *bool {
	iCondition := conditions.IsTrue(i, t)
	jCondition := conditions.IsTrue(j, t)

	if !iCondition && jCondition {
		return ptr.To(true)
	}

	if iCondition && !jCondition {
		return ptr.To(false)
	}

	return nil
}
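
// For illustration: if condition t is False (or missing) on i while True on j, the func returns ptr.To(true), i.e.
// remediate i first; if t has the same state on both machines it returns nil, and the caller falls through to the
// next criterion.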

// checkRetryLimits checks if RKE2ControlPlane is allowed to remediate considering retry limits:
// - Remediation cannot happen because retryPeriod has not yet expired.
// - RKE2ControlPlane already reached the maximum number of retries for a machine.
// NOTE: Counting the number of retries is required in order to prevent infinite remediation, e.g. in case the
// first Control Plane machine is failing due to a quota issue.
func (r *RKE2ControlPlaneReconciler) checkRetryLimits(
	log logr.Logger,
	machineToBeRemediated *clusterv1.Machine,
	controlPlane *rke2.ControlPlane,
	reconciliationTime time.Time,
) (*RemediationData, bool, error) {
	// Get last remediation info from the machine.
	var lastRemediationData *RemediationData

	if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok {
		l, err := RemediationDataFromAnnotation(value)
		if err != nil {
			return nil, false, err
		}

		lastRemediationData = l
	}

	remediationInProgressData := &RemediationData{
		Machine:    machineToBeRemediated.Name,
		Timestamp:  metav1.Time{Time: reconciliationTime},
		RetryCount: 0,
	}

	// If there is no last remediation, this is the first try of a new retry sequence.
	if lastRemediationData == nil {
		return remediationInProgressData, true, nil
	}

	// Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or uses defaults.
	minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
	if controlPlane.RCP.Spec.RemediationStrategy != nil && controlPlane.RCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
		minHealthyPeriod = controlPlane.RCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
	}

	retryPeriod := time.Duration(0)
	if controlPlane.RCP.Spec.RemediationStrategy != nil {
		retryPeriod = controlPlane.RCP.Spec.RemediationStrategy.RetryPeriod.Duration
	}

	// Gets the timestamp of the last remediation; if missing, default to a value
	// that ensures both MinHealthyPeriod and RetryPeriod are expired.
	// NOTE: this could potentially lead to executing more retries than expected, or to executing retries earlier than
	// expected, but this is considered acceptable when the system recovers after someone/something changed or deleted
	// the RemediationForAnnotation on Machines.
	lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
	if !lastRemediationData.Timestamp.IsZero() {
		lastRemediationTime = lastRemediationData.Timestamp.Time
	}
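
	// For illustration (hypothetical values): with minHealthyPeriod=1h and retryPeriod=10m, a missing timestamp
	// defaults lastRemediationTime to reconciliationTime-2h, safely before both windows, so the checks below
	// treat this as a fresh remediation.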

	// Once we get here, we already know that there was a last remediation for the Machine.
	// If the current remediation is happening before minHealthyPeriod expires, then RKE2ControlPlane considers this
	// a remediation for the same previously unhealthy machine.
	// NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
	// this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
	var retryForSameMachineInProgress bool
	if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
		retryForSameMachineInProgress = true
		log = log.WithValues("remediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
	}
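
	// For illustration (hypothetical values): with minHealthyPeriod=1h, a machine remediated at 10:00 whose replacement
	// turns unhealthy again at 10:30 is treated as a retry for the same machine; a failure after 11:00 would instead
	// start a new retry sequence.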

	// If a retry for the same machine is not in progress, this is the first try of a new retry sequence.
	if !retryForSameMachineInProgress {
		return remediationInProgressData, true, nil
	}

	// If the remediation is for the same machine, carry over the retry count.
	remediationInProgressData.RetryCount = lastRemediationData.RetryCount

	// Check if remediation can happen because retryPeriod has passed.
	if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
		log.Info(
			fmt.Sprintf(
				"A control plane machine needs remediation, but the operation already failed in the last %s. Skipping remediation",
				retryPeriod,
			),
		)
		conditions.MarkFalse(
			machineToBeRemediated,
			clusterv1.MachineOwnerRemediatedCondition,
			clusterv1.WaitingForRemediationReason,
			clusterv1.ConditionSeverityWarning,
			"RKE2ControlPlane can't remediate this machine because the operation already failed in the last %s (RetryPeriod)",
			retryPeriod,
		)

		return remediationInProgressData, false, nil
	}

	// Check if remediation can happen because maxRetry is not reached yet, if defined.
	if controlPlane.RCP.Spec.RemediationStrategy != nil && controlPlane.RCP.Spec.RemediationStrategy.MaxRetry != nil {
		maxRetry := int(*controlPlane.RCP.Spec.RemediationStrategy.MaxRetry)
		if remediationInProgressData.RetryCount >= maxRetry {
			log.Info(
				fmt.Sprintf(
					"A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation",
					remediationInProgressData.RetryCount,
					maxRetry,
				),
			)
			conditions.MarkFalse(
				machineToBeRemediated,
				clusterv1.MachineOwnerRemediatedCondition,
				clusterv1.WaitingForRemediationReason,
				clusterv1.ConditionSeverityWarning,
				"RKE2ControlPlane can't remediate this machine because the operation already failed %d times (MaxRetry)",
				maxRetry,
			)

			return remediationInProgressData, false, nil
		}
	}
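
	// For illustration (hypothetical value): with MaxRetry=3, the first remediation for a machine stores RetryCount=0;
	// each subsequent retry for the same machine carries the count over and increments it below, so once the
	// carried-over count reaches 3 the check above skips further remediation.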

	// All the checks passed; increase the remediation retry count.
	remediationInProgressData.RetryCount++

	return remediationInProgressData, true, nil
}

// canSafelyRemoveEtcdMember assesses if it is possible to remove the member hosted on the machine to be remediated
// without losing etcd quorum.
//
// The answer mostly depends on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance):
//   - a 3-CP cluster does not tolerate additional failing members on top of the one being deleted (the target
//     cluster size after deletion is 2, fault tolerance 0)
//   - a 5-CP cluster tolerates 1 additional failing member on top of the one being deleted (the target
//     cluster size after deletion is 4, fault tolerance 1)
//   - a 7-CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
//     cluster size after deletion is 6, fault tolerance 2)
//   - etc.
//
// NOTE: this func assumes the list of members is in sync with the list of machines/nodes; it is required to call
// reconcileEtcdMembers as well as reconcileControlPlaneAndMachinesConditions before this.
func (r *RKE2ControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context,
	controlPlane *rke2.ControlPlane,
	machineToBeRemediated *clusterv1.Machine,
) (bool, error) {
	log := ctrl.LoggerFrom(ctx)

	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
	if err != nil {
		return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
	}

	// Gets the etcd status.
	// This makes it possible for the set of etcd member statuses to differ from the MHC healthy/unhealthy conditions.
	etcdMembers, err := workloadCluster.EtcdMembers(ctx)
	if err != nil {
		return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
	}

	currentTotalMembers := len(etcdMembers)

	log.Info("etcd cluster before remediation",
		"currentTotalMembers", currentTotalMembers,
		"currentMembers", etcdMembers)

	// Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated.
	targetTotalMembers := 0
	targetUnhealthyMembers := 0

	healthyMembers := []string{}
	unhealthyMembers := []string{}

	for _, etcdMember := range etcdMembers {
		// Skip the machine to be deleted because it won't be part of the target etcd cluster.
		if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember {
			continue
		}

		// Include the member in the target etcd cluster.
		targetTotalMembers++

		// Search for the machine corresponding to the etcd member.
		var machine *clusterv1.Machine

		for _, m := range controlPlane.Machines {
			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
				machine = m

				break
			}
		}

		// If an etcd member does not have a corresponding machine, it is not possible to retrieve etcd member health,
		// so RKE2ControlPlane assumes the worst scenario and considers the member unhealthy.
		//
		// NOTE: This should not happen given that RKE2ControlPlane is running reconcileEtcdMembers before calling this method.
		if machine == nil {
			log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "memberName", etcdMember)

			targetUnhealthyMembers++

			unhealthyMembers = append(unhealthyMembers, etcdMember+" (no machine)")

			continue
		}

		// Check member health as reported by the machine's health conditions.
		if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
			targetUnhealthyMembers++

			unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))

			continue
		}

		healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
	}

	// See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for an explanation of the fault tolerance formula.
	targetQuorum := (targetTotalMembers / 2.0) + 1 //nolint:mnd
	canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum
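
	// Worked example (illustrative): remediating one machine of a 5-member cluster leaves targetTotalMembers=4,
	// so targetQuorum = 4/2+1 = 3 (integer division) and remediation is safe only while at most 1 of the remaining
	// members is unhealthy; for a 3-member cluster no additional unhealthy member is tolerated.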

	log.Info("etcd cluster projected after remediation of "+machineToBeRemediated.Name,
		"healthyMembers", healthyMembers,
		"unhealthyMembers", unhealthyMembers,
		"targetTotalMembers", targetTotalMembers,
		"targetQuorum", targetQuorum,
		"targetUnhealthyMembers", targetUnhealthyMembers,
		"canSafelyRemediate", canSafelyRemediate)

	return canSafelyRemediate, nil
}

// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in RKE2ControlPlane
// during remediation, and then in the RemediationForAnnotation on the replacement machine once it is created.
type RemediationData struct {
	// machine is the name of the latest machine being remediated.
	Machine string `json:"machine"`

	// timestamp is when the last remediation happened. It is represented in RFC3339 form and is in UTC.
	Timestamp metav1.Time `json:"timestamp"`

	// retryCount is used to keep track of remediation retries for the last remediated machine.
	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
	RetryCount int `json:"retryCount"`
}
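
// For illustration, a marshalled annotation value looks like (machine name and timestamp are hypothetical):
//
//	{"machine":"my-cluster-control-plane-abcde","timestamp":"2024-01-01T10:00:00Z","retryCount":1}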

// RemediationDataFromAnnotation gets RemediationData from an annotation value.
func RemediationDataFromAnnotation(value string) (*RemediationData, error) {
	ret := &RemediationData{}
	if err := json.Unmarshal([]byte(value), ret); err != nil {
		return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason)
	}

	return ret, nil
}

// Marshal converts a RemediationData into an annotation value.
func (r *RemediationData) Marshal() (string, error) {
	b, err := json.Marshal(r)
	if err != nil {
		return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason)
	}

	return string(b), nil
}

// ToStatus converts a RemediationData into a LastRemediationStatus struct.
func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus {
	return &controlplanev1.LastRemediationStatus{
		Machine:    r.Machine,
		Timestamp:  r.Timestamp,
		RetryCount: r.RetryCount,
	}
}