Merge pull request #627 from MaxFedotov/mhc_remediation

 Support RKE2ControlPlane mhc remediation
This commit is contained in:
Furkat Gofurov 2025-05-08 07:30:26 +00:00 committed by GitHub
commit d0a0ed1624
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 1419 additions and 28 deletions

View File

@ -327,7 +327,7 @@ docker-build: buildx-machine docker-pull-prerequisites
.PHONY: docker-build-rke2-bootstrap
docker-build-rke2-bootstrap:
docker-build-rke2-bootstrap:
DOCKER_BUILDKIT=1 BUILDX_BUILDER=$(MACHINE) docker buildx build \
--platform $(ARCH) \
--load \
@ -395,7 +395,7 @@ kubectl: # Download kubectl cli into tools bin folder
##@ e2e:
# Allow overriding the e2e configurations
GINKGO_FOCUS ?= Workload cluster creation
GINKGO_FOCUS ?=
GINKGO_SKIP ?= API Version Upgrade
GINKGO_NODES ?= 1
GINKGO_NOCOLOR ?= false

View File

@ -57,6 +57,10 @@ func (src *RKE2ControlPlane) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.AgentConfig.PodSecurityAdmissionConfigFile = restored.Spec.AgentConfig.PodSecurityAdmissionConfigFile
}
if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
dst.Spec.ServerConfig.EmbeddedRegistry = restored.Spec.ServerConfig.EmbeddedRegistry
dst.Spec.MachineTemplate = restored.Spec.MachineTemplate
dst.Status = restored.Status

View File

@ -428,6 +428,7 @@ func autoConvert_v1beta1_RKE2ControlPlaneSpec_To_v1alpha1_RKE2ControlPlaneSpec(i
out.RegistrationMethod = RegistrationMethod(in.RegistrationMethod)
out.RegistrationAddress = in.RegistrationAddress
out.RolloutStrategy = (*RolloutStrategy)(unsafe.Pointer(in.RolloutStrategy))
// WARNING: in.RemediationStrategy requires manual conversion: does not exist in peer-type
return nil
}
@ -461,6 +462,7 @@ func autoConvert_v1beta1_RKE2ControlPlaneStatus_To_v1alpha1_RKE2ControlPlaneStat
out.UpdatedReplicas = in.UpdatedReplicas
out.UnavailableReplicas = in.UnavailableReplicas
out.AvailableServerIPs = *(*[]string)(unsafe.Pointer(&in.AvailableServerIPs))
// WARNING: in.LastRemediation requires manual conversion: does not exist in peer-type
return nil
}

View File

@ -17,6 +17,8 @@ limitations under the License.
package v1beta1
import (
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
@ -40,6 +42,23 @@ const (
// LegacyRKE2ControlPlane is a controlplane annotation that marks the CP as legacy. This CP will not provide
// etcd certificate management or etcd membership management.
LegacyRKE2ControlPlane = "controlplane.cluster.x-k8s.io/legacy"
// RemediationInProgressAnnotation is used to keep track that an RCP remediation is in progress, and more
// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
// failures in updating remediation retry or remediation count (both counters restart from zero).
RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"
// RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
// please note that in case of retry, when the remediating machine also fails, the system keeps track of
// the first machine of the sequence only.
// NOTE: if something external to CAPI removes this annotation, the system cannot track the remediation sequence; this can lead to
// failures in updating the remediation retry count (the counter restarts from zero).
RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"
// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
// machine unrelated to the previous remediation.
DefaultMinHealthyPeriod = 1 * time.Hour
)
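For illustration, both remediation annotations carry the JSON-encoded RemediationData produced by the remediation controller (see Marshal and RemediationDataFromAnnotation below); the machine name and timestamp here are assumed values, not anything defined by this change:

controlplane.cluster.x-k8s.io/remediation-in-progress: '{"machine":"my-cluster-cp-abc12","timestamp":"2025-05-08T07:30:00Z","retryCount":0}'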
// RKE2ControlPlaneSpec defines the desired state of RKE2ControlPlane.
@ -98,6 +117,10 @@ type RKE2ControlPlaneSpec struct {
// The RolloutStrategy to use to replace control plane machines with new ones.
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy"`
// remediationStrategy is the RemediationStrategy that controls how control plane machine remediation happens.
// +optional
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}
// RKE2ControlPlaneMachineTemplate defines the template for Machines
@ -265,6 +288,10 @@ type RKE2ControlPlaneStatus struct {
// AvailableServerIPs is a list of the Control Plane IP addresses that can be used to register further nodes.
// +optional
AvailableServerIPs []string `json:"availableServerIPs,omitempty"`
// lastRemediation stores info about last remediation performed.
// +optional
LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
}
// +kubebuilder:object:root=true
@ -423,6 +450,70 @@ const (
SnapshotValidationWebhook DisabledPluginComponent = "rke2-snapshot-validation-webhook"
)
// RemediationStrategy allows to define how control plane machine remediation happens.
type RemediationStrategy struct {
// maxRetry is the maximum number of retries while attempting to remediate an unhealthy machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
// For example, given a control plane with three machines M1, M2, M3:
//
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (the replacement of M1) has problems while bootstrapping, it will become unhealthy and then be
// remediated; this operation is considered a retry, remediation-retry #1.
// If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
//
// A retry can happen only after RetryPeriod has elapsed since the previous retry.
// If a machine is marked as unhealthy after the MinHealthyPeriod from the previous remediation has expired,
// it is no longer considered a retry because the new issue is assumed to be unrelated to the previous one.
//
// If not set, the remediation will be retried infinitely.
// +optional
MaxRetry *int32 `json:"maxRetry,omitempty"`
// retryPeriod is the duration that RKE2ControlPlane should wait before remediating a machine being created as a replacement
// for an unhealthy machine (a retry).
//
// If not set, a retry will happen immediately.
// +optional
RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`
// minHealthyPeriod defines the duration after which RKE2ControlPlane considers any failure of a machine unrelated
// to the previous one. In that case the remediation is no longer considered a retry, and the retry
// counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (the default):
//
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (the replacement of M1) has problems within 1h of its creation, this machine
// will also be remediated, and the operation is considered a retry because the problem
// is assumed to be related to the original issue on M1.
//
// If instead the problem on M1-1 happens after MinHealthyPeriod has expired, e.g. four days after
// M1-1 was created as a remediation of M1, the problem on M1-1 is considered unrelated to
// the original issue on M1.
//
// If not set, this value is defaulted to 1h.
// +optional
MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"`
}
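A minimal sketch of how these fields could be set on an RKE2ControlPlane spec in Go, assuming an rcp *RKE2ControlPlane already in hand and ptr from k8s.io/utils/ptr; the values shown are illustrative, not defaults introduced by this change:

rcp.Spec.RemediationStrategy = &RemediationStrategy{
	MaxRetry:         ptr.To(int32(3)),                           // give up after 3 retries for the same machine
	RetryPeriod:      metav1.Duration{Duration: 5 * time.Minute}, // wait at least 5m between retries
	MinHealthyPeriod: &metav1.Duration{Duration: 1 * time.Hour},  // later failures count as a new, unrelated issue
}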
// LastRemediationStatus stores info about last remediation performed.
// NOTE: if for any reason information about the last remediation is lost, RetryCount restarts from 0 and thus
// more remediations than expected might happen.
type LastRemediationStatus struct {
// machine is the machine name of the latest machine being remediated.
// +required
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=253
Machine string `json:"machine"`
// timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
// +required
Timestamp metav1.Time `json:"timestamp"`
// retryCount used to keep track of remediation retry for the last remediated machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
// +required
RetryCount int `json:"retryCount"`
}
// RolloutStrategy describes how to replace existing machines
// with new ones.
type RolloutStrategy struct {

View File

@ -125,6 +125,22 @@ func (in *EtcdS3) DeepCopy() *EtcdS3 {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *LastRemediationStatus) DeepCopyInto(out *LastRemediationStatus) {
*out = *in
in.Timestamp.DeepCopyInto(&out.Timestamp)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LastRemediationStatus.
func (in *LastRemediationStatus) DeepCopy() *LastRemediationStatus {
if in == nil {
return nil
}
out := new(LastRemediationStatus)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RKE2ControlPlane) DeepCopyInto(out *RKE2ControlPlane) {
*out = *in
@ -239,6 +255,11 @@ func (in *RKE2ControlPlaneSpec) DeepCopyInto(out *RKE2ControlPlaneSpec) {
*out = new(RolloutStrategy)
(*in).DeepCopyInto(*out)
}
if in.RemediationStrategy != nil {
in, out := &in.RemediationStrategy, &out.RemediationStrategy
*out = new(RemediationStrategy)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RKE2ControlPlaneSpec.
@ -276,6 +297,11 @@ func (in *RKE2ControlPlaneStatus) DeepCopyInto(out *RKE2ControlPlaneStatus) {
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.LastRemediation != nil {
in, out := &in.LastRemediation, &out.LastRemediation
*out = new(LastRemediationStatus)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RKE2ControlPlaneStatus.
@ -431,6 +457,32 @@ func (in *RKE2ServerConfig) DeepCopy() *RKE2ServerConfig {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RemediationStrategy) DeepCopyInto(out *RemediationStrategy) {
*out = *in
if in.MaxRetry != nil {
in, out := &in.MaxRetry, &out.MaxRetry
*out = new(int32)
**out = **in
}
out.RetryPeriod = in.RetryPeriod
if in.MinHealthyPeriod != nil {
in, out := &in.MinHealthyPeriod, &out.MinHealthyPeriod
*out = new(v1.Duration)
**out = **in
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationStrategy.
func (in *RemediationStrategy) DeepCopy() *RemediationStrategy {
if in == nil {
return nil
}
out := new(RemediationStrategy)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RollingUpdate) DeepCopyInto(out *RollingUpdate) {
*out = *in

View File

@ -1952,6 +1952,54 @@ spec:
- control-plane-endpoint
- ""
type: string
remediationStrategy:
description: remediationStrategy is the RemediationStrategy that controls
how control plane machine remediation happens.
properties:
maxRetry:
description: "maxRetry is the Max number of retries while attempting
to remediate an unhealthy machine.\nA retry happens when a machine
that was created as a replacement for an unhealthy machine also
fails.\nFor example, given a control plane with three machines
M1, M2, M3:\n\n\tM1 become unhealthy; remediation happens, and
M1-1 is created as a replacement.\n\tIf M1-1 (replacement of
M1) has problems while bootstrapping it will become unhealthy,
and then be\n\tremediated; such operation is considered a retry,
remediation-retry #1.\n\tIf M1-2 (replacement of M1-1) becomes
unhealthy, remediation-retry #2 will happen, etc.\n\nA retry
could happen only after RetryPeriod from the previous retry.\nIf
a machine is marked as unhealthy after MinHealthyPeriod from
the previous remediation expired,\nthis is not considered a
retry anymore because the new issue is assumed unrelated from
the previous one.\n\nIf not set, the remediation will be retried
infinitely."
format: int32
type: integer
minHealthyPeriod:
description: "minHealthyPeriod defines the duration after which
RKE2ControlPlane will consider any failure to a machine unrelated\nfrom
the previous one. In this case the remediation is not considered
a retry anymore, and thus the retry\ncounter restarts from 0.
For example, assuming MinHealthyPeriod is set to 1h (default)\n\n\tM1
become unhealthy; remediation happens, and M1-1 is created as
a replacement.\n\tIf M1-1 (replacement of M1) has problems within
the 1hr after the creation, also\n\tthis machine will be remediated
and this operation is considered a retry - a problem related\n\tto
the original issue happened to M1 -.\n\n\tIf instead the problem
on M1-1 is happening after MinHealthyPeriod expired, e.g. four
days after\n\tm1-1 has been created as a remediation of M1,
the problem on M1-1 is considered unrelated to\n\tthe original
issue happened to M1.\n\nIf not set, this value is defaulted
to 1h."
type: string
retryPeriod:
description: |-
retryPeriod is the duration that RKE2ControlPlane should wait before remediating a machine being created as a replacement
for an unhealthy machine (a retry).
If not set, a retry will happen immediately.
type: string
type: object
replicas:
description: Replicas is the number of replicas for the Control Plane.
format: int32
@ -2525,6 +2573,30 @@ spec:
description: Initialized indicates the target cluster has completed
initialization.
type: boolean
lastRemediation:
description: lastRemediation stores info about last remediation performed.
properties:
machine:
description: machine is the machine name of the latest machine
being remediated.
maxLength: 253
minLength: 1
type: string
retryCount:
description: |-
retryCount used to keep track of remediation retry for the last remediated machine.
A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
type: integer
timestamp:
description: timestamp is when last remediation happened. It is
represented in RFC3339 form and is in UTC.
format: date-time
type: string
required:
- machine
- retryCount
- timestamp
type: object
observedGeneration:
description: ObservedGeneration is the latest generation observed
by the controller.

View File

@ -795,6 +795,57 @@ spec:
- control-plane-endpoint
- ""
type: string
remediationStrategy:
description: remediationStrategy is the RemediationStrategy
that controls how control plane machine remediation happens.
properties:
maxRetry:
description: "maxRetry is the Max number of retries while
attempting to remediate an unhealthy machine.\nA retry
happens when a machine that was created as a replacement
for an unhealthy machine also fails.\nFor example, given
a control plane with three machines M1, M2, M3:\n\n\tM1
become unhealthy; remediation happens, and M1-1 is created
as a replacement.\n\tIf M1-1 (replacement of M1) has
problems while bootstrapping it will become unhealthy,
and then be\n\tremediated; such operation is considered
a retry, remediation-retry #1.\n\tIf M1-2 (replacement
of M1-1) becomes unhealthy, remediation-retry #2 will
happen, etc.\n\nA retry could happen only after RetryPeriod
from the previous retry.\nIf a machine is marked as
unhealthy after MinHealthyPeriod from the previous remediation
expired,\nthis is not considered a retry anymore because
the new issue is assumed unrelated from the previous
one.\n\nIf not set, the remediation will be retried infinitely."
format: int32
type: integer
minHealthyPeriod:
description: "minHealthyPeriod defines the duration after
which RKE2ControlPlane will consider any failure to
a machine unrelated\nfrom the previous one. In this
case the remediation is not considered a retry anymore,
and thus the retry\ncounter restarts from 0. For example,
assuming MinHealthyPeriod is set to 1h (default)\n\n\tM1
become unhealthy; remediation happens, and M1-1 is created
as a replacement.\n\tIf M1-1 (replacement of M1) has
problems within the 1hr after the creation, also\n\tthis
machine will be remediated and this operation is considered
a retry - a problem related\n\tto the original issue
happened to M1 -.\n\n\tIf instead the problem on M1-1
is happening after MinHealthyPeriod expired, e.g. four
days after\n\tm1-1 has been created as a remediation
of M1, the problem on M1-1 is considered unrelated to\n\tthe
original issue happened to M1.\n\nIf not set, this value
is defaulted to 1h."
type: string
retryPeriod:
description: |-
retryPeriod is the duration that RKE2ControlPlane should wait before remediating a machine being created as a replacement
for an unhealthy machine (a retry).
If not set, a retry will happen immediately.
type: string
type: object
replicas:
description: Replicas is the number of replicas for the Control
Plane.
@ -1391,6 +1442,30 @@ spec:
description: Initialized indicates the target cluster has completed
initialization.
type: boolean
lastRemediation:
description: lastRemediation stores info about last remediation performed.
properties:
machine:
description: machine is the machine name of the latest machine
being remediated.
maxLength: 253
minLength: 1
type: string
retryCount:
description: |-
retryCount used to keep track of remediation retry for the last remediated machine.
A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
type: integer
timestamp:
description: timestamp is when last remediation happened. It is
represented in RFC3339 form and is in UTC.
format: date-time
type: string
required:
- machine
- retryCount
- timestamp
type: object
observedGeneration:
description: ObservedGeneration is the latest generation observed
by the controller.

View File

@ -0,0 +1,708 @@
/*
Copyright 2022 SUSE.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
"encoding/json"
"fmt"
"sort"
"time"
"github.com/go-logr/logr"
"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
controlplanev1 "github.com/rancher/cluster-api-provider-rke2/controlplane/api/v1beta1"
"github.com/rancher/cluster-api-provider-rke2/pkg/rke2"
)
// reconcileUnhealthyMachines tries to remediate RKE2ControlPlane unhealthy machines
// based on the process described in
// https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
//
// Adapted from kubeadm.
//
//nolint:lll
func (r *RKE2ControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *rke2.ControlPlane) (ret ctrl.Result, retErr error) { // nolint:gocyclo
log := ctrl.LoggerFrom(ctx)
reconciliationTime := time.Now().UTC()
// Clean up pending remediation actions not completed for any reason (e.g. the number of current replicas is less than or equal to 1)
// if the underlying machine is now back to healthy / not deleting.
errList := []error{}
for _, m := range controlPlane.Machines {
if !m.DeletionTimestamp.IsZero() {
continue
}
shouldCleanup := conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition)
if !(shouldCleanup) {
continue
}
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
errList = append(errList, err)
continue
}
conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)
if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineOwnerRemediatedCondition,
}}); err != nil {
errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", m.Name))
}
}
if len(errList) > 0 {
return ctrl.Result{}, kerrors.NewAggregate(errList)
}
// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` is false, indicating that this controller is responsible for performing remediation.
machinesToBeRemediated := controlPlane.MachinesToBeRemediatedByRCP()
// If there are no machines to be remediated, return so RKE2ControlPlane can proceed with other operations (ctrl.Result nil).
if len(machinesToBeRemediated) == 0 {
return ctrl.Result{}, nil
}
// Select the machine to be remediated, which is the oldest machine to be remediated not yet provisioned (if any)
// or the oldest machine to be remediated.
//
// NOTE: The current solution is considered acceptable for the most frequent use case (only one machine to be remediated),
// however, in the future this could potentially be improved for the scenario where more than one machine to be remediated exists
// by considering which machine has lower impact on etcd quorum.
machineToBeRemediated := getMachineToBeRemediated(machinesToBeRemediated, controlPlane.IsEtcdManaged())
if machineToBeRemediated == nil {
return ctrl.Result{}, errors.New("failed to find a Machine to remediate within unhealthy Machines")
}
// Returns if the machine is in the process of being deleted.
if !machineToBeRemediated.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}
log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.RCP.Status.Initialized)
// Returns if another remediation is in progress but the new Machine is not yet created.
// Note: This condition is checked after we check for machines to be remediated and if machineToBeRemediated
// is not being deleted to avoid unnecessary logs if no further remediation should be done.
if v, ok := controlPlane.RCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
// Check if the annotation is stale; this might happen in case there is a crash in the controller in between
// when a new Machine is created and the annotation is eventually removed from RKE2ControlPlane via defer patch at the end
// of RKE2ControlPlane reconcile.
remediationData, err := RemediationDataFromAnnotation(v)
if err != nil {
return ctrl.Result{}, err
}
staleAnnotation := false
for _, m := range controlPlane.Machines.UnsortedList() {
if m.CreationTimestamp.After(remediationData.Timestamp.Time) {
// Remove the annotation tracking that a remediation is in progress (the annotation is stale).
delete(controlPlane.RCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
staleAnnotation = true
break
}
}
if !staleAnnotation {
log.Info("Another remediation is already in progress. Skipping remediation.")
return ctrl.Result{}, nil
}
}
patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
if err != nil {
return ctrl.Result{}, err
}
defer func() {
// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineOwnerRemediatedCondition,
}}); err != nil {
log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name)
if retErr == nil {
retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
}
}
}()
// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.
// Check if RKE2ControlPlane is allowed to remediate considering retry limits:
// - Remediation cannot happen because retryPeriod is not yet expired.
// - RKE2ControlPlane already reached MaxRetries limit.
remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime)
if err != nil {
return ctrl.Result{}, err
}
if !canRemediate {
// NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits.
return ctrl.Result{}, nil
}
// Executes checks that apply only if the control plane is already initialized; in this case RKE2ControlPlane can
// remediate only if it can safely assume that the operation preserves the operation state of the
// existing cluster (or at least it doesn't make it worse).
if controlPlane.RCP.Status.Initialized {
// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
if controlPlane.Machines.Len() <= 1 {
log.Info(
"A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation",
"replicas", controlPlane.Machines.Len(),
)
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.WaitingForRemediationReason,
clusterv1.ConditionSeverityWarning,
"RKE2ControlPlane can't remediate if current replicas are less or equal to 1",
)
return ctrl.Result{}, nil
}
// The cluster MUST NOT have healthy machines still being provisioned.
// This rule prevents RKE2ControlPlane taking actions while the cluster is in a transitional state.
if controlPlane.HasHealthyMachineStillProvisioning() {
log.Info("A control plane machine needs remediation, but there are other control-plane machines being provisioned. Skipping remediation")
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.WaitingForRemediationReason,
clusterv1.ConditionSeverityWarning,
"RKE2ControlPlane waiting for control plane machine provisioning to complete before triggering remediation",
)
return ctrl.Result{}, nil
}
// The cluster MUST have no machines with a deletion timestamp. This rule prevents RKE2ControlPlane taking actions while the cluster is in a transitional state.
if controlPlane.HasDeletingMachine() {
log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.WaitingForRemediationReason,
clusterv1.ConditionSeverityWarning,
"RKE2ControlPlane waiting for control plane machine deletion to complete before triggering remediation",
)
return ctrl.Result{}, nil
}
// Remediation MUST preserve etcd quorum. This rule ensures that RKE2ControlPlane will not remove a member that would result in etcd
// losing a majority of members and thus become unable to field new requests.
if controlPlane.IsEtcdManaged() {
canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
if err != nil {
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.RemediationFailedReason,
clusterv1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, err
}
if !canSafelyRemediate {
log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.WaitingForRemediationReason,
clusterv1.ConditionSeverityWarning,
"RKE2ControlPlane can't remediate this machine because this could result in etcd loosing quorum",
)
return ctrl.Result{}, nil
}
}
// Start remediating the unhealthy control plane machine by deleting it.
// A new machine will come up completing the operation as part of the regular reconcile.
// If the control plane is initialized, before deleting the machine:
// - if the machine hosts the etcd leader, forward etcd leadership to another machine.
// - delete the etcd member hosted on the machine being deleted.
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
if err != nil {
log.Error(err, "Failed to create client to workload cluster")
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
}
// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
// NOTE: etcd member removal will be performed by the rke2-cleanup hook after machine completes drain & all volumes are detached.
if controlPlane.IsEtcdManaged() {
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
if etcdLeaderCandidate == nil {
log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.RemediationFailedReason,
clusterv1.ConditionSeverityWarning,
"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation",
)
return ctrl.Result{}, nil
}
if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.RemediationFailedReason,
clusterv1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, err
}
}
}
// Delete the machine
if err := r.Delete(ctx, machineToBeRemediated); err != nil {
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.RemediationFailedReason,
clusterv1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
}
// Surface the operation is in progress.
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
log.Info("Remediating unhealthy machine")
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.RemediationInProgressReason,
clusterv1.ConditionSeverityWarning,
"",
)
// Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation.
remediationInProgressValue, err := remediationInProgressData.Marshal()
if err != nil {
return ctrl.Result{}, err
}
// Set annotations tracking remediation details so they can be picked up by the machine
// that will be created as part of the scale up action that completes the remediation.
annotations.AddAnnotations(controlPlane.RCP, map[string]string{
controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue,
})
return ctrl.Result{Requeue: true}, nil
}
// Gets the machine to be remediated, which is the "most broken" among the unhealthy machines, determined as the machine
// having the highest priority issue that other machines have not.
// The following issues are considered (from highest to lowest priority):
// - machine with RemediateMachineAnnotation annotation
// - machine without .status.nodeRef
// - machine with etcd issue or etcd status unknown (etcd member, etcd pod)
// - machine with control plane component issue or status unknown (API server, controller manager, scheduler)
//
// Note: In case of more than one faulty machine the chance to recover mostly depends on the control plane being able to
// successfully create a replacement Machine, because due to scale up preflight checks, this cannot happen if there are
// still issues on the control plane after the first remediation.
// This func tries to maximize those chances of a successful remediation by picking for remediation the "most broken" machine first.
func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdManaged bool) *clusterv1.Machine {
if unhealthyMachines.Len() == 0 {
return nil
}
machinesToBeRemediated := unhealthyMachines.UnsortedList()
if len(machinesToBeRemediated) == 1 {
return machinesToBeRemediated[0]
}
sort.Slice(machinesToBeRemediated, func(i, j int) bool {
return pickMachineToBeRemediated(machinesToBeRemediated[i], machinesToBeRemediated[j], isEtcdManaged)
})
return machinesToBeRemediated[0]
}
// pickMachineToBeRemediated returns true if machine i should be remediated before machine j.
func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool {
// If one machine has the RemediateMachineAnnotation annotation, remediate first.
if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) {
return true
}
if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) {
return false
}
// if one machine does not have a node ref, we assume that provisioning failed and there are no CP components at all,
// so remediate it first; also, without a node it is not possible to get further info about its status.
if i.Status.NodeRef == nil && j.Status.NodeRef != nil {
return true
}
if i.Status.NodeRef != nil && j.Status.NodeRef == nil {
return false
}
// if one machine has unhealthy etcd member or pod, remediate first.
if isEtcdManaged {
if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineEtcdMemberHealthyCondition); p != nil {
return *p
}
}
// if one machine has unhealthy agent, remediate first.
if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineAgentHealthyCondition); p != nil {
return *p
}
// Use oldest (and Name) as a tie-breaker criteria.
if i.CreationTimestamp.Equal(&j.CreationTimestamp) {
return i.Name < j.Name
}
return i.CreationTimestamp.Before(&j.CreationTimestamp)
}
// pickMachineToBeRemediatedByConditionState returns true if condition t reports an issue on machine i and not on machine j,
// false if the opposite applies, or nil if condition t doesn't provide a discriminating criterion for picking one machine over the other for remediation.
func pickMachineToBeRemediatedByConditionState(i, j *clusterv1.Machine, t clusterv1.ConditionType) *bool {
iCondition := conditions.IsTrue(i, t)
jCondition := conditions.IsTrue(j, t)
if !iCondition && jCondition {
return ptr.To(true)
}
if iCondition && !jCondition {
return ptr.To(false)
}
return nil
}
// checkRetryLimits checks if RKE2ControlPlane is allowed to remediate considering retry limits:
// - Remediation cannot happen because retryPeriod is not yet expired.
// - RKE2ControlPlane already reached the maximum number of retries for a machine.
// NOTE: Counting the number of retries is required in order to prevent infinite remediation, e.g. in case the
// first Control Plane machine is failing due to a quota issue.
func (r *RKE2ControlPlaneReconciler) checkRetryLimits(
log logr.Logger,
machineToBeRemediated *clusterv1.Machine,
controlPlane *rke2.ControlPlane,
reconciliationTime time.Time,
) (*RemediationData, bool, error) {
// Get last remediation info from the machine.
var lastRemediationData *RemediationData
if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok {
l, err := RemediationDataFromAnnotation(value)
if err != nil {
return nil, false, err
}
lastRemediationData = l
}
remediationInProgressData := &RemediationData{
Machine: machineToBeRemediated.Name,
Timestamp: metav1.Time{Time: reconciliationTime},
RetryCount: 0,
}
// If there is no last remediation, this is the first try of a new retry sequence.
if lastRemediationData == nil {
return remediationInProgressData, true, nil
}
// Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults.
minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod
if controlPlane.RCP.Spec.RemediationStrategy != nil && controlPlane.RCP.Spec.RemediationStrategy.MinHealthyPeriod != nil {
minHealthyPeriod = controlPlane.RCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration
}
retryPeriod := time.Duration(0)
if controlPlane.RCP.Spec.RemediationStrategy != nil {
retryPeriod = controlPlane.RCP.Spec.RemediationStrategy.RetryPeriod.Duration
}
// Gets the timestamp of the last remediation; if missing, default to a value
// that ensures both MinHealthyPeriod and RetryPeriod are expired.
// NOTE: this could potentially lead to executing more retries than expected, or to executing retries earlier than
// expected, but this is considered acceptable when the system recovers from someone/something changing or deleting
// the RemediationForAnnotation on Machines.
lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod))
if !lastRemediationData.Timestamp.IsZero() {
lastRemediationTime = lastRemediationData.Timestamp.Time
}
// Once we get here we already know that there was a last remediation for the Machine.
// If the current remediation is happening before minHealthyPeriod is expired, then RKE2ControlPlane considers this
// as a remediation for the same previously unhealthy machine.
// NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp),
// this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case.
var retryForSameMachineInProgress bool
if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) {
retryForSameMachineInProgress = true
log = log.WithValues("remediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine))
}
// If the retry for the same machine is not in progress, this is the first try of a new retry sequence.
if !retryForSameMachineInProgress {
return remediationInProgressData, true, nil
}
// If the remediation is for the same machine, carry over the retry count.
remediationInProgressData.RetryCount = lastRemediationData.RetryCount
// Check if remediation can happen because retryPeriod has passed.
if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) {
log.Info(
fmt.Sprintf(
"A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation",
retryPeriod,
),
)
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.WaitingForRemediationReason,
clusterv1.ConditionSeverityWarning,
fmt.Sprintf("RKE2ControlPlane can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod),
)
return remediationInProgressData, false, nil
}
// Check if remediation can happen because maxRetry has not been reached yet, if it is defined.
if controlPlane.RCP.Spec.RemediationStrategy != nil && controlPlane.RCP.Spec.RemediationStrategy.MaxRetry != nil {
maxRetry := int(*controlPlane.RCP.Spec.RemediationStrategy.MaxRetry)
if remediationInProgressData.RetryCount >= maxRetry {
log.Info(
fmt.Sprintf(
"A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation",
remediationInProgressData.RetryCount,
maxRetry,
),
)
conditions.MarkFalse(
machineToBeRemediated,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.WaitingForRemediationReason,
clusterv1.ConditionSeverityWarning,
"RKE2ControlPlane can't remediate this machine because the operation already failed %d times (MaxRetry)",
maxRetry,
)
return remediationInProgressData, false, nil
}
}
// All the checks passed; increase the remediation retry count.
remediationInProgressData.RetryCount++
return remediationInProgressData, true, nil
}
// canSafelyRemoveEtcdMember assesses whether it is possible to remove the member hosted on the machine to be remediated
// without losing etcd quorum.
//
// The answer mostly depends on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance):
// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
// cluster size after deletion is 2, fault tolerance 0)
// - 5 CP cluster tolerates 1 additional failing members on top of the one being deleted (the target
// cluster size after deletion is 4, fault tolerance 1)
// - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
// cluster size after deletion is 6, fault tolerance 2)
// - etc.
//
// NOTE: this func assumes the list of members is in sync with the list of machines/nodes; it is required to call reconcileEtcdMembers
// as well as reconcileControlPlaneAndMachinesConditions before this.
func (r *RKE2ControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context,
controlPlane *rke2.ControlPlane,
machineToBeRemediated *clusterv1.Machine,
) (bool, error) {
log := ctrl.LoggerFrom(ctx)
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
if err != nil {
return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
}
// Gets the etcd status
// This makes it possible to have a set of etcd member statuses different from the MHC healthy/unhealthy conditions.
etcdMembers, err := workloadCluster.EtcdMembers(ctx)
if err != nil {
return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
}
currentTotalMembers := len(etcdMembers)
log.Info("etcd cluster before remediation",
"currentTotalMembers", currentTotalMembers,
"currentMembers", etcdMembers)
// Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated.
targetTotalMembers := 0
targetUnhealthyMembers := 0
healthyMembers := []string{}
unhealthyMembers := []string{}
for _, etcdMember := range etcdMembers {
// Skip the machine to be deleted because it won't be part of the target etcd cluster.
if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == etcdMember {
continue
}
// Include the member in the target etcd cluster.
targetTotalMembers++
// Search for the machine corresponding to the etcd member.
var machine *clusterv1.Machine
for _, m := range controlPlane.Machines {
if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
machine = m
break
}
}
// If an etcd member does not have a corresponding machine it is not possible to retrieve etcd member health,
// so RKE2ControlPlane is assuming the worst scenario and considering the member unhealthy.
//
// NOTE: This should not happen given that RKE2ControlPlane is running reconcileEtcdMembers before calling this method.
if machine == nil {
log.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "memberName", etcdMember)
targetUnhealthyMembers++
unhealthyMembers = append(unhealthyMembers, etcdMember+" (no machine)")
continue
}
// Check member health as reported by machine's health conditions
if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
targetUnhealthyMembers++
unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
continue
}
healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
}
// See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation.
targetQuorum := (targetTotalMembers / 2.0) + 1 //nolint:mnd
canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum
log.Info("etcd cluster projected after remediation of "+machineToBeRemediated.Name,
"healthyMembers", healthyMembers,
"unhealthyMembers", unhealthyMembers,
"targetTotalMembers", targetTotalMembers,
"targetQuorum", targetQuorum,
"targetUnhealthyMembers", targetUnhealthyMembers,
"canSafelyRemediate", canSafelyRemediate)
return canSafelyRemediate, nil
}
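As a worked example of the quorum projection above, consider remediating one member of a 3-member etcd cluster with no other failing members (numbers assumed):

targetTotalMembers := 2                      // 3 members minus the one being remediated
targetUnhealthyMembers := 0                  // no additional failing members
targetQuorum := (targetTotalMembers / 2) + 1 // integer division: 1 + 1 = 2
// 2 remaining healthy members >= quorum of 2, so remediation is considered safe
canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum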
// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in RKE2ControlPlane
// during remediation and then into the RemediationForAnnotation on the replacement machine once it is created.
type RemediationData struct {
// machine is the machine name of the latest machine being remediated.
Machine string `json:"machine"`
// timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
Timestamp metav1.Time `json:"timestamp"`
// retryCount used to keep track of remediation retry for the last remediated machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
RetryCount int `json:"retryCount"`
}
// RemediationDataFromAnnotation gets RemediationData from an annotation value.
func RemediationDataFromAnnotation(value string) (*RemediationData, error) {
ret := &RemediationData{}
if err := json.Unmarshal([]byte(value), ret); err != nil {
return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason)
}
return ret, nil
}
// Marshal a RemediationData into an annotation value.
func (r *RemediationData) Marshal() (string, error) {
b, err := json.Marshal(r)
if err != nil {
return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason)
}
return string(b), nil
}
// ToStatus converts a RemediationData into a LastRemediationStatus struct.
func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus {
return &controlplanev1.LastRemediationStatus{
Machine: r.Machine,
Timestamp: r.Timestamp,
RetryCount: r.RetryCount,
}
}
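A brief sketch of the annotation round trip these helpers implement; the machine name is assumed, and rcp stands for a *RKE2ControlPlane as used by updateStatus:

data := &RemediationData{Machine: "my-cluster-cp-abc12", Timestamp: metav1.Now(), RetryCount: 1}
value, _ := data.Marshal()                        // e.g. {"machine":"my-cluster-cp-abc12","timestamp":"...","retryCount":1}
parsed, _ := RemediationDataFromAnnotation(value) // recovers the same RemediationData from the annotation
rcp.Status.LastRemediation = parsed.ToStatus()    // how updateStatus surfaces it in RKE2ControlPlane status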

View File

@ -308,6 +308,7 @@ func (r *RKE2ControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr c
return nil
}
// nolint:gocyclo
func (r *RKE2ControlPlaneReconciler) updateStatus(ctx context.Context, rcp *controlplanev1.RKE2ControlPlane, cluster *clusterv1.Cluster) error {
logger := log.FromContext(ctx)
@ -348,7 +349,7 @@ func (r *RKE2ControlPlaneReconciler) updateStatus(ctx context.Context, rcp *cont
logger.V(3).Info("Ready Machine : " + readyMachine.Name)
}
controlPlane, err := rke2.NewControlPlane(ctx, r.Client, cluster, rcp, ownedMachines)
controlPlane, err := rke2.NewControlPlane(ctx, r.managementCluster, r.Client, cluster, rcp, ownedMachines)
if err != nil {
logger.Error(err, "failed to initialize control plane")
@ -423,7 +424,7 @@ func (r *RKE2ControlPlaneReconciler) updateStatus(ctx context.Context, rcp *cont
rcp.Status.ReadyReplicas = rke2util.SafeInt32(len(readyMachines))
rcp.Status.UnavailableReplicas = replicas - rcp.Status.ReadyReplicas
workloadCluster, err := r.getWorkloadCluster(ctx, util.ObjectKey(cluster))
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
if err != nil {
logger.Error(err, "Failed to get remote client for workload cluster", "cluster key", util.ObjectKey(cluster))
@ -480,6 +481,37 @@ func (r *RKE2ControlPlaneReconciler) updateStatus(ctx context.Context, rcp *cont
controlPlane.RCP.Status.Version = lowestVersion
}
// Surface lastRemediation data in status.
// LastRemediation is the remediation currently in progress, if any, or the
// most recent of the remediations we keep track of on machines.
var lastRemediation *RemediationData
if v, ok := controlPlane.RCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
remediationData, err := RemediationDataFromAnnotation(v)
if err != nil {
return err
}
lastRemediation = remediationData
} else {
for _, m := range controlPlane.Machines.UnsortedList() {
if v, ok := m.Annotations[controlplanev1.RemediationForAnnotation]; ok {
remediationData, err := RemediationDataFromAnnotation(v)
if err != nil {
return err
}
if lastRemediation == nil || lastRemediation.Timestamp.Time.Before(remediationData.Timestamp.Time) {
lastRemediation = remediationData
}
}
}
}
if lastRemediation != nil {
controlPlane.RCP.Status.LastRemediation = lastRemediation.ToStatus()
}
logger.Info("Successfully updated RKE2ControlPlane status", "namespace", rcp.Namespace, "name", rcp.Name)
return nil
@ -554,7 +586,7 @@ func (r *RKE2ControlPlaneReconciler) reconcileNormal(
return ctrl.Result{}, nil
}
controlPlane, err := rke2.NewControlPlane(ctx, r.Client, cluster, rcp, ownedMachines)
controlPlane, err := rke2.NewControlPlane(ctx, r.managementCluster, r.Client, cluster, rcp, ownedMachines)
if err != nil {
logger.Error(err, "failed to initialize control plane")
@ -590,6 +622,12 @@ func (r *RKE2ControlPlaneReconciler) reconcileNormal(
return result, err
}
// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
// otherwise continue with the other RCP operations.
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
return result, err
}
// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
needRollout := controlPlane.MachinesNeedingRollout()
@ -745,7 +783,7 @@ func (r *RKE2ControlPlaneReconciler) reconcileDelete(ctx context.Context,
return ctrl.Result{}, nil
}
controlPlane, err := rke2.NewControlPlane(ctx, r.Client, cluster, rcp, ownedMachines)
controlPlane, err := rke2.NewControlPlane(ctx, r.managementCluster, r.Client, cluster, rcp, ownedMachines)
if err != nil {
logger.Error(err, "failed to initialize control plane")
@ -915,7 +953,7 @@ func (r *RKE2ControlPlaneReconciler) reconcileControlPlaneConditions(
return ctrl.Result{}, nil
}
workloadCluster, err := r.getWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
if err != nil {
logger.Error(err, "Failed to get remote client for workload cluster", "cluster key", util.ObjectKey(controlPlane.Cluster))
@ -967,7 +1005,7 @@ func (r *RKE2ControlPlaneReconciler) upgradeControlPlane(
return ctrl.Result{}, nil
}
workloadCluster, err := r.getWorkloadCluster(ctx, util.ObjectKey(cluster))
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
if err != nil {
logger.Error(err, "Failed to get remote client for workload cluster", "cluster key", util.ObjectKey(cluster))
@ -1129,17 +1167,6 @@ func machineHasOtherPreTerminateHooks(machine *clusterv1.Machine) bool {
return false
}
// getWorkloadCluster gets a cluster object.
// The cluster comes with an etcd client generator to connect to any etcd pod living on a managed machine.
func (r *RKE2ControlPlaneReconciler) getWorkloadCluster(ctx context.Context, clusterKey types.NamespacedName) (rke2.WorkloadCluster, error) {
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, clusterKey)
if err != nil {
return nil, fmt.Errorf("getting remote client for workload cluster: %w", err)
}
return workloadCluster, nil
}
// syncMachines updates Machines, InfrastructureMachines and Rke2Configs to propagate in-place mutable fields from RKE2ControlPlane.
// Note: For InfrastructureMachines and Rke2Configs it also drops ownership of "metadata.labels" and
// "metadata.annotations" from "manager" so that "rke2controlplane" can own these fields and can work with SSA.

View File

@ -209,7 +209,12 @@ var _ = Describe("Reconcile control plane conditions", func() {
},
}
cp, err = rke2.NewControlPlane(ctx, testEnv.GetClient(), cluster, rcp, collections.FromMachineList(&ml))
m := &rke2.Management{
Client: testEnv,
SecretCachingClient: testEnv,
}
cp, err = rke2.NewControlPlane(ctx, m, testEnv.GetClient(), cluster, rcp, collections.FromMachineList(&ml))
Expect(err).ToNot(HaveOccurred())
ref := metav1.OwnerReference{

View File

@ -25,6 +25,18 @@ import (
"github.com/rancher/cluster-api-provider-rke2/pkg/etcd"
)
// NodeNameFromMember returns the node name from the etcd member name.
func NodeNameFromMember(member *etcd.Member) string {
memberName := member.Name
lastIndex := strings.LastIndex(memberName, "-")
if lastIndex != -1 {
memberName = memberName[:lastIndex]
}
return memberName
}
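For example (member name assumed), the helper expects RKE2 etcd members to be named "<node-name>-<suffix>", so:

name := NodeNameFromMember(&etcd.Member{Name: "my-node-a1b2c3d4"}) // "my-node", matching the Machine's .status.nodeRef.name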
// MemberForName returns the etcd member with the matching name.
func MemberForName(members []*etcd.Member, name string) *etcd.Member {
for _, m := range members {

View File

@ -55,11 +55,15 @@ type ControlPlane struct {
Rke2Configs map[string]*bootstrapv1.RKE2Config
InfraResources map[string]*unstructured.Unstructured
managementCluster ManagementCluster
workloadCluster WorkloadCluster
}
// NewControlPlane returns an instantiated ControlPlane.
func NewControlPlane(
ctx context.Context,
managementCluster ManagementCluster,
client client.Client,
cluster *clusterv1.Cluster,
rcp *controlplanev1.RKE2ControlPlane,
@ -98,6 +102,7 @@ func NewControlPlane(
Rke2Configs: rke2Configs,
InfraResources: infraObjects,
reconciliationTime: metav1.Now(),
managementCluster: managementCluster,
}, nil
}
@ -366,6 +371,16 @@ func GetRKE2Configs(ctx context.Context, cl client.Client, machines collections.
return result, nil
}
// IsEtcdManaged returns true if the control plane relies on a managed etcd.
func (c *ControlPlane) IsEtcdManaged() bool {
return true
}
// MachinesToBeRemediatedByRCP returns the list of control plane machines to be remediated by RCP.
func (c *ControlPlane) MachinesToBeRemediatedByRCP() collections.Machines {
return c.Machines.Filter(collections.IsUnhealthyAndOwnerRemediated)
}
// UnhealthyMachines returns the list of control plane machines marked as unhealthy by MHC.
func (c *ControlPlane) UnhealthyMachines() collections.Machines {
return c.Machines.Filter(collections.IsUnhealthy)
@ -413,6 +428,28 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error {
return kerrors.NewAggregate(errList)
}
// HasHealthyMachineStillProvisioning returns true if any healthy machine in the control plane is still in the process of being provisioned.
func (c *ControlPlane) HasHealthyMachineStillProvisioning() bool {
return len(c.HealthyMachines().Filter(collections.Not(collections.HasNode()))) > 0
}
// GetWorkloadCluster builds a cluster object.
// The cluster comes with an etcd client generator to connect to any etcd pod living on a managed machine.
func (c *ControlPlane) GetWorkloadCluster(ctx context.Context) (WorkloadCluster, error) {
if c.workloadCluster != nil {
return c.workloadCluster, nil
}
workloadCluster, err := c.managementCluster.GetWorkloadCluster(ctx, client.ObjectKeyFromObject(c.Cluster))
if err != nil {
return nil, err
}
c.workloadCluster = workloadCluster
return c.workloadCluster, nil
}
// machinesByDeletionTimestamp sorts a list of Machines by deletion timestamp, using their names as a tie breaker.
// Machines without DeletionTimestamp go after machines with this field set.
type machinesByDeletionTimestamp []*clusterv1.Machine

View File

@ -256,7 +256,8 @@ func (w *Workload) EtcdMembers(ctx context.Context) ([]string, error) {
names := []string{}
for _, member := range members {
names = append(names, member.Name)
// Convert etcd member to node name
names = append(names, etcdutil.NodeNameFromMember(member))
}
return names, nil

View File

@ -137,7 +137,7 @@ var _ = Describe("Node metadata propagation", func() {
w, err := m.NewWorkload(ctx, testEnv.GetClient(), testEnv.GetConfig(), types.NamespacedName{})
Expect(err).ToNot(HaveOccurred())
cp, err := NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err := NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(err).ToNot(HaveOccurred())
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())
@ -167,7 +167,7 @@ var _ = Describe("Node metadata propagation", func() {
w, err := m.NewWorkload(ctx, testEnv.GetClient(), testEnv.GetConfig(), types.NamespacedName{})
Expect(err).ToNot(HaveOccurred())
cp, err := NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err := NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(err).ToNot(HaveOccurred())
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())
@ -197,7 +197,7 @@ var _ = Describe("Node metadata propagation", func() {
w, err := m.NewWorkload(ctx, testEnv.GetClient(), testEnv.GetConfig(), types.NamespacedName{})
Expect(err).ToNot(HaveOccurred())
cp, err := NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err := NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(err).ToNot(HaveOccurred())
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())
@ -244,7 +244,7 @@ var _ = Describe("Node metadata propagation", func() {
w, err := m.NewWorkload(ctx, testEnv.GetClient(), testEnv.GetConfig(), types.NamespacedName{})
Expect(err).ToNot(HaveOccurred())
cp, err := NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err := NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(err).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())
@ -290,7 +290,7 @@ var _ = Describe("Node metadata propagation", func() {
w, err := m.NewWorkload(ctx, testEnv.GetClient(), testEnv.GetConfig(), types.NamespacedName{})
Expect(err).ToNot(HaveOccurred())
cp, err := NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err := NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(err).ToNot(HaveOccurred())
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())
@ -340,7 +340,7 @@ var _ = Describe("Node metadata propagation", func() {
w, err := m.NewWorkload(ctx, testEnv.GetClient(), testEnv.GetConfig(), types.NamespacedName{})
Expect(err).ToNot(HaveOccurred())
cp, err := NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err := NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(err).ToNot(HaveOccurred())
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())
@ -360,7 +360,7 @@ var _ = Describe("Node metadata propagation", func() {
machines = collections.FromMachineList(&clusterv1.MachineList{Items: []clusterv1.Machine{
*machineDifferentNode,
}})
cp, err = NewControlPlane(ctx, testEnv.GetClient(), nil, nil, machines)
cp, err = NewControlPlane(ctx, m, testEnv.GetClient(), nil, nil, machines)
Expect(err).ToNot(HaveOccurred())
Expect(w.InitWorkload(ctx, cp)).ToNot(HaveOccurred())
Expect(w.UpdateNodeMetadata(ctx, cp)).ToNot(HaveOccurred())

View File

@ -61,6 +61,23 @@ providers:
new: "--leader-elect=false"
- old: --metrics-addr=127.0.0.1:8080
new: --metrics-addr=:8080
- name: docker-rcp-remediation
type: InfrastructureProvider
versions:
- name: "v1.9.5"
value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.9.5/infrastructure-components-development.yaml"
type: "url"
contract: v1beta1
files:
- sourcePath: "../data/infrastructure/cluster-template-kcp-remediation.yaml"
- sourcePath: "../data/shared/v1beta1/metadata.yaml"
replacements:
- old: "imagePullPolicy: Always"
new: "imagePullPolicy: IfNotPresent"
- old: "--leader-elect"
new: "--leader-elect=false"
- old: --metrics-addr=127.0.0.1:8080
new: --metrics-addr=:8080
- name: rke2-control-plane
type: ControlPlaneProvider
versions:
@ -140,3 +157,6 @@ intervals:
default/wait-deployment: ["5m", "10s"]
default/wait-job: ["5m", "10s"]
default/wait-service: ["3m", "10s"]
kcp-remediation/wait-machines: ["5m", "10s"]
kcp-remediation/check-machines-stable: ["30s", "5s"]
kcp-remediation/wait-machine-provisioned: ["5m", "10s"]
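These new kcp-remediation/* entries are keyed by spec name and interval name; a sketch of how such intervals are typically looked up through the clusterctl test framework (assuming the standard E2EConfig API used by the upstream remediation spec):

import "sigs.k8s.io/cluster-api/test/framework/clusterctl"

// Illustration: the framework resolves "<specName>/<key>" from the intervals map,
// e.g. "kcp-remediation/wait-machines" -> ["5m", "10s"] (timeout, polling interval).
func waitMachinesIntervals(cfg *clusterctl.E2EConfig) []interface{} {
	return cfg.GetIntervals("kcp-remediation", "wait-machines")
}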

View File

@ -0,0 +1,227 @@
apiVersion: v1
data:
value: |-
# generated by kind
global
log /dev/log local0
log /dev/log local1 notice
daemon
# limit memory usage to approximately 18 MB
# (see https://github.com/kubernetes-sigs/kind/pull/3115)
maxconn 100000
resolvers docker
nameserver dns 127.0.0.11:53
defaults
log global
mode tcp
option dontlognull
# TODO: tune these
timeout connect 5000
timeout client 50000
timeout server 50000
        # allow HAProxy to start even if DNS cannot resolve the backends
default-server init-addr none
frontend stats
mode http
bind *:8404
stats enable
stats uri /stats
stats refresh 1s
stats admin if TRUE
frontend control-plane
bind *:{{ .FrontendControlPlanePort }}
{{ if .IPv6 -}}
bind :::{{ .FrontendControlPlanePort }};
{{- end }}
default_backend kube-apiservers
backend kube-apiservers
option httpchk GET /healthz
{{range $server, $backend := .BackendServers}}
server {{ $server }} {{ JoinHostPort $backend.Address $.BackendControlPlanePort }} check check-ssl verify none resolvers docker resolve-prefer {{ if $.IPv6 -}} ipv6 {{- else -}} ipv4 {{- end }}
{{- end}}
frontend rke2-join
bind *:9345
{{ if .IPv6 -}}
bind :::9345;
{{- end }}
default_backend rke2-servers
backend rke2-servers
option httpchk GET /v1-rke2/readyz
http-check expect status 403
{{range $server, $backend := .BackendServers}}
server {{ $server }} {{ $backend.Address }}:9345 check check-ssl verify none
{{- end}}
kind: ConfigMap
metadata:
name: ${CLUSTER_NAME}-lb-config
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: Cluster
metadata:
name: ${CLUSTER_NAME}
spec:
clusterNetwork:
pods:
cidrBlocks:
- 10.45.0.0/16
services:
cidrBlocks:
- 10.46.0.0/16
serviceDomain: cluster.local
controlPlaneRef:
apiVersion: controlplane.cluster.x-k8s.io/v1beta1
kind: RKE2ControlPlane
name: ${CLUSTER_NAME}-control-plane
infrastructureRef:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: DockerCluster
name: ${CLUSTER_NAME}
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: DockerCluster
metadata:
name: ${CLUSTER_NAME}
spec:
loadBalancer:
customHAProxyConfigTemplateRef:
name: ${CLUSTER_NAME}-lb-config
---
apiVersion: controlplane.cluster.x-k8s.io/v1beta1
kind: RKE2ControlPlane
metadata:
name: ${CLUSTER_NAME}-control-plane
spec:
replicas: ${CONTROL_PLANE_MACHINE_COUNT}
version: ${KUBERNETES_VERSION}+rke2r1
preRKE2Commands:
- ./wait-signal.sh "${TOKEN}" "${SERVER}" "${NAMESPACE}"
files:
- path: /wait-signal.sh
content: |
#!/bin/bash
set -o errexit
set -o pipefail
echo "Waiting for signal..."
TOKEN=$1
SERVER=$2
NAMESPACE=$3
while true;
do
sleep 1s
signal=$(curl -k -s --header "Authorization: Bearer $TOKEN" $SERVER/api/v1/namespaces/$NAMESPACE/configmaps/mhc-test | jq -r .data.signal?)
echo "signal $signal"
if [ "$signal" == "pass" ]; then
curl -k -s --header "Authorization: Bearer $TOKEN" -XPATCH -H "Content-Type: application/strategic-merge-patch+json" --data '{"data": {"signal": "ack-pass"}}' $SERVER/api/v1/namespaces/$NAMESPACE/configmaps/mhc-test
exit 0
fi
done
permissions: "0777"
rolloutStrategy:
type: "RollingUpdate"
rollingUpdate:
maxSurge: 1
agentConfig:
nodeAnnotations:
test: "true"
serverConfig:
disableComponents:
pluginComponents:
- rke2-ingress-nginx
kubernetesComponents:
- cloudController
kubeAPIServer:
extraArgs:
- --anonymous-auth=true
infrastructureRef:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: DockerMachineTemplate
name: "${CLUSTER_NAME}-control-plane"
nodeDrainTimeout: 30s
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: DockerMachineTemplate
metadata:
name: "${CLUSTER_NAME}-control-plane"
spec:
template:
spec:
customImage: kindest/node:${KIND_IMAGE_VERSION}
bootstrapTimeout: 15m
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
metadata:
name: ${CLUSTER_NAME}-mhc-0
spec:
clusterName: ${CLUSTER_NAME}
replicas: ${WORKER_MACHINE_COUNT}
selector:
matchLabels:
cluster.x-k8s.io/cluster-name: ${CLUSTER_NAME}
template:
spec:
version: ${KUBERNETES_VERSION}+rke2r1
clusterName: ${CLUSTER_NAME}
bootstrap:
configRef:
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: RKE2ConfigTemplate
name: ${CLUSTER_NAME}-mhc-0
infrastructureRef:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: DockerMachineTemplate
name: ${CLUSTER_NAME}-mhc-0
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: DockerMachineTemplate
metadata:
name: ${CLUSTER_NAME}-mhc-0
spec:
template:
spec:
customImage: kindest/node:${KIND_IMAGE_VERSION}
bootstrapTimeout: 15m
---
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: RKE2ConfigTemplate
metadata:
name: ${CLUSTER_NAME}-mhc-0
spec:
template:
spec:
agentConfig:
nodeAnnotations:
test: "true"
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineHealthCheck
metadata:
name: ${CLUSTER_NAME}-mhc-0
namespace: ${NAMESPACE}
spec:
clusterName: ${CLUSTER_NAME}
maxUnhealthy: 100%
nodeStartupTimeout: 30s
selector:
matchLabels:
cluster.x-k8s.io/control-plane: ""
mhc-test: fail
unhealthyConditions:
- status: "False"
timeout: 10s
type: e2e.remediation.condition
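This MachineHealthCheck only watches control-plane machines carrying the mhc-test: fail label; together with nodeStartupTimeout: 30s and the wait-signal.sh gate in the control plane template above, a machine whose node is held back past the timeout is marked unhealthy and remediated. A minimal sketch of how a test could opt a machine into remediation by adding that label (an illustration using the controller-runtime client, not necessarily how the upstream spec does it):

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// markMachineForRemediation is an illustrative helper: it adds the "mhc-test: fail" label
// so the MachineHealthCheck's selector starts tracking the machine's health.
func markMachineForRemediation(ctx context.Context, c client.Client, machine *clusterv1.Machine) error {
	if machine.Labels == nil {
		machine.Labels = map[string]string{}
	}
	machine.Labels["mhc-test"] = "fail"
	return c.Update(ctx, machine)
}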

View File

@ -0,0 +1,58 @@
//go:build e2e
// +build e2e
/*
Copyright 2023 SUSE.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"os"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"k8s.io/utils/ptr"
capi_e2e "sigs.k8s.io/cluster-api/test/e2e"
)
var _ = Describe("When testing RCP remediation", func() {
var (
specName = "create-workload-cluster"
)
BeforeEach(func() {
Expect(e2eConfig).ToNot(BeNil(), "Invalid argument. e2eConfig can't be nil when calling %s spec", specName)
Expect(clusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. clusterctlConfigPath must be an existing file when calling %s spec", specName)
Expect(bootstrapClusterProxy).ToNot(BeNil(), "Invalid argument. bootstrapClusterProxy can't be nil when calling %s spec", specName)
Expect(os.MkdirAll(artifactFolder, 0755)).To(Succeed(), "Invalid argument. artifactFolder can't be created for %s spec", specName)
Expect(e2eConfig.Variables).To(HaveKey(KubernetesVersion))
By("Initializing the bootstrap cluster")
initBootstrapCluster(bootstrapClusterProxy, e2eConfig, clusterctlConfigPath, artifactFolder)
})
capi_e2e.KCPRemediationSpec(ctx, func() capi_e2e.KCPRemediationSpecInput {
return capi_e2e.KCPRemediationSpecInput{
E2EConfig: e2eConfig,
ClusterctlConfigPath: clusterctlConfigPath,
BootstrapClusterProxy: bootstrapClusterProxy,
ArtifactFolder: artifactFolder,
SkipCleanup: skipCleanup,
InfrastructureProvider: ptr.To("docker-rcp-remediation:v1.9.5"),
}
})
})
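As a usage note, and assuming the repository's existing e2e Make targets and the GINKGO_FOCUS variable shown in the Makefile, this spec alone could be run with something like GINKGO_FOCUS="When testing RCP remediation" make test-e2e; the exact target name is an assumption and may differ in this repo.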