From 8c5a81b0ad8e409c6057890b71443d47f93f31c6 Mon Sep 17 00:00:00 2001 From: "mingzhou.swx" Date: Tue, 6 Sep 2022 16:45:24 +0800 Subject: [PATCH] add toleratedFailedReplicas field Signed-off-by: mingzhou.swx --- api/v1alpha1/batchrelease_plan_types.go | 7 +++++++ api/v1alpha1/rollout_types.go | 7 +++++++ api/v1alpha1/zz_generated.deepcopy.go | 10 ++++++++++ .../rollouts.kruise.io_batchreleases.yaml | 13 ++++++++++++ .../bases/rollouts.kruise.io_rollouts.yaml | 13 ++++++++++++ .../workloads/cloneset_control_plane.go | 8 ++++++-- .../deployment_canary_control_plane.go | 9 ++++++--- .../workloads/statefulset_like_controller.go | 5 ++++- .../batchrelease/inner_batchrelease.go | 20 +++++++++++++++---- 9 files changed, 82 insertions(+), 10 deletions(-) diff --git a/api/v1alpha1/batchrelease_plan_types.go b/api/v1alpha1/batchrelease_plan_types.go index 6eacfe6..0e2ca81 100644 --- a/api/v1alpha1/batchrelease_plan_types.go +++ b/api/v1alpha1/batchrelease_plan_types.go @@ -44,6 +44,13 @@ type ReleasePlan struct { BatchPartition *int32 `json:"batchPartition,omitempty"` // RolloutID indicates an id for each rollout progress RolloutID string `json:"rolloutID,omitempty"` + // ToleratedFailedReplicas allows users to ignore some pods that are unavailable for some reason, + // such as insufficient resources. For example, if 10 pods were updated and + // ToleratedFailedReplicas=2, then as long as 8 or more pods are available, the + // rollout progression can continue. + // If ToleratedFailedReplicas is not set (nil), the `MaxUnavailable` field of the workload + // is reused to tolerate failed replicas by default. 
+ ToleratedFailedReplicas *intstr.IntOrString `json:"toleratedFailedReplicas,omitempty"` } // ReleaseBatch is used to describe how each batch release should be diff --git a/api/v1alpha1/rollout_types.go b/api/v1alpha1/rollout_types.go index d9baafc..0a94ed8 100644 --- a/api/v1alpha1/rollout_types.go +++ b/api/v1alpha1/rollout_types.go @@ -96,6 +96,13 @@ type CanaryStrategy struct { // TrafficRoutings hosts all the supported service meshes supported to enable more fine-grained traffic routing // todo current only support one TrafficRouting TrafficRoutings []*TrafficRouting `json:"trafficRoutings,omitempty"` + // ToleratedFailedReplicas allows users to ignore some pods that are unavailable for some reason, + // such as insufficient resources. For example, if 10 pods were updated and + // ToleratedFailedReplicas=2, then as long as 8 or more pods are available, the + // rollout progression can continue. + // If ToleratedFailedReplicas is not set (nil), the `MaxUnavailable` field of the workload + // is reused to tolerate failed replicas by default. + ToleratedFailedReplicas *intstr.IntOrString `json:"toleratedFailedReplicas,omitempty"` // MetricsAnalysis *MetricsAnalysisBackground `json:"metricsAnalysis,omitempty"` } diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 39eb6a6..8332390 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -220,6 +220,11 @@ func (in *CanaryStrategy) DeepCopyInto(out *CanaryStrategy) { } } } + if in.ToleratedFailedReplicas != nil { + in, out := &in.ToleratedFailedReplicas, &out.ToleratedFailedReplicas + *out = new(intstr.IntOrString) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CanaryStrategy. 
@@ -316,6 +321,11 @@ func (in *ReleasePlan) DeepCopyInto(out *ReleasePlan) { *out = new(int32) **out = **in } + if in.ToleratedFailedReplicas != nil { + in, out := &in.ToleratedFailedReplicas, &out.ToleratedFailedReplicas + *out = new(intstr.IntOrString) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReleasePlan. diff --git a/config/crd/bases/rollouts.kruise.io_batchreleases.yaml b/config/crd/bases/rollouts.kruise.io_batchreleases.yaml index 21742e9..83bd130 100644 --- a/config/crd/bases/rollouts.kruise.io_batchreleases.yaml +++ b/config/crd/bases/rollouts.kruise.io_batchreleases.yaml @@ -95,6 +95,19 @@ spec: rolloutID: description: RolloutID indicates an id for each rollout progress type: string + toleratedFailedReplicas: + anyOf: + - type: integer + - type: string + description: ToleratedFailedReplicas allow users ignore some pods + that are not available due to some reasons, such as insufficient + resources. For example, there are 10 pods were updated, and + the ToleratedFailedReplicas=2, as long as 8 or more pods are + available, we think that the rollout progression can be continued. + If ToleratedFailedReplicas is not set (be nil), we will reuse + `MaxUnavailable` field in workload to tolerate failed replicas + by default. + x-kubernetes-int-or-string: true type: object targetReference: description: TargetRef contains the GVK and name of the workload that diff --git a/config/crd/bases/rollouts.kruise.io_rollouts.yaml b/config/crd/bases/rollouts.kruise.io_rollouts.yaml index c7626aa..2eb7423 100644 --- a/config/crd/bases/rollouts.kruise.io_rollouts.yaml +++ b/config/crd/bases/rollouts.kruise.io_rollouts.yaml @@ -121,6 +121,19 @@ spec: type: integer type: object type: array + toleratedFailedReplicas: + anyOf: + - type: integer + - type: string + description: ToleratedFailedReplicas allow users ignore some + pods that are not available due to some reasons, such as + insufficient resources. 
For example, there are 10 pods were + updated, and the ToleratedFailedReplicas=2, as long as 8 + or more pods are available, we think that the rollout progression + can be continued. If ToleratedFailedReplicas is not set + (be nil), we will reuse `MaxUnavailable` field in workload + to tolerate failed replicas by default. + x-kubernetes-int-or-string: true trafficRoutings: description: TrafficRoutings hosts all the supported service meshes supported to enable more fine-grained traffic routing diff --git a/pkg/controller/batchrelease/workloads/cloneset_control_plane.go b/pkg/controller/batchrelease/workloads/cloneset_control_plane.go index 8bd5bb9..3700c34 100644 --- a/pkg/controller/batchrelease/workloads/cloneset_control_plane.go +++ b/pkg/controller/batchrelease/workloads/cloneset_control_plane.go @@ -287,8 +287,12 @@ func (c *CloneSetRolloutController) CheckOneBatchReady() (bool, error) { realNeedUpgradeCanaryReplicas := CalculateRealCanaryReplicasGoal(expectedBatchStableReplicas, replicas, &c.release.Spec.ReleasePlan.Batches[currentBatch].CanaryReplicas) var maxUnavailableReplicas int - if c.clone.Spec.UpdateStrategy.MaxUnavailable != nil { - maxUnavailableReplicas, _ = intstr.GetValueFromIntOrPercent(c.clone.Spec.UpdateStrategy.MaxUnavailable, int(realNeedUpgradeCanaryReplicas), true) + if c.release.Spec.ReleasePlan.ToleratedFailedReplicas != nil { + maxUnavailableReplicas, _ = intstr.GetValueFromIntOrPercent( + c.release.Spec.ReleasePlan.ToleratedFailedReplicas, int(realNeedUpgradeCanaryReplicas), true) + } else if c.clone.Spec.UpdateStrategy.MaxUnavailable != nil { + maxUnavailableReplicas, _ = intstr.GetValueFromIntOrPercent( + c.clone.Spec.UpdateStrategy.MaxUnavailable, int(realNeedUpgradeCanaryReplicas), true) } klog.V(3).InfoS("check one batch, current info:", diff --git a/pkg/controller/batchrelease/workloads/deployment_canary_control_plane.go b/pkg/controller/batchrelease/workloads/deployment_canary_control_plane.go index 46da1d9..43bcc5b 100644 --- 
a/pkg/controller/batchrelease/workloads/deployment_canary_control_plane.go +++ b/pkg/controller/batchrelease/workloads/deployment_canary_control_plane.go @@ -184,9 +184,12 @@ func (c *DeploymentsRolloutController) CheckOneBatchReady() (bool, error) { canaryGoal := c.calculateCurrentCanary(c.newStatus.ObservedWorkloadReplicas) // max unavailable allowed replicas maxUnavailable := 0 - if c.canary.Spec.Strategy.RollingUpdate != nil && - c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable != nil { - maxUnavailable, _ = intstr.GetScaledValueFromIntOrPercent(c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable, int(*c.canary.Spec.Replicas), true) + if c.release.Spec.ReleasePlan.ToleratedFailedReplicas != nil { + maxUnavailable, _ = intstr.GetScaledValueFromIntOrPercent( + c.release.Spec.ReleasePlan.ToleratedFailedReplicas, int(*c.canary.Spec.Replicas), true) + } else if c.canary.Spec.Strategy.RollingUpdate != nil && c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable != nil { + maxUnavailable, _ = intstr.GetScaledValueFromIntOrPercent( + c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable, int(*c.canary.Spec.Replicas), true) } klog.InfoS("checking the batch releasing progress", diff --git a/pkg/controller/batchrelease/workloads/statefulset_like_controller.go b/pkg/controller/batchrelease/workloads/statefulset_like_controller.go index 220083f..3983657 100644 --- a/pkg/controller/batchrelease/workloads/statefulset_like_controller.go +++ b/pkg/controller/batchrelease/workloads/statefulset_like_controller.go @@ -190,7 +190,10 @@ func (c *StatefulSetLikeController) IsBatchReady(canaryReplicasGoal, stableRepli } maxUnavailable := 0 - if workloadInfo.MaxUnavailable != nil { + if c.planController.Spec.ReleasePlan.ToleratedFailedReplicas != nil { + maxUnavailable, _ = intstr.GetScaledValueFromIntOrPercent( + c.planController.Spec.ReleasePlan.ToleratedFailedReplicas, int(canaryReplicasGoal), true) + } else if workloadInfo.MaxUnavailable != nil { maxUnavailable, _ = 
intstr.GetScaledValueFromIntOrPercent(workloadInfo.MaxUnavailable, int(canaryReplicasGoal), true) } diff --git a/pkg/controller/rollout/batchrelease/inner_batchrelease.go b/pkg/controller/rollout/batchrelease/inner_batchrelease.go index ea1d293..3ab8d15 100644 --- a/pkg/controller/rollout/batchrelease/inner_batchrelease.go +++ b/pkg/controller/rollout/batchrelease/inner_batchrelease.go @@ -84,7 +84,7 @@ func (r *innerBatchRelease) Verify(index int32) (bool, error) { // check whether batchRelease configuration is the latest newBr := createBatchRelease(r.rollout, r.batchName, r.rolloutID) - if reflect.DeepEqual(batch.Spec.ReleasePlan.Batches, newBr.Spec.ReleasePlan.Batches) { + if isSyncedContext(newBr, batch) { klog.Infof("rollout(%s/%s) batchRelease(generation:%d) configuration is the latest", r.rollout.Namespace, r.rollout.Name, batch.Generation) return true, nil } @@ -97,6 +97,7 @@ func (r *innerBatchRelease) Verify(index int32) (bool, error) { } batch.Spec.ReleasePlan.Batches = newBr.Spec.ReleasePlan.Batches batch.Spec.ReleasePlan.BatchPartition = utilpointer.Int32Ptr(index) + batch.Spec.ReleasePlan.ToleratedFailedReplicas = r.rollout.Spec.Strategy.Canary.ToleratedFailedReplicas if err = r.Client.Update(context.TODO(), batch); err != nil { return err } @@ -331,9 +332,10 @@ func createBatchRelease(rollout *rolloutv1alpha1.Rollout, batchName, rolloutID s }, }, ReleasePlan: rolloutv1alpha1.ReleasePlan{ - Batches: batches, - RolloutID: rolloutID, - BatchPartition: utilpointer.Int32Ptr(0), + Batches: batches, + RolloutID: rolloutID, + BatchPartition: utilpointer.Int32Ptr(0), + ToleratedFailedReplicas: rollout.Spec.Strategy.Canary.ToleratedFailedReplicas, }, }, } @@ -363,3 +365,13 @@ func IsPromoted(rollout *rolloutv1alpha1.Rollout, batch *rolloutv1alpha1.BatchRe } return true } + +func isSyncedContext(new, old *rolloutv1alpha1.BatchRelease) bool { + if !reflect.DeepEqual(new.Spec.ReleasePlan.ToleratedFailedReplicas, old.Spec.ReleasePlan.ToleratedFailedReplicas) { + 
return false + } + if !reflect.DeepEqual(new.Spec.ReleasePlan.Batches, old.Spec.ReleasePlan.Batches) { + return false + } + return true +}