/* Copyright 2022 The Kruise Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package rollout import ( "context" "fmt" "time" appsv1alpha1 "github.com/openkruise/kruise-api/apps/v1alpha1" rolloutv1alpha1 "github.com/openkruise/rollouts/api/v1alpha1" "github.com/openkruise/rollouts/pkg/util" apps "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" ) func (r *rolloutContext) runCanary() error { canaryStatus := r.newStatus.CanaryStatus // init canary status if canaryStatus.CanaryRevision == "" { canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateUpgrade canaryStatus.CanaryRevision = r.workload.CanaryRevision canaryStatus.CurrentStepIndex = 1 canaryStatus.RolloutHash = r.rollout.Annotations[util.RolloutHashAnnotation] } // update canary status canaryStatus.CanaryReplicas = r.workload.CanaryReplicas canaryStatus.CanaryReadyReplicas = r.workload.CanaryReadyReplicas switch canaryStatus.CurrentStepState { case rolloutv1alpha1.CanaryStepStateUpgrade: klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", r.rollout.Namespace, r.rollout.Name, rolloutv1alpha1.CanaryStepStateUpgrade) // If the last step is 100%, there is no need to execute the canary process at this time if r.rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1].Weight == 100 { klog.Infof("rollout(%s/%s) last step is 100%, there is no need to execute the canary process at this time, and set state=%s", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex-1, canaryStatus.CurrentStepIndex, rolloutv1alpha1.CanaryStepStateCompleted) canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateCompleted } else { done, err := r.doCanaryUpgrade() if err != nil { return err } else if done { canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateTrafficRouting canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex, rolloutv1alpha1.CanaryStepStateUpgrade, canaryStatus.CurrentStepState) } } case rolloutv1alpha1.CanaryStepStateTrafficRouting: klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", r.rollout.Namespace, r.rollout.Name, rolloutv1alpha1.CanaryStepStateTrafficRouting) done, err := r.doCanaryTrafficRouting() if err != nil { return err } else if done { canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateMetricsAnalysis klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex, rolloutv1alpha1.CanaryStepStateTrafficRouting, canaryStatus.CurrentStepState) } expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second) r.recheckTime = &expectedTime case rolloutv1alpha1.CanaryStepStateMetricsAnalysis: klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", r.rollout.Namespace, r.rollout.Name, rolloutv1alpha1.CanaryStepStateMetricsAnalysis) done, err := r.doCanaryMetricsAnalysis() if err != nil { return err } else if done { canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStatePaused klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex, rolloutv1alpha1.CanaryStepStateMetricsAnalysis, canaryStatus.CurrentStepState) } case rolloutv1alpha1.CanaryStepStatePaused: klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", r.rollout.Namespace, r.rollout.Name, rolloutv1alpha1.CanaryStepStatePaused) done, err := r.doCanaryPaused() if err != nil { return err } else if done { canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateReady klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex, rolloutv1alpha1.CanaryStepStatePaused, canaryStatus.CurrentStepState) } case rolloutv1alpha1.CanaryStepStateReady: klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", r.rollout.Namespace, r.rollout.Name, rolloutv1alpha1.CanaryStepStateReady) // run next step if len(r.rollout.Spec.Strategy.Canary.Steps) > int(canaryStatus.CurrentStepIndex) { canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} canaryStatus.CurrentStepIndex++ canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateUpgrade klog.Infof("rollout(%s/%s) canary step from(%d) -> to(%d)", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex-1, canaryStatus.CurrentStepIndex) } else { klog.Infof("rollout(%s/%s) canary run all steps, and completed", r.rollout.Namespace, r.rollout.Name) canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} canaryStatus.CurrentStepState = rolloutv1alpha1.CanaryStepStateCompleted } klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex, rolloutv1alpha1.CanaryStepStateReady, canaryStatus.CurrentStepState) // canary completed case rolloutv1alpha1.CanaryStepStateCompleted: klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", r.rollout.Namespace, r.rollout.Name, rolloutv1alpha1.CanaryStepStateCompleted) } return nil } func (r *rolloutContext) doCanaryUpgrade() (bool, error) { // only traffic routing /*if len(r.rollout.Spec.Strategy.Canary.Steps) == 0 { if r.workload.CanaryReadyReplicas > 0 { klog.Infof("rollout(%s/%s) workload(%s) canaryAvailable(%d), and go to the next stage", r.rollout.Namespace, r.rollout.Name, r.workload.Name, r.workload.CanaryReadyReplicas) return true, nil } klog.Infof("rollout(%s/%s) workload(%s) canaryAvailable(%d), and wait a moment", r.rollout.Namespace, r.rollout.Name, r.workload.Name, r.workload.CanaryReadyReplicas) return false, nil }*/ // verify whether batchRelease configuration is the latest steps := len(r.rollout.Spec.Strategy.Canary.Steps) canaryStatus := r.newStatus.CanaryStatus isLatest, err := r.batchControl.Verify(canaryStatus.CurrentStepIndex) if err != nil { return false, err } else if !isLatest { return false, nil } // fetch batchRelease batch, err := r.batchControl.FetchBatchRelease() if err != nil { return false, err } else if batch.Status.ObservedReleasePlanHash != util.HashReleasePlanBatches(&batch.Spec.ReleasePlan) || batch.Generation != batch.Status.ObservedGeneration { klog.Infof("rollout(%s/%s) batchReleasePlan is not consistent, and wait a moment", r.rollout.Namespace, r.rollout.Name) return false, nil } batchData := util.DumpJSON(batch.Status) cond := util.GetRolloutCondition(*r.newStatus, rolloutv1alpha1.RolloutConditionProgressing) cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and upgrade workload new versions", canaryStatus.CurrentStepIndex, steps) r.newStatus.Message = cond.Message // promote workload next batch release if *batch.Spec.ReleasePlan.BatchPartition+1 < canaryStatus.CurrentStepIndex { r.recorder.Eventf(r.rollout, corev1.EventTypeNormal, "Progressing", fmt.Sprintf("start upgrade step(%d) canary pods with new versions", canaryStatus.CurrentStepIndex)) klog.Infof("rollout(%s/%s) will promote batch from(%d) -> to(%d)", r.rollout.Namespace, r.rollout.Name, *batch.Spec.ReleasePlan.BatchPartition+1, canaryStatus.CurrentStepIndex) return r.batchControl.Promote(canaryStatus.CurrentStepIndex, false) } // check whether batchRelease is ready if batch.Status.CanaryStatus.CurrentBatchState != rolloutv1alpha1.ReadyBatchState || batch.Status.CanaryStatus.CurrentBatch+1 != canaryStatus.CurrentStepIndex { klog.Infof("rollout(%s/%s) batch(%s) state(%s), and wait a moment", r.rollout.Namespace, r.rollout.Name, batchData, batch.Status.CanaryStatus.CurrentBatchState) return false, nil } r.recorder.Eventf(r.rollout, corev1.EventTypeNormal, "Progressing", fmt.Sprintf("upgrade step(%d) canary pods with new versions done", canaryStatus.CurrentStepIndex)) klog.Infof("rollout(%s/%s) batch(%s) state(%s), and success", r.rollout.Namespace, r.rollout.Name, batchData, batch.Status.CanaryStatus.CurrentBatchState) return true, nil } func (r *rolloutContext) doCanaryMetricsAnalysis() (bool, error) { // todo return true, nil } func (r *rolloutContext) doCanaryPaused() (bool, error) { // No step set, need manual confirmation if len(r.rollout.Spec.Strategy.Canary.Steps) == 0 { klog.Infof("rollout(%s/%s) don't contains steps, and need manual confirmation", r.rollout.Namespace, r.rollout.Name) return false, nil } canaryStatus := r.newStatus.CanaryStatus currentStep := r.rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1] steps := len(r.rollout.Spec.Strategy.Canary.Steps) cond := util.GetRolloutCondition(*r.newStatus, rolloutv1alpha1.RolloutConditionProgressing) // need manual confirmation if currentStep.Pause.Duration == nil { klog.Infof("rollout(%s/%s) don't set pause duration, and need manual confirmation", r.rollout.Namespace, r.rollout.Name) cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and you need manually confirm to enter the next step", canaryStatus.CurrentStepIndex, steps) r.newStatus.Message = cond.Message return false, nil } cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and wait duration(%d seconds) to enter the next step", canaryStatus.CurrentStepIndex, steps, *currentStep.Pause.Duration) r.newStatus.Message = cond.Message // wait duration time, then go to next step duration := time.Second * time.Duration(*currentStep.Pause.Duration) expectedTime := canaryStatus.LastUpdateTime.Add(duration) if expectedTime.Before(time.Now()) { klog.Infof("rollout(%s/%s) canary step(%d) paused duration(%d seconds), and go to the next step", r.rollout.Namespace, r.rollout.Name, canaryStatus.CurrentStepIndex, *currentStep.Pause.Duration) return true, nil } r.recheckTime = &expectedTime return false, nil } // cleanup after rollout is completed or finished func (r *rolloutContext) doCanaryFinalising() (bool, error) { // when CanaryStatus is nil, which means canary action hasn't started yet, don't need doing cleanup if r.newStatus.CanaryStatus == nil { return true, nil } // 1. rollout progressing complete, allow workload paused=false in webhook err := r.removeRolloutStateInWorkload() if err != nil { return false, err } // 2. restore stable service, remove podRevision selector done, err := r.restoreStableService() if err != nil || !done { return done, err } // 3. upgrade stable deployment, set paused=false // isComplete indicates whether rollout progressing complete, and wait for all pods are ready // else indicates rollout is canceled done, err = r.batchControl.Promote(-1, r.isComplete) if err != nil || !done { return done, err } // 4. route all traffic to stable service done, err = r.doFinalisingTrafficRouting() if err != nil || !done { return done, err } // 5. delete batchRelease crd done, err = r.batchControl.Finalize() if err != nil { klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", r.rollout.Namespace, r.rollout.Name, err.Error()) return false, err } else if !done { return false, nil } klog.Infof("rollout(%s/%s) batchRelease Finalize success", r.rollout.Namespace, r.rollout.Name) return true, nil } func (r *rolloutContext) removeRolloutStateInWorkload() error { if r.workload == nil { return nil } if _, ok := r.workload.Annotations[util.InRolloutProgressingAnnotation]; !ok { return nil } var obj client.Object // cloneSet if r.workload.Kind == util.ControllerKruiseKindCS.Kind { obj = &appsv1alpha1.CloneSet{} // deployment } else { obj = &apps.Deployment{} } err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { if err := r.Get(context.TODO(), types.NamespacedName{Name: r.workload.Name, Namespace: r.workload.Namespace}, obj); err != nil { klog.Errorf("getting updated workload(%s.%s) failed: %s", r.workload.Namespace, r.workload.Name, err.Error()) return err } annotations := obj.GetAnnotations() delete(annotations, util.InRolloutProgressingAnnotation) obj.SetAnnotations(annotations) return r.Update(context.TODO(), obj) }) if err != nil { klog.Errorf("update rollout(%s/%s) workload(%s) failed: %s", r.rollout.Namespace, r.rollout.Name, r.workload.Name, err.Error()) return err } klog.Infof("remove rollout(%s/%s) workload(%s) annotation[%s] success", r.rollout.Namespace, r.rollout.Name, r.workload.Name, util.InRolloutProgressingAnnotation) return nil }