/*
Copyright 2022 The Kruise Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package batchrelease

import (
	"reflect"

	"github.com/openkruise/rollouts/api/v1alpha1"
	"github.com/openkruise/rollouts/pkg/controller/batchrelease/workloads"
	"github.com/openkruise/rollouts/pkg/util"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/klog/v2"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)
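
// checkHealthBeforeExecution syncs the target workload info and checks a series
// of special cases about the release plan and the workload before the batch
// release plan is executed. It returns whether this reconcile round should stop,
// a result used to requeue the request, and any non-NotFound error that occurred
// while syncing the workload.
//
// Illustrative usage (an assumption for documentation purposes, not code taken
// from the caller): a reconcile loop would typically run this check first and
// only continue executing the current batch when the check allows it:
//
//	stop, result, err := r.checkHealthBeforeExecution(workloadController)
//	if stop || err != nil {
//		return result, err
//	}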
func (r *Executor) checkHealthBeforeExecution(controller workloads.WorkloadController) (bool, reconcile.Result, error) {
	var err error
	var reason string
	var message string
	var needRetry bool
	needStopThisRound := false
	result := reconcile.Result{}

	// sync the workload info and watch the workload change event
	workloadEvent, workloadInfo, err := controller.SyncWorkloadInfo()

	// Note: must keep the order of the following cases:
	switch {
	/**************************************************************************
	          SPECIAL CASES ABOUT THE BATCH RELEASE PLAN
	**************************************************************************/
	// The following special cases are about the **batch release plan**, including:
	//   (1). Plan is deleted or cancelled
	//   (2). Plan is paused during rollout
	//   (3). Plan is changed during rollout
	//   (4). Plan status is unexpected/unhealthy
	case isPlanTerminating(r.release, r.releaseStatus):
		// handle the case that the plan is deleted or is terminating
		reason = "PlanTerminating"
		message = "Release plan is deleted or cancelled, then terminate"
		signalTerminating(r.releaseStatus)

	case isPlanPaused(err, r.releasePlan, r.releaseStatus):
		// handle the case that releasePlan.paused = true
		reason = "PlanPaused"
		message = "release plan is paused, then stop reconcile"
		needStopThisRound = true

	case isPlanChanged(r.releasePlan, r.releaseStatus):
		// handle the case that the release plan is changed during progressing
		reason = "PlanChanged"
		message = "release plan is changed, then recalculate status"
		signalRecalculate(r.release, r.releaseStatus)

	case isPlanUnhealthy(r.releasePlan, r.releaseStatus):
		// handle the case that the release status is in chaos, which may lead to a panic
		reason = "PlanStatusUnhealthy"
		message = "release plan is unhealthy, then restart"
		needStopThisRound = true

	/**************************************************************************
	          SPECIAL CASES ABOUT THE WORKLOAD
	**************************************************************************/
	// The following special cases are about the **workload**, including:
	//   (1). Error when getting workload info
	//   (2). Workload was deleted
	//   (3). Workload is created
	//   (4). Workload is scaled during rollout
	//   (5). Workload is rolled back during rollout
	//   (6). Workload revision is changed during rollout
	//   (7). Workload is at an unexpected/unhealthy state
	//   (8). Workload is at an unstable state, its workload info is untrustworthy
	case isGetWorkloadInfoError(err):
		// handle the case of IgnoreNotFound(err) != nil
		reason = "GetWorkloadError"
		message = err.Error()

	case isWorkloadGone(err, r.releaseStatus):
		// handle the case that the workload is deleted
		reason = "WorkloadGone"
		message = "target workload has gone, then terminate"
		signalTerminating(r.releaseStatus)

	case isWorkloadLocated(err, r.releaseStatus):
		// handle the case that the workload is newly created
		reason = "WorkloadLocated"
		message = "workload is located, then start"
		signalLocated(r.releaseStatus)

	case isWorkloadScaling(workloadEvent, r.releaseStatus):
		// handle the case that the workload is scaling during progressing
		reason = "ReplicasChanged"
		message = "workload is scaling, then reinitialize batch status"
		signalReinitializeBatch(r.releaseStatus)
		// we must ensure that this field is updated only when we have observed
		// the workload scaling event, otherwise this event may be lost.
		r.releaseStatus.ObservedWorkloadReplicas = *workloadInfo.Replicas

	case isWorkloadChanged(workloadEvent, r.releaseStatus):
		// handle the case of continuous release: v1 -> v2 -> v3
		reason = "TargetRevisionChanged"
		message = "workload revision was changed, then abort"
		signalFinalize(r.releaseStatus)

	case isWorkloadUnhealthy(workloadEvent, r.releaseStatus):
		// handle the case that the workload is unhealthy and the rollout plan cannot go on
		reason = "WorkloadUnHealthy"
		message = "workload is UnHealthy, then stop"
		needStopThisRound = true

	case isWorkloadUnstable(workloadEvent, r.releaseStatus):
		// handle the case that workload.Generation != workload.Status.ObservedGeneration
		reason = "WorkloadNotStable"
		message = "workload status is not stable, then wait"
		needStopThisRound = true
	}

	// log the special event info
	if len(message) > 0 {
		r.recorder.Eventf(r.release, v1.EventTypeWarning, reason, message)
		klog.Warningf("Special case occurred in BatchRelease(%v), message: %v", r.releaseKey, message)
	}

	// sync workload info with status
	oldStatus := r.releaseStatus.DeepCopy()
	refreshStatus(r.release, r.releaseStatus, workloadInfo)

	// If a retry is needed, or the status phase or state changed, we should
	// stop and retry. This is because we must ensure that the phase and
	// state are persisted in etcd, otherwise the state machine may fall
	// into chaos.
	if !reflect.DeepEqual(oldStatus, r.releaseStatus) {
		needRetry = true
	}

	// will retry after 50ms
	err = client.IgnoreNotFound(err)
	if needRetry && err == nil {
		needStopThisRound = true
		result = reconcile.Result{RequeueAfter: DefaultDuration}
	}

	return needStopThisRound, result, err
}
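
// refreshStatus copies the observed updated replica counters from the workload
// info into the release status, and records the hash of the release plan if it
// has not been observed yet.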
func refreshStatus(release *v1alpha1.BatchRelease, newStatus *v1alpha1.BatchReleaseStatus, workloadInfo *util.WorkloadInfo) {
	// refresh workload info for status
	if workloadInfo != nil {
		if workloadInfo.Status != nil {
			newStatus.CanaryStatus.UpdatedReplicas = workloadInfo.Status.UpdatedReplicas
			newStatus.CanaryStatus.UpdatedReadyReplicas = workloadInfo.Status.UpdatedReadyReplicas
		}
	}
	if len(newStatus.ObservedReleasePlanHash) == 0 {
		newStatus.ObservedReleasePlanHash = util.HashReleasePlanBatches(&release.Spec.ReleasePlan)
	}
}
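
// isPlanTerminating returns true if the BatchRelease object is being deleted or
// its phase is already Terminating.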
func isPlanTerminating(release *v1alpha1.BatchRelease, status *v1alpha1.BatchReleaseStatus) bool {
	return release.DeletionTimestamp != nil || status.Phase == v1alpha1.RolloutPhaseTerminating
}
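
// isPlanChanged returns true if the release plan was modified while the release
// is still progressing, i.e. the observed plan hash no longer matches the spec.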
func isPlanChanged(plan *v1alpha1.ReleasePlan, status *v1alpha1.BatchReleaseStatus) bool {
	return status.ObservedReleasePlanHash != util.HashReleasePlanBatches(plan) && status.Phase == v1alpha1.RolloutPhaseProgressing
}
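
// isPlanUnhealthy returns true if the current batch index is out of range for
// the release plan while the release is progressing, which indicates an
// inconsistent status that could otherwise lead to a panic.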
func isPlanUnhealthy(plan *v1alpha1.ReleasePlan, status *v1alpha1.BatchReleaseStatus) bool {
	return int(status.CanaryStatus.CurrentBatch) >= len(plan.Batches) && status.Phase == v1alpha1.RolloutPhaseProgressing
}
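
// isPlanPaused returns true if the release plan is paused while the release is
// progressing and the target workload has not been deleted.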
func isPlanPaused(err error, plan *v1alpha1.ReleasePlan, status *v1alpha1.BatchReleaseStatus) bool {
	return plan.Paused && status.Phase == v1alpha1.RolloutPhaseProgressing && !isWorkloadGone(err, status)
}
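
// isGetWorkloadInfoError returns true for any error other than NotFound that
// occurred while fetching the workload info.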
func isGetWorkloadInfoError(err error) bool {
	return err != nil && !errors.IsNotFound(err)
}
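
// isWorkloadLocated returns true if the workload was fetched successfully while
// the release is still in the Initial phase.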
func isWorkloadLocated(err error, status *v1alpha1.BatchReleaseStatus) bool {
	return err == nil && status.Phase == v1alpha1.RolloutPhaseInitial
}
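
// isWorkloadGone returns true if the workload can no longer be found after the
// release has left the Initial phase.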
func isWorkloadGone(err error, status *v1alpha1.BatchReleaseStatus) bool {
	return errors.IsNotFound(err) && status.Phase != v1alpha1.RolloutPhaseInitial
}
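
// isWorkloadScaling returns true if the workload replicas were changed while the
// release is progressing.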
func isWorkloadScaling(event workloads.WorkloadEventType, status *v1alpha1.BatchReleaseStatus) bool {
	return event == workloads.WorkloadReplicasChanged && status.Phase == v1alpha1.RolloutPhaseProgressing
}
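
// isWorkloadChanged returns true if the workload pod template was changed while
// the release is progressing.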
func isWorkloadChanged(event workloads.WorkloadEventType, status *v1alpha1.BatchReleaseStatus) bool {
	return event == workloads.WorkloadPodTemplateChanged && status.Phase == v1alpha1.RolloutPhaseProgressing
}
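
// isWorkloadUnhealthy returns true if the workload was reported unhealthy while
// the release is progressing.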
func isWorkloadUnhealthy(event workloads.WorkloadEventType, status *v1alpha1.BatchReleaseStatus) bool {
	return event == workloads.WorkloadUnHealthy && status.Phase == v1alpha1.RolloutPhaseProgressing
}
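
// isWorkloadUnstable returns true if the workload controller is still
// reconciling, i.e. its observed generation has not caught up with its spec.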
func isWorkloadUnstable(event workloads.WorkloadEventType, _ *v1alpha1.BatchReleaseStatus) bool {
	return event == workloads.WorkloadStillReconciling
}