imporve traffic strategy for canary and partition release
Signed-off-by: yunbo <yunbo10124scut@gmail.com>
This commit is contained in:
parent
cddc53ad14
commit
d32837e104
|
@ -17,6 +17,9 @@ limitations under the License.
|
|||
package v1beta1
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
apps "k8s.io/api/apps/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/intstr"
|
||||
|
@ -92,6 +95,22 @@ func (r *RolloutStrategy) GetRollingStyle() RollingStyleType {
|
|||
return PartitionRollingStyle
|
||||
}
|
||||
|
||||
// simply using EnableExtraWorkloadForCanary is not enough, for example, a v1alaph1 Rollout
|
||||
// can be converted to v1beta1 Rollout with EnableExtraWorkloadForCanary set as true, even the
|
||||
// objectRef is cloneset (which doesn't support canary release)
|
||||
func IsRealPartition(rollout *Rollout) bool {
|
||||
estimation := rollout.Spec.Strategy.GetRollingStyle()
|
||||
if estimation == EmptyRollingStyle || estimation == BlueGreenRollingStyle {
|
||||
return false
|
||||
}
|
||||
targetRef := rollout.Spec.WorkloadRef
|
||||
if targetRef.APIVersion == apps.SchemeGroupVersion.String() && targetRef.Kind == reflect.TypeOf(apps.Deployment{}).Name() &&
|
||||
estimation == CanaryRollingStyle {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// r.GetRollingStyle() == BlueGreenRollingStyle
|
||||
func (r *RolloutStrategy) IsBlueGreenRelease() bool {
|
||||
return r.GetRollingStyle() == BlueGreenRollingStyle
|
||||
|
|
|
@ -75,7 +75,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
|
|||
currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1]
|
||||
if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
|
||||
tr := newTrafficRoutingContext(c)
|
||||
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
|
||||
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
|
||||
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -89,10 +89,59 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
|
|||
switch canaryStatus.CurrentStepState {
|
||||
// before CanaryStepStateUpgrade, handle some special cases, to prevent traffic loss
|
||||
case v1beta1.CanaryStepStateInit:
|
||||
// placeholder for the later traffic modification Pull Request
|
||||
canaryStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, canaryStatus.CurrentStepIndex)
|
||||
klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateInit)
|
||||
tr := newTrafficRoutingContext(c)
|
||||
if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
|
||||
canaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
|
||||
klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
|
||||
canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, canaryStatus.CurrentStepState)
|
||||
return nil
|
||||
}
|
||||
|
||||
/*
|
||||
The next check is used to bypass the bug in ingress-nginx controller https://github.com/kubernetes/ingress-nginx/issues/9635
|
||||
for partition release, if the currentStep replicas is "100%", we can assume that all traffic should be routed to canary pods
|
||||
*/
|
||||
if currentStep.Replicas.StrVal == "100%" && v1beta1.IsRealPartition(c.Rollout) {
|
||||
klog.Infof("special case detected: rollout(%s/%s) restore stable Service", c.Rollout.Namespace, c.Rollout.Name)
|
||||
done, err := m.trafficRoutingManager.RestoreStableService(tr)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if !done {
|
||||
expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second)
|
||||
c.RecheckTime = &expectedTime
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
The next check is used to solve the following scenario:
|
||||
steps:
|
||||
- replicas: 1 # frist batch
|
||||
matches:
|
||||
- headers:
|
||||
- name: user-agent
|
||||
type: Exact
|
||||
value: pc
|
||||
we should patch selector to stable Service before CanaryStepStateUpgrade when in the first batch
|
||||
otherwise, some traffic will loss between CanaryStepStateUpgrade and CanaryStepStateTrafficRouting
|
||||
*/
|
||||
if canaryStatus.CurrentStepIndex == 1 {
|
||||
klog.Infof("special case detected: rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name)
|
||||
done, err := m.trafficRoutingManager.PatchStableService(tr)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if !done {
|
||||
expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second)
|
||||
c.RecheckTime = &expectedTime
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
canaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
|
||||
fallthrough
|
||||
klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
|
||||
canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, canaryStatus.CurrentStepState)
|
||||
|
||||
case v1beta1.CanaryStepStateUpgrade:
|
||||
klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateUpgrade)
|
||||
|
@ -101,6 +150,9 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
|
|||
return err
|
||||
} else if done {
|
||||
canaryStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting
|
||||
if currentStep.Replicas.StrVal == "100%" && v1beta1.IsRealPartition(c.Rollout) {
|
||||
canaryStatus.CurrentStepState = v1beta1.CanaryStepStateMetricsAnalysis
|
||||
}
|
||||
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
|
||||
canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateUpgrade, canaryStatus.CurrentStepState)
|
||||
|
@ -216,6 +268,12 @@ func (m *canaryReleaseManager) doCanaryPaused(c *RolloutContext) (bool, error) {
|
|||
canaryStatus := c.NewStatus.CanaryStatus
|
||||
currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1]
|
||||
steps := len(c.Rollout.Spec.Strategy.Canary.Steps)
|
||||
// If it is the last step, and 100% of pods, then return true
|
||||
if int32(steps) == canaryStatus.CurrentStepIndex {
|
||||
if currentStep.Replicas != nil && currentStep.Replicas.StrVal == "100%" {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
cond := util.GetRolloutCondition(*c.NewStatus, v1beta1.RolloutConditionProgressing)
|
||||
// need manual confirmation
|
||||
if currentStep.Pause.Duration == nil {
|
||||
|
@ -268,8 +326,9 @@ func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) {
|
|||
|
||||
// cleanup after rollout is completed or finished
|
||||
func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) {
|
||||
canaryStatus := c.NewStatus.CanaryStatus
|
||||
// when CanaryStatus is nil, which means canary action hasn't started yet, don't need doing cleanup
|
||||
if c.NewStatus.CanaryStatus == nil {
|
||||
if canaryStatus == nil {
|
||||
return true, nil
|
||||
}
|
||||
// 1. rollout progressing complete, remove rollout progressing annotation in workload
|
||||
|
@ -278,33 +337,73 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
|
|||
return false, err
|
||||
}
|
||||
tr := newTrafficRoutingContext(c)
|
||||
// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
|
||||
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, true)
|
||||
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
|
||||
if err != nil || !done {
|
||||
return done, err
|
||||
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep)
|
||||
switch canaryStatus.FinalisingStep {
|
||||
default:
|
||||
canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService
|
||||
fallthrough
|
||||
|
||||
case v1beta1.FinalisingStepTypeStableService:
|
||||
// restore stable service selector to select all pods [with grace time]
|
||||
done, err := m.trafficRoutingManager.RestoreStableService(tr)
|
||||
if err != nil || !done {
|
||||
canaryStatus.LastUpdateTime = tr.LastUpdateTime
|
||||
return done, err
|
||||
}
|
||||
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway
|
||||
|
||||
case v1beta1.FinalisingStepTypeGateway:
|
||||
// modify network api(ingress or gateway api) configuration
|
||||
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
|
||||
if err != nil || !done {
|
||||
canaryStatus.LastUpdateTime = tr.LastUpdateTime
|
||||
return done, err
|
||||
}
|
||||
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeCanaryService
|
||||
|
||||
/*
|
||||
//TODO - As mentioned in FinalisingTrafficRouting function,
|
||||
we should wait grace time between FinalisingStepTypeGateway and FinalisingStepTypeCanaryService
|
||||
to avoid a very rare case which could cause minor traffic loss (espically, Istio), but it's difficult
|
||||
to implement now.
|
||||
However, we still reserve the FinalisingStepTypeCanaryService step here, but instead of removing the
|
||||
canary Service as expected (which has been done in FinalisingStepTypeGateway), FinalisingStepTypeCanaryService
|
||||
simply wait a gracetime between FinalisingStepTypeCanaryService and FinalisingStepTypeDeleteBR now
|
||||
*/
|
||||
case v1beta1.FinalisingStepTypeCanaryService:
|
||||
// wait a gracetime for safety
|
||||
if canaryStatus.LastUpdateTime != nil {
|
||||
if verifyTime := canaryStatus.LastUpdateTime.Add(time.Second * time.Duration(3)); verifyTime.After(time.Now()) {
|
||||
klog.Infof("restoring network configuration, but we need to wait %d seconds", 3)
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease
|
||||
|
||||
case v1beta1.FinalisingStepTypeBatchRelease:
|
||||
// set workload.pause=false; set workload.partition=0
|
||||
done, err := m.finalizingBatchRelease(c)
|
||||
if err != nil || !done {
|
||||
return done, err
|
||||
}
|
||||
canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
|
||||
|
||||
case v1beta1.FinalisingStepTypeDeleteBR:
|
||||
// delete batchRelease crd
|
||||
done, err := m.removeBatchRelease(c)
|
||||
if err != nil {
|
||||
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
|
||||
return false, err
|
||||
} else if !done {
|
||||
return false, nil
|
||||
}
|
||||
klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
|
||||
return true, nil
|
||||
}
|
||||
// 3. set workload.pause=false; set workload.partition=0
|
||||
done, err = m.finalizingBatchRelease(c)
|
||||
if err != nil || !done {
|
||||
return done, err
|
||||
}
|
||||
// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
|
||||
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
|
||||
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
|
||||
if err != nil || !done {
|
||||
return done, err
|
||||
}
|
||||
// 5. delete batchRelease crd
|
||||
done, err = m.removeBatchRelease(c)
|
||||
if err != nil {
|
||||
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
|
||||
return false, err
|
||||
} else if !done {
|
||||
return false, nil
|
||||
}
|
||||
klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
|
||||
return true, nil
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error {
|
||||
|
|
|
@ -63,6 +63,7 @@ func TestRunCanary(t *testing.T) {
|
|||
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 1
|
||||
obj.Status.CanaryStatus.NextStepIndex = 2
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
|
||||
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonInRolling
|
||||
|
@ -76,6 +77,7 @@ func TestRunCanary(t *testing.T) {
|
|||
s.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
s.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
s.CanaryStatus.CurrentStepIndex = 1
|
||||
s.CanaryStatus.NextStepIndex = 2
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
|
||||
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonInRolling
|
||||
|
@ -138,6 +140,7 @@ func TestRunCanary(t *testing.T) {
|
|||
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 1
|
||||
obj.Status.CanaryStatus.NextStepIndex = 2
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
|
||||
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonInRolling
|
||||
|
@ -183,6 +186,7 @@ func TestRunCanary(t *testing.T) {
|
|||
s.CanaryStatus.CanaryReplicas = 1
|
||||
s.CanaryStatus.CanaryReadyReplicas = 1
|
||||
s.CanaryStatus.CurrentStepIndex = 1
|
||||
s.CanaryStatus.NextStepIndex = 2
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting
|
||||
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonInRolling
|
||||
|
@ -287,6 +291,7 @@ func TestRunCanaryPaused(t *testing.T) {
|
|||
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 3
|
||||
obj.Status.CanaryStatus.NextStepIndex = 4
|
||||
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStatePaused
|
||||
return obj
|
||||
|
@ -298,6 +303,7 @@ func TestRunCanaryPaused(t *testing.T) {
|
|||
obj.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
obj.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
obj.CanaryStatus.CurrentStepIndex = 3
|
||||
obj.CanaryStatus.NextStepIndex = 4
|
||||
obj.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
obj.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStatePaused
|
||||
return obj
|
||||
|
|
|
@ -45,6 +45,8 @@ type RolloutContext struct {
|
|||
RecheckTime *time.Time
|
||||
// wait stable workload pods ready
|
||||
WaitReady bool
|
||||
// finalising reason
|
||||
FinalizeReason string
|
||||
}
|
||||
|
||||
// parameter1 retryReconcile, parameter2 error
|
||||
|
@ -122,6 +124,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
|
|||
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
|
||||
var done bool
|
||||
rolloutContext.WaitReady = true
|
||||
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonSuccess
|
||||
done, err = r.doFinalising(rolloutContext)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -146,6 +149,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
|
|||
case v1alpha1.ProgressingReasonCancelling:
|
||||
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
|
||||
var done bool
|
||||
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonRollback
|
||||
done, err = r.doFinalising(rolloutContext)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -411,22 +415,36 @@ func isRollingBackInBatches(rollout *v1beta1.Rollout, workload *util.Workload) b
|
|||
// 1. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods
|
||||
// 2. remove batchRelease CR.
|
||||
func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) {
|
||||
if len(c.Rollout.Spec.Strategy.Canary.TrafficRoutings) > 0 {
|
||||
if c.Rollout.Spec.Strategy.HasTrafficRoutings() {
|
||||
// modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods
|
||||
tr := newTrafficRoutingContext(c)
|
||||
done, err := r.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
|
||||
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
|
||||
done, err := r.trafficRoutingManager.RouteAllTrafficToCanaryORStable(tr, v1beta1.FinaliseReasonContinuous)
|
||||
if err != nil || !done {
|
||||
c.NewStatus.GetSubStatus().LastUpdateTime = tr.LastUpdateTime
|
||||
return done, err
|
||||
}
|
||||
}
|
||||
done, err := r.canaryManager.removeBatchRelease(c)
|
||||
|
||||
releaseManager, err := r.getReleaseManager(c.Rollout)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
done, err := releaseManager.removeBatchRelease(c)
|
||||
if err != nil {
|
||||
klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
|
||||
return false, err
|
||||
} else if !done {
|
||||
return false, nil
|
||||
}
|
||||
// restore Service and network api
|
||||
if c.Rollout.Spec.Strategy.HasTrafficRoutings() {
|
||||
tr := newTrafficRoutingContext(c)
|
||||
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(tr)
|
||||
c.NewStatus.GetSubStatus().LastUpdateTime = tr.LastUpdateTime
|
||||
if err != nil || !done {
|
||||
return done, err
|
||||
}
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -242,7 +242,9 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1}
|
||||
},
|
||||
getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()}
|
||||
s1 := demoService.DeepCopy()
|
||||
s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1"
|
||||
return []*corev1.Service{s1}, []*netv1.Ingress{demoIngress.DeepCopy()}
|
||||
},
|
||||
getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) {
|
||||
obj := rolloutDemo.DeepCopy()
|
||||
|
@ -277,6 +279,11 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
s.CanaryStatus.CurrentStepIndex = 4
|
||||
s.CanaryStatus.NextStepIndex = 0
|
||||
// the first finalizing step is restorStableService, given that the stable service has
|
||||
// selector, and removing it will take a grace time (i.e. take at least 2 reconciles),
|
||||
// therefore, after calling ReconcileRolloutProgressing once, the finalizing step must be
|
||||
// restorStableService
|
||||
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
|
@ -291,6 +298,140 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
},
|
||||
{
|
||||
name: "ReconcileRolloutProgressing finalizing2",
|
||||
getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) {
|
||||
dep1 := deploymentDemo.DeepCopy()
|
||||
delete(dep1.Annotations, util.InRolloutProgressingAnnotation)
|
||||
dep1.Status = apps.DeploymentStatus{
|
||||
ObservedGeneration: 2,
|
||||
Replicas: 10,
|
||||
UpdatedReplicas: 5,
|
||||
ReadyReplicas: 10,
|
||||
AvailableReplicas: 10,
|
||||
}
|
||||
rs1 := rsDemo.DeepCopy()
|
||||
return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1}
|
||||
},
|
||||
getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()}
|
||||
},
|
||||
getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) {
|
||||
obj := rolloutDemo.DeepCopy()
|
||||
obj.Annotations[v1alpha1.TrafficRoutingAnnotation] = "tr-demo"
|
||||
obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2
|
||||
obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd"
|
||||
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
// given that the stable service has no selector, and removing it will immediately return
|
||||
// ture. Therefore, we expect the finalizing step to be next step, i.e. restoreGateway
|
||||
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService
|
||||
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
cond.Status = corev1.ConditionTrue
|
||||
util.SetRolloutCondition(&obj.Status, *cond)
|
||||
br := batchDemo.DeepCopy()
|
||||
br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{
|
||||
{
|
||||
CanaryReplicas: intstr.FromInt(1),
|
||||
},
|
||||
}
|
||||
tr := demoTR.DeepCopy()
|
||||
tr.Finalizers = []string{util.ProgressingRolloutFinalizer(rolloutDemo.Name)}
|
||||
return obj, br, tr
|
||||
},
|
||||
expectStatus: func() *v1beta1.RolloutStatus {
|
||||
s := rolloutDemo.Status.DeepCopy()
|
||||
s.CanaryStatus.ObservedWorkloadGeneration = 2
|
||||
s.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd"
|
||||
s.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
s.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
s.CanaryStatus.CurrentStepIndex = 4
|
||||
s.CanaryStatus.NextStepIndex = 0
|
||||
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
cond.Status = corev1.ConditionTrue
|
||||
util.SetRolloutCondition(s, *cond)
|
||||
return s
|
||||
},
|
||||
expectTr: func() *v1alpha1.TrafficRouting {
|
||||
tr := demoTR.DeepCopy()
|
||||
return tr
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "ReconcileRolloutProgressing finalizing3",
|
||||
getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) {
|
||||
dep1 := deploymentDemo.DeepCopy()
|
||||
delete(dep1.Annotations, util.InRolloutProgressingAnnotation)
|
||||
dep1.Status = apps.DeploymentStatus{
|
||||
ObservedGeneration: 2,
|
||||
Replicas: 10,
|
||||
UpdatedReplicas: 5,
|
||||
ReadyReplicas: 10,
|
||||
AvailableReplicas: 10,
|
||||
}
|
||||
rs1 := rsDemo.DeepCopy()
|
||||
return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1}
|
||||
},
|
||||
getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()}
|
||||
},
|
||||
getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) {
|
||||
obj := rolloutDemo.DeepCopy()
|
||||
obj.Annotations[v1alpha1.TrafficRoutingAnnotation] = "tr-demo"
|
||||
obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2
|
||||
obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd"
|
||||
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
// restoring gateway will return true immediately
|
||||
// Rollout will go on to next step, i.e. deleteCanaryService
|
||||
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway
|
||||
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
cond.Status = corev1.ConditionTrue
|
||||
util.SetRolloutCondition(&obj.Status, *cond)
|
||||
br := batchDemo.DeepCopy()
|
||||
br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{
|
||||
{
|
||||
CanaryReplicas: intstr.FromInt(1),
|
||||
},
|
||||
}
|
||||
tr := demoTR.DeepCopy()
|
||||
tr.Finalizers = []string{util.ProgressingRolloutFinalizer(rolloutDemo.Name)}
|
||||
return obj, br, tr
|
||||
},
|
||||
expectStatus: func() *v1beta1.RolloutStatus {
|
||||
s := rolloutDemo.Status.DeepCopy()
|
||||
s.CanaryStatus.ObservedWorkloadGeneration = 2
|
||||
s.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd"
|
||||
s.CanaryStatus.StableRevision = "pod-template-hash-v1"
|
||||
s.CanaryStatus.CanaryRevision = "6f8cc56547"
|
||||
s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
s.CanaryStatus.CurrentStepIndex = 4
|
||||
s.CanaryStatus.NextStepIndex = 0
|
||||
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeCanaryService
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
cond.Status = corev1.ConditionTrue
|
||||
util.SetRolloutCondition(s, *cond)
|
||||
return s
|
||||
},
|
||||
expectTr: func() *v1alpha1.TrafficRouting {
|
||||
tr := demoTR.DeepCopy()
|
||||
return tr
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "ReconcileRolloutProgressing finalizing4",
|
||||
getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) {
|
||||
dep1 := deploymentDemo.DeepCopy()
|
||||
delete(dep1.Annotations, util.InRolloutProgressingAnnotation)
|
||||
|
@ -316,6 +457,9 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
// current finalizing step is patchBatchRelease, since the batch release has been "patched"
|
||||
// we expect the finalizing step to be next step, i.e. deleteBatchRelease
|
||||
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease
|
||||
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
cond.Status = corev1.ConditionTrue
|
||||
|
@ -338,6 +482,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
s.CanaryStatus.CurrentStepIndex = 4
|
||||
s.CanaryStatus.NextStepIndex = 0
|
||||
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond2.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
|
@ -373,6 +518,9 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
// deleteBatchRelease is the last step, and it won't wait a grace time
|
||||
// after this step, this release should be succeeded
|
||||
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
|
||||
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
|
||||
cond.Reason = v1alpha1.ProgressingReasonFinalising
|
||||
cond.Status = corev1.ConditionTrue
|
||||
|
@ -389,6 +537,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
|
|||
s.CanaryStatus.CurrentStepIndex = 4
|
||||
s.CanaryStatus.NextStepIndex = 0
|
||||
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
|
||||
cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
|
||||
cond2.Reason = v1alpha1.ProgressingReasonCompleted
|
||||
cond2.Status = corev1.ConditionFalse
|
||||
|
|
|
@ -123,13 +123,13 @@ func (r *TrafficRoutingReconciler) Reconcile(ctx context.Context, req ctrl.Reque
|
|||
done, err = r.trafficRoutingManager.DoTrafficRouting(newTrafficRoutingContext(tr))
|
||||
}
|
||||
case v1alpha1.TrafficRoutingPhaseFinalizing:
|
||||
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false)
|
||||
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr))
|
||||
if done {
|
||||
newStatus.Phase = v1alpha1.TrafficRoutingPhaseHealthy
|
||||
newStatus.Message = "TrafficRouting is Healthy"
|
||||
}
|
||||
case v1alpha1.TrafficRoutingPhaseTerminating:
|
||||
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false)
|
||||
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr))
|
||||
if done {
|
||||
// remove trafficRouting finalizer
|
||||
err = r.handleFinalizer(tr)
|
||||
|
|
|
@ -126,7 +126,7 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
|
|||
if c.LastUpdateTime != nil {
|
||||
// wait seconds for network providers to consume the modification about workload, service and so on.
|
||||
if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) {
|
||||
klog.Infof("%s update workload or service selector, and wait 3 seconds", c.Key)
|
||||
klog.Infof("%s update workload or service selector, and wait %d seconds", c.Key, trafficRouting.GracePeriodSeconds)
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
@ -138,6 +138,15 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
|
|||
klog.Warningf("%s stableRevision or podTemplateHash can not be empty, and wait a moment", c.Key)
|
||||
return false, nil
|
||||
}
|
||||
/*
|
||||
Why is the serviceModified flag moved here?
|
||||
The rationale behind this is that when we create a canary Service, it is already instantiated with the appropriate selector.
|
||||
If the stable Service also has had selector patched previously, the logic will proceed to the EnsureRoutes function uninterrupted.
|
||||
It's important to note the following: Creating a new Service and updating the gateway resource occurs within a single reconciliation loop;
|
||||
this can lead to instability.
|
||||
Therefore, by moving the serviceModified flag, we introduce a grace period between these two operations to ensure stability.
|
||||
*/
|
||||
serviceModified := false
|
||||
// fetch canary service
|
||||
err = m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: canaryServiceName}, canaryService)
|
||||
if err != nil && !errors.IsNotFound(err) {
|
||||
|
@ -148,9 +157,9 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
|
|||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
serviceModified = true
|
||||
}
|
||||
|
||||
serviceModified := false
|
||||
// patch canary service to only select the canary pods
|
||||
if canaryService.Spec.Selector[c.RevisionLabelKey] != c.CanaryRevision {
|
||||
body := fmt.Sprintf(`{"spec":{"selector":{"%s":"%s"}}}`, c.RevisionLabelKey, c.CanaryRevision)
|
||||
|
@ -180,6 +189,13 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
|
|||
if serviceModified {
|
||||
return false, nil
|
||||
}
|
||||
} else {
|
||||
// for end-to-end canary deployment scenario and scenario when DisableGenerateCanaryService is on
|
||||
// selector is not needed, we should remove it
|
||||
verify, err := m.restoreStableService(c)
|
||||
if err != nil || !verify {
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
|
||||
// new network provider, ingress or gateway
|
||||
|
@ -199,7 +215,7 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
|
|||
return true, nil
|
||||
}
|
||||
|
||||
func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext, onlyRestoreStableService bool) (bool, error) {
|
||||
func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext) (bool, error) {
|
||||
if len(c.ObjectRef) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
|
@ -235,31 +251,16 @@ func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext, onlyRestore
|
|||
verify, err := m.restoreStableService(c)
|
||||
if err != nil || !verify {
|
||||
return false, err
|
||||
} else if onlyRestoreStableService {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// First route 100% traffic to stable service
|
||||
c.Strategy.Traffic = utilpointer.StringPtr("0%")
|
||||
verify, err = trController.EnsureRoutes(context.TODO(), &c.Strategy)
|
||||
if err != nil {
|
||||
return false, err
|
||||
} else if !verify {
|
||||
c.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
return false, nil
|
||||
}
|
||||
if c.LastUpdateTime != nil {
|
||||
// After restore the stable service configuration, give network provider 3 seconds to react
|
||||
if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) {
|
||||
klog.Infof("%s route 100% traffic to stable service, and wait a moment", c.Key)
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// modify network(ingress & gateway api) configuration, route all traffic to stable service
|
||||
if err = trController.Finalise(context.TODO()); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
//TODO - we should wait grace time between Finalish and removal of canary service
|
||||
// to avoid a very rare case which could cause minor traffic loss (espically, Istio)
|
||||
// However, it seems difficult to implement this unless re-write Finalise function:
|
||||
|
||||
// end to end deployment, don't remove the canary service;
|
||||
// because canary service is stable service
|
||||
if !c.OnlyTrafficRouting && !c.DisableGenerateCanaryService {
|
||||
|
@ -274,6 +275,145 @@ func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext, onlyRestore
|
|||
return true, nil
|
||||
}
|
||||
|
||||
// Route All Traffic To Canary OR Stable based on the FinalizeReason
|
||||
func (m *Manager) RouteAllTrafficToCanaryORStable(c *TrafficRoutingContext, FinalizeReason string) (bool, error) {
|
||||
if len(c.ObjectRef) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
if c.OnlyTrafficRouting {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
klog.Infof("%s RouteAllTrafficTo with Reason %s", c.Key, FinalizeReason)
|
||||
trafficToCanary := ""
|
||||
switch FinalizeReason {
|
||||
case v1beta1.FinaliseReasonRollback, v1beta1.FinaliseReasonContinuous:
|
||||
trafficToCanary = "0%"
|
||||
case v1beta1.FinaliseReasonSuccess:
|
||||
trafficToCanary = "100%"
|
||||
default:
|
||||
return true, nil
|
||||
}
|
||||
|
||||
trafficRouting := c.ObjectRef[0]
|
||||
if trafficRouting.GracePeriodSeconds <= 0 {
|
||||
trafficRouting.GracePeriodSeconds = defaultGracePeriodSeconds
|
||||
}
|
||||
|
||||
cServiceName := getCanaryServiceName(trafficRouting.Service, c.OnlyTrafficRouting, c.DisableGenerateCanaryService)
|
||||
trController, err := newNetworkProvider(m.Client, c, trafficRouting.Service, cServiceName)
|
||||
if err != nil {
|
||||
klog.Errorf("%s newTrafficRoutingController failed: %s", c.Key, err.Error())
|
||||
return false, err
|
||||
}
|
||||
|
||||
//fetch stable service
|
||||
stableService := &corev1.Service{}
|
||||
err = m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: trafficRouting.Service}, stableService)
|
||||
if err != nil {
|
||||
if errors.IsNotFound(err) {
|
||||
return true, nil
|
||||
}
|
||||
klog.Errorf("%s get stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error())
|
||||
return false, err
|
||||
}
|
||||
if stableService.Spec.Selector[c.RevisionLabelKey] == "" && !c.DisableGenerateCanaryService {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Route 100% traffic to service that won't be scaled down
|
||||
c.Strategy.Traffic = utilpointer.StringPtr(trafficToCanary)
|
||||
verify, err := trController.EnsureRoutes(context.TODO(), &c.Strategy)
|
||||
if err != nil {
|
||||
return false, err
|
||||
} else if !verify {
|
||||
c.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
klog.Infof("%s updated %s traffic, wait a grace time", c.Key, trafficToCanary)
|
||||
return false, nil
|
||||
}
|
||||
if c.LastUpdateTime != nil {
|
||||
// After restore the stable service configuration, give network provider seconds to react
|
||||
if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) {
|
||||
klog.Infof("%s routed %s traffic to canary pods, but you need wait %d seconds", c.Key, trafficToCanary, trafficRouting.GracePeriodSeconds)
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// if all traffic is routed canary, then it is ok to restore stable Service
|
||||
// it is mainly to avoid affect of bug in ingress-nginx controller
|
||||
if trafficToCanary == "100%" {
|
||||
verify, err := m.restoreStableService(c)
|
||||
if err != nil || !verify {
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (m *Manager) PatchStableService(c *TrafficRoutingContext) (bool, error) {
|
||||
if len(c.ObjectRef) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
trafficRouting := c.ObjectRef[0]
|
||||
if trafficRouting.GracePeriodSeconds <= 0 {
|
||||
trafficRouting.GracePeriodSeconds = defaultGracePeriodSeconds
|
||||
}
|
||||
if c.OnlyTrafficRouting {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
//fetch stable service
|
||||
stableService := &corev1.Service{}
|
||||
err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: trafficRouting.Service}, stableService)
|
||||
if err != nil {
|
||||
klog.Errorf("%s get stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error())
|
||||
// not found, wait a moment, retry
|
||||
if errors.IsNotFound(err) {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
if stableService.Spec.Selector[c.RevisionLabelKey] == c.StableRevision {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// patch stable service to only select the stable pods
|
||||
body := fmt.Sprintf(`{"spec":{"selector":{"%s":"%s"}}}`, c.RevisionLabelKey, c.StableRevision)
|
||||
if err = m.Patch(context.TODO(), stableService, client.RawPatch(types.StrategicMergePatchType, []byte(body))); err != nil {
|
||||
klog.Errorf("%s patch stable service(%s) selector failed: %s", c.Key, stableService.Name, err.Error())
|
||||
return false, err
|
||||
}
|
||||
klog.Infof("%s do something special: add stable service(%s) selector(%s=%s) success", c.Key, stableService.Name, c.RevisionLabelKey, c.StableRevision)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (m *Manager) RestoreStableService(c *TrafficRoutingContext) (bool, error) {
|
||||
if len(c.ObjectRef) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
trafficRouting := c.ObjectRef[0]
|
||||
if trafficRouting.GracePeriodSeconds <= 0 {
|
||||
trafficRouting.GracePeriodSeconds = defaultGracePeriodSeconds
|
||||
}
|
||||
//fetch stable service
|
||||
stableService := &corev1.Service{}
|
||||
err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: trafficRouting.Service}, stableService)
|
||||
if err != nil {
|
||||
if errors.IsNotFound(err) {
|
||||
return true, nil
|
||||
}
|
||||
klog.Errorf("%s get stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error())
|
||||
return false, err
|
||||
}
|
||||
// restore stable Service
|
||||
verify, err := m.restoreStableService(c)
|
||||
if err != nil || !verify {
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func newNetworkProvider(c client.Client, con *TrafficRoutingContext, sService, cService string) (network.NetworkProvider, error) {
|
||||
trafficRouting := con.ObjectRef[0]
|
||||
if trafficRouting.CustomNetworkRefs != nil {
|
||||
|
@ -364,7 +504,7 @@ func (m *Manager) restoreStableService(c *TrafficRoutingContext) (bool, error) {
|
|||
klog.Errorf("%s patch stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error())
|
||||
return false, err
|
||||
}
|
||||
klog.Infof("remove %s stable service(%s) pod revision selector, and wait a moment", c.Key, trafficRouting.Service)
|
||||
klog.Infof("removed %s stable service(%s) pod revision selector, and wait %d seconds (log once)", c.Key, trafficRouting.Service, trafficRouting.GracePeriodSeconds)
|
||||
c.LastUpdateTime = &metav1.Time{Time: time.Now()}
|
||||
return false, nil
|
||||
}
|
||||
|
@ -373,7 +513,7 @@ func (m *Manager) restoreStableService(c *TrafficRoutingContext) (bool, error) {
|
|||
}
|
||||
// After restore the stable service configuration, give network provider 3 seconds to react
|
||||
if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) {
|
||||
klog.Infof("%s restoring stable service(%s), and wait a moment", c.Key, trafficRouting.Service)
|
||||
klog.Infof("%s restoring stable service(%s), and wait %d seconds", c.Key, trafficRouting.Service, trafficRouting.GracePeriodSeconds)
|
||||
return false, nil
|
||||
}
|
||||
klog.Infof("%s doFinalising stable service(%s) success", c.Key, trafficRouting.Service)
|
||||
|
|
|
@ -774,6 +774,7 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) {
|
|||
getRollout: func() (*v1beta1.Rollout, *util.Workload) {
|
||||
obj := demoIstioRollout.DeepCopy()
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)}
|
||||
obj.Spec.Strategy.Canary.TrafficRoutings[0].GracePeriodSeconds = 1
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
expectUnstructureds: func() []*unstructured.Unstructured {
|
||||
|
@ -803,7 +804,6 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) {
|
|||
objects = append(objects, u)
|
||||
return objects
|
||||
},
|
||||
// Rollout(/rollout-demo) is doing trafficRouting({"traffic":"5%"}), and wait a moment
|
||||
expectDone: true,
|
||||
},
|
||||
{
|
||||
|
@ -833,6 +833,7 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) {
|
|||
obj := demoIstioRollout.DeepCopy()
|
||||
// set DisableGenerateCanaryService as true
|
||||
obj.Spec.Strategy.Canary.DisableGenerateCanaryService = true
|
||||
obj.Spec.Strategy.Canary.TrafficRoutings[0].GracePeriodSeconds = 1
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)}
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
|
@ -863,7 +864,6 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) {
|
|||
objects = append(objects, u)
|
||||
return objects
|
||||
},
|
||||
// Rollout(/rollout-demo) is doing trafficRouting({"traffic":"5%"}), and wait a moment
|
||||
expectDone: true,
|
||||
},
|
||||
}
|
||||
|
@ -897,11 +897,22 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) {
|
|||
if err != nil {
|
||||
t.Fatalf("InitializeTrafficRouting failed: %s", err)
|
||||
}
|
||||
// now we need to wait at least 2x grace time to keep traffic stable:
|
||||
// create the canary service -> grace time -> update the gateway -> grace time
|
||||
// therefore, before both grace times are over, DoTrafficRouting should return false
|
||||
// firstly, create the canary Service, before the grace time over, return false
|
||||
_, err = manager.DoTrafficRouting(c)
|
||||
if err != nil {
|
||||
t.Fatalf("DoTrafficRouting failed: %s", err)
|
||||
}
|
||||
// may return false due to in the course of doing trafficRouting, let's do it again
|
||||
time.Sleep(1 * time.Second)
|
||||
// secondly, update the gateway, before the grace time over, return false
|
||||
_, err = manager.DoTrafficRouting(c)
|
||||
if err != nil {
|
||||
t.Fatalf("DoTrafficRouting failed: %s", err)
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
// now, both grace times are over, it should be true
|
||||
done, err := manager.DoTrafficRouting(c)
|
||||
if err != nil {
|
||||
t.Fatalf("DoTrafficRouting failed: %s", err)
|
||||
|
@ -920,12 +931,12 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) {
|
|||
|
||||
func TestFinalisingTrafficRouting(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
getObj func() ([]*corev1.Service, []*netv1.Ingress)
|
||||
getRollout func() (*v1beta1.Rollout, *util.Workload)
|
||||
onlyRestoreStableService bool
|
||||
expectObj func() ([]*corev1.Service, []*netv1.Ingress)
|
||||
expectDone bool
|
||||
name string
|
||||
getObj func() ([]*corev1.Service, []*netv1.Ingress)
|
||||
getRollout func() (*v1beta1.Rollout, *util.Workload)
|
||||
onlyTrafficRouting bool
|
||||
expectObj func() ([]*corev1.Service, []*netv1.Ingress)
|
||||
expectDone bool
|
||||
}{
|
||||
{
|
||||
name: "FinalisingTrafficRouting test1",
|
||||
|
@ -950,7 +961,6 @@ func TestFinalisingTrafficRouting(t *testing.T) {
|
|||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)}
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
onlyRestoreStableService: true,
|
||||
expectObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
|
@ -964,6 +974,8 @@ func TestFinalisingTrafficRouting(t *testing.T) {
|
|||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
},
|
||||
// the Serive has selector on it, which takes a grace time to remove
|
||||
// that's why expectDone is false
|
||||
expectDone: false,
|
||||
},
|
||||
{
|
||||
|
@ -985,123 +997,60 @@ func TestFinalisingTrafficRouting(t *testing.T) {
|
|||
obj := demoRollout.DeepCopy()
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)}
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
onlyRestoreStableService: true,
|
||||
expectObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
s2.Name = "echoserver-canary"
|
||||
s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2"
|
||||
c1 := demoIngress.DeepCopy()
|
||||
c2 := demoIngress.DeepCopy()
|
||||
c2.Name = "echoserver-canary"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100"
|
||||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
return []*corev1.Service{s1}, []*netv1.Ingress{c1}
|
||||
},
|
||||
expectDone: false,
|
||||
// the Serive has selector removed, therefore code will go on and return ture
|
||||
expectDone: true,
|
||||
},
|
||||
{
|
||||
name: "FinalisingTrafficRouting test3",
|
||||
name: "canary Service already clear",
|
||||
getObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
s2.Name = "echoserver-canary"
|
||||
s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2"
|
||||
s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1"
|
||||
c1 := demoIngress.DeepCopy()
|
||||
c2 := demoIngress.DeepCopy()
|
||||
c2.Name = "echoserver-canary"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100"
|
||||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
return []*corev1.Service{s1}, []*netv1.Ingress{c1}
|
||||
},
|
||||
getRollout: func() (*v1beta1.Rollout, *util.Workload) {
|
||||
obj := demoRollout.DeepCopy()
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)}
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)}
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
onlyRestoreStableService: true,
|
||||
expectObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
s2.Name = "echoserver-canary"
|
||||
s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2"
|
||||
s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1"
|
||||
c1 := demoIngress.DeepCopy()
|
||||
c2 := demoIngress.DeepCopy()
|
||||
c2.Name = "echoserver-canary"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100"
|
||||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
return []*corev1.Service{s1}, []*netv1.Ingress{c1}
|
||||
},
|
||||
expectDone: true,
|
||||
},
|
||||
{
|
||||
name: "FinalisingTrafficRouting test4",
|
||||
name: "OnlyTrafficRouting true",
|
||||
getObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
s2.Name = "echoserver-canary"
|
||||
s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2"
|
||||
c1 := demoIngress.DeepCopy()
|
||||
c2 := demoIngress.DeepCopy()
|
||||
c2.Name = "echoserver-canary"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100"
|
||||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
return []*corev1.Service{s1}, []*netv1.Ingress{c1, c2}
|
||||
},
|
||||
getRollout: func() (*v1beta1.Rollout, *util.Workload) {
|
||||
obj := demoRollout.DeepCopy()
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-3 * time.Second)}
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)}
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
onlyRestoreStableService: false,
|
||||
expectObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
s2.Name = "echoserver-canary"
|
||||
s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2"
|
||||
c1 := demoIngress.DeepCopy()
|
||||
c2 := demoIngress.DeepCopy()
|
||||
c2.Name = "echoserver-canary"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "0"
|
||||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
},
|
||||
expectDone: false,
|
||||
},
|
||||
{
|
||||
name: "FinalisingTrafficRouting test5",
|
||||
getObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
s2 := demoService.DeepCopy()
|
||||
s2.Name = "echoserver-canary"
|
||||
s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2"
|
||||
c1 := demoIngress.DeepCopy()
|
||||
c2 := demoIngress.DeepCopy()
|
||||
c2.Name = "echoserver-canary"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true"
|
||||
c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "0"
|
||||
c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary"
|
||||
return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2}
|
||||
},
|
||||
getRollout: func() (*v1beta1.Rollout, *util.Workload) {
|
||||
obj := demoRollout.DeepCopy()
|
||||
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
|
||||
obj.Status.CanaryStatus.CurrentStepIndex = 4
|
||||
obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-3 * time.Second)}
|
||||
return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey}
|
||||
},
|
||||
onlyRestoreStableService: false,
|
||||
onlyTrafficRouting: true,
|
||||
expectObj: func() ([]*corev1.Service, []*netv1.Ingress) {
|
||||
s1 := demoService.DeepCopy()
|
||||
c1 := demoIngress.DeepCopy()
|
||||
|
@ -1125,18 +1074,19 @@ func TestFinalisingTrafficRouting(t *testing.T) {
|
|||
newStatus := rollout.Status.DeepCopy()
|
||||
currentStep := rollout.Spec.Strategy.Canary.Steps[newStatus.CanaryStatus.CurrentStepIndex-1]
|
||||
c := &TrafficRoutingContext{
|
||||
Key: fmt.Sprintf("Rollout(%s/%s)", rollout.Namespace, rollout.Name),
|
||||
Namespace: rollout.Namespace,
|
||||
ObjectRef: rollout.Spec.Strategy.Canary.TrafficRoutings,
|
||||
Strategy: currentStep.TrafficRoutingStrategy,
|
||||
OwnerRef: *metav1.NewControllerRef(rollout, v1beta1.SchemeGroupVersion.WithKind("Rollout")),
|
||||
RevisionLabelKey: workload.RevisionLabelKey,
|
||||
StableRevision: newStatus.CanaryStatus.StableRevision,
|
||||
CanaryRevision: newStatus.CanaryStatus.PodTemplateHash,
|
||||
LastUpdateTime: newStatus.CanaryStatus.LastUpdateTime,
|
||||
Key: fmt.Sprintf("Rollout(%s/%s)", rollout.Namespace, rollout.Name),
|
||||
Namespace: rollout.Namespace,
|
||||
ObjectRef: rollout.Spec.Strategy.Canary.TrafficRoutings,
|
||||
Strategy: currentStep.TrafficRoutingStrategy,
|
||||
OwnerRef: *metav1.NewControllerRef(rollout, v1beta1.SchemeGroupVersion.WithKind("Rollout")),
|
||||
RevisionLabelKey: workload.RevisionLabelKey,
|
||||
StableRevision: newStatus.CanaryStatus.StableRevision,
|
||||
CanaryRevision: newStatus.CanaryStatus.PodTemplateHash,
|
||||
LastUpdateTime: newStatus.CanaryStatus.LastUpdateTime,
|
||||
OnlyTrafficRouting: cs.onlyTrafficRouting,
|
||||
}
|
||||
manager := NewTrafficRoutingManager(client)
|
||||
done, err := manager.FinalisingTrafficRouting(c, cs.onlyRestoreStableService)
|
||||
done, err := manager.FinalisingTrafficRouting(c)
|
||||
if err != nil {
|
||||
t.Fatalf("DoTrafficRouting failed: %s", err)
|
||||
}
|
||||
|
|
|
@ -112,9 +112,9 @@ func (r *customController) EnsureRoutes(ctx context.Context, strategy *v1beta1.T
|
|||
done := true
|
||||
// *strategy.Weight == 0 indicates traffic routing is doing finalising and tries to route whole traffic to stable service
|
||||
// then directly do finalising
|
||||
if strategy.Traffic != nil && *strategy.Traffic == "0%" {
|
||||
return true, nil
|
||||
}
|
||||
// if strategy.Traffic != nil && *strategy.Traffic == "0%" {
|
||||
// return true, nil
|
||||
// }
|
||||
var err error
|
||||
customNetworkRefList := make([]*unstructured.Unstructured, len(r.conf.TrafficConf))
|
||||
|
||||
|
|
Loading…
Reference in New Issue