Merge pull request #4729 from chaosi-zju/hpav5

fix deployment replicas syncer in case that `status.replicas` haven't been collected from member cluster to template
This commit is contained in:
karmada-bot 2024-03-27 18:38:38 +08:00 committed by GitHub
commit ff7322acf2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 83 additions and 31 deletions

View File

@ -18,7 +18,8 @@ package deploymentreplicassyncer
import ( import (
"context" "context"
"fmt" "encoding/json"
"time"
appsv1 "k8s.io/api/apps/v1" appsv1 "k8s.io/api/apps/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors" apierrors "k8s.io/apimachinery/pkg/api/errors"
@ -38,6 +39,8 @@ import (
const ( const (
// ControllerName is the controller name that will be used when reporting events. // ControllerName is the controller name that will be used when reporting events.
ControllerName = "deployment-replicas-syncer" ControllerName = "deployment-replicas-syncer"
waitDeploymentStatusInterval = 1 * time.Second
) )
// DeploymentReplicasSyncer is to sync deployment replicas from status field to spec field. // DeploymentReplicasSyncer is to sync deployment replicas from status field to spec field.
@ -98,6 +101,7 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
binding := &workv1alpha2.ResourceBinding{} binding := &workv1alpha2.ResourceBinding{}
bindingName := names.GenerateBindingName(util.DeploymentKind, req.Name) bindingName := names.GenerateBindingName(util.DeploymentKind, req.Name)
// 1. get latest binding
if err := r.Client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: bindingName}, binding); err != nil { if err := r.Client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: bindingName}, binding); err != nil {
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
klog.Infof("no need to update deployment replicas for binding not found") klog.Infof("no need to update deployment replicas for binding not found")
@ -106,11 +110,12 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
return controllerruntime.Result{}, err return controllerruntime.Result{}, err
} }
// if it is not divided schedule type, no need to update replicas // 2. if it is not divided schedule type, no need to update replicas
if binding.Spec.Placement.ReplicaSchedulingType() != policyv1alpha1.ReplicaSchedulingTypeDivided { if binding.Spec.Placement.ReplicaSchedulingType() != policyv1alpha1.ReplicaSchedulingTypeDivided {
return controllerruntime.Result{}, nil return controllerruntime.Result{}, nil
} }
// 3. get latest deployment
if err := r.Client.Get(ctx, req.NamespacedName, deployment); err != nil { if err := r.Client.Get(ctx, req.NamespacedName, deployment); err != nil {
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
klog.Infof("no need to update deployment replicas for deployment not found") klog.Infof("no need to update deployment replicas for deployment not found")
@ -119,39 +124,18 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
return controllerruntime.Result{}, err return controllerruntime.Result{}, err
} }
// if replicas in spec already the same as in status, no need to update replicas // 4. if replicas in spec already the same as in status, no need to update replicas
if deployment.Spec.Replicas != nil && *deployment.Spec.Replicas == deployment.Status.Replicas { if deployment.Spec.Replicas != nil && *deployment.Spec.Replicas == deployment.Status.Replicas {
klog.Infof("replicas in spec field (%d) already equal to in status field (%d)", *deployment.Spec.Replicas, deployment.Status.Replicas) klog.Infof("replicas in spec field (%d) already equal to in status field (%d)", *deployment.Spec.Replicas, deployment.Status.Replicas)
return controllerruntime.Result{}, nil return controllerruntime.Result{}, nil
} }
// make sure the replicas change in deployment.spec can sync to binding.spec, otherwise retry // 5. judge the deployment modification in spec has taken effect and its status has been collected, otherwise retry.
if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != binding.Spec.Replicas { if !isDeploymentStatusCollected(deployment, binding) {
klog.V(4).Infof("wait until replicas of binding (%d) equal to replicas of deployment (%d)", return controllerruntime.Result{RequeueAfter: waitDeploymentStatusInterval}, nil
binding.Spec.Replicas, *deployment.Spec.Replicas)
return controllerruntime.Result{}, fmt.Errorf("retry to wait replicas change sync to binding")
} }
// make sure the scheduler observed generation equal to generation in binding, otherwise retry // 6. update spec replicas with status replicas
if binding.Generation != binding.Status.SchedulerObservedGeneration {
klog.V(4).Infof("wait until scheduler observed generation (%d) equal to generation in binding (%d)",
binding.Status.SchedulerObservedGeneration, binding.Generation)
return controllerruntime.Result{}, fmt.Errorf("retry to wait scheduler observed generation")
}
if len(binding.Status.AggregatedStatus) != len(binding.Spec.Clusters) {
klog.V(4).Infof("wait until all clusters status collected, got: %d, expected: %d",
len(binding.Status.AggregatedStatus), len(binding.Spec.Clusters))
return controllerruntime.Result{}, fmt.Errorf("retry to wait status in binding collected")
}
for _, status := range binding.Status.AggregatedStatus {
if status.Status == nil {
klog.V(4).Infof("wait until aggregated status of cluster %s collected", status.ClusterName)
return controllerruntime.Result{}, fmt.Errorf("retry to wait status in binding collected")
}
}
// update replicas
oldReplicas := *deployment.Spec.Replicas oldReplicas := *deployment.Spec.Replicas
deployment.Spec.Replicas = &deployment.Status.Replicas deployment.Spec.Replicas = &deployment.Status.Replicas
if err := r.Client.Update(ctx, deployment); err != nil { if err := r.Client.Update(ctx, deployment); err != nil {
@ -159,8 +143,64 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
return controllerruntime.Result{}, err return controllerruntime.Result{}, err
} }
klog.Infof("successfully udpate deployment (%s/%s) replicas from %d to %d", deployment.Namespace, klog.Infof("successfully update deployment (%s/%s) replicas from %d to %d", deployment.Namespace,
deployment.Namespace, oldReplicas, deployment.Status.Replicas) deployment.Name, oldReplicas, deployment.Status.Replicas)
return controllerruntime.Result{}, nil return controllerruntime.Result{}, nil
} }
// isDeploymentStatusCollected judge whether deployment modification in spec has taken effect and its status has been collected.
func isDeploymentStatusCollected(deployment *appsv1.Deployment, binding *workv1alpha2.ResourceBinding) bool {
// make sure the replicas change in deployment.spec can sync to binding.spec, otherwise retry
if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != binding.Spec.Replicas {
klog.V(4).Infof("wait until replicas of binding (%d) equal to replicas of deployment (%d)",
binding.Spec.Replicas, *deployment.Spec.Replicas)
return false
}
// make sure the scheduler observed generation equal to generation in binding, otherwise retry
if binding.Generation != binding.Status.SchedulerObservedGeneration {
klog.V(4).Infof("wait until scheduler observed generation (%d) equal to generation in binding (%d)",
binding.Status.SchedulerObservedGeneration, binding.Generation)
return false
}
// make sure the number of aggregated status in binding is as expected.
if len(binding.Status.AggregatedStatus) != len(binding.Spec.Clusters) {
klog.V(4).Infof("wait until all clusters status collected, got: %d, expected: %d",
len(binding.Status.AggregatedStatus), len(binding.Spec.Clusters))
return false
}
// make sure the status.replicas in binding has been collected from work to binding.
bindingStatusSumReplicas := int32(0)
for _, status := range binding.Status.AggregatedStatus {
if status.Status == nil {
klog.V(4).Infof("wait until aggregated status of cluster %s collected", status.ClusterName)
return false
}
itemStatus := &appsv1.DeploymentStatus{}
if err := json.Unmarshal(status.Status.Raw, itemStatus); err != nil {
klog.Errorf("unmarshal status.raw of cluster %s in binding failed: %+v", status.ClusterName, err)
return false
}
// if member cluster deployment is controlled by hpa, its status.replicas must > 0 (since hpa.minReplicas > 0),
// so, if its status replicas is 0, means the status haven't been collected from member cluster, and needs waiting.
if itemStatus.Replicas <= 0 {
klog.V(4).Infof("wait until aggregated status replicas of cluster %s collected", status.ClusterName)
return false
}
bindingStatusSumReplicas += itemStatus.Replicas
}
// make sure the latest collected status.replicas is synced from binding to template.
// e.g: user set deployment.spec.replicas = 2, then sum replicas in binding.status becomes 2, however, if now
// deployment.status.replicas is still 1, we shouldn't update spec.replicas to 1 until status.replicas becomes 2.
if deployment.Status.Replicas != bindingStatusSumReplicas {
klog.V(4).Infof("wait until deployment status replicas (%d) equal to binding status replicas (%d)",
deployment.Status.Replicas, bindingStatusSumReplicas)
return false
}
return true
}

View File

@ -17,6 +17,7 @@ limitations under the License.
package native package native
import ( import (
"bytes"
"fmt" "fmt"
appsv1 "k8s.io/api/apps/v1" appsv1 "k8s.io/api/apps/v1"
@ -74,7 +75,18 @@ func reflectDeploymentStatus(object *unstructured.Unstructured) (*runtime.RawExt
AvailableReplicas: deploymentStatus.AvailableReplicas, AvailableReplicas: deploymentStatus.AvailableReplicas,
UnavailableReplicas: deploymentStatus.UnavailableReplicas, UnavailableReplicas: deploymentStatus.UnavailableReplicas,
} }
return helper.BuildStatusRawExtension(grabStatus)
grabStatusRaw, err := helper.BuildStatusRawExtension(grabStatus)
if err != nil {
return nil, err
}
// if status is empty struct, it actually means status haven't been collected from member cluster.
if bytes.Equal(grabStatusRaw.Raw, []byte("{}")) {
return nil, nil
}
return grabStatusRaw, nil
} }
func reflectServiceStatus(object *unstructured.Unstructured) (*runtime.RawExtension, error) { func reflectServiceStatus(object *unstructured.Unstructured) (*runtime.RawExtension, error) {