/*
Copyright 2021 The Kruise Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package workloadspread
import (
"context"
"encoding/json"
"flag"
"fmt"
"math"
"strings"
"time"
appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
kubecontroller "k8s.io/kubernetes/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"
appsv1alpha1 "github.com/openkruise/kruise/apis/apps/v1alpha1"
ctrlUtil "github.com/openkruise/kruise/pkg/controller/util"
"github.com/openkruise/kruise/pkg/features"
"github.com/openkruise/kruise/pkg/util"
utilclient "github.com/openkruise/kruise/pkg/util/client"
"github.com/openkruise/kruise/pkg/util/configuration"
"github.com/openkruise/kruise/pkg/util/controllerfinder"
utildiscovery "github.com/openkruise/kruise/pkg/util/discovery"
utilfeature "github.com/openkruise/kruise/pkg/util/feature"
"github.com/openkruise/kruise/pkg/util/fieldindex"
"github.com/openkruise/kruise/pkg/util/ratelimiter"
"github.com/openkruise/kruise/pkg/util/requeueduration"
wsutil "github.com/openkruise/kruise/pkg/util/workloadspread"
)
func init() {
flag.IntVar(&concurrentReconciles, "workloadspread-workers", concurrentReconciles, "Max concurrent workers for WorkloadSpread controller.")
}
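// A hypothetical override when launching kruise-manager: --workloadspread-workers=5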
var (
concurrentReconciles = 3
)
const (
controllerName = "workloadspread-controller"
// CreatPodTimeout sets the maximum time from the moment a pod is added to the CreatingPods map in
// WorkloadSpread.Status by the webhook to the time when the pod is expected to be seen by the controller.
// If the controller has not observed the pod within that time, the pod is assumed to have failed creation
// entirely, and the corresponding record can be removed from WorkloadSpread.Status. It is assumed that
// pod/ws apiserver-to-controller latency is relatively small (like 1-2s), so the below value should be
// more than enough.
CreatPodTimeout = 30 * time.Second
// DeletePodTimeout is the analogous time window for Pod deletion (see CreatPodTimeout).
DeletePodTimeout = 15 * time.Second
// FakeSubsetName is a fake subset name for pods that do not match any subset
FakeSubsetName = "kruise.io/workloadspread-fake-subset-name"
// IgnorePatchExistingPodsAnnotation makes the controller ignore ws.Spec.Subsets[x].Patch for existing pods
IgnorePatchExistingPodsAnnotation = "workloadspread.kruise.io/ignore-patch-existing-pods-metadata"
)
var (
controllerKruiseKindWS = appsv1alpha1.SchemeGroupVersion.WithKind("WorkloadSpread")
controllerKruiseKindCS = appsv1alpha1.SchemeGroupVersion.WithKind("CloneSet")
controllerKruiseKindSts = appsv1alpha1.SchemeGroupVersion.WithKind("StatefulSet")
controllerKindSts = appsv1.SchemeGroupVersion.WithKind("StatefulSet")
controllerKindRS = appsv1.SchemeGroupVersion.WithKind("ReplicaSet")
controllerKindDep = appsv1.SchemeGroupVersion.WithKind("Deployment")
controllerKindJob = batchv1.SchemeGroupVersion.WithKind("Job")
)
// durationStore is a shortcut that lets sub-functions tell Reconcile how long to wait before requeuing:
// callers Push a delay keyed by namespace/name, and Reconcile Pops it when building its result.
var durationStore = requeueduration.DurationStore{}
// Add creates a new WorkloadSpread Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
// and Start it when the Manager is Started.
func Add(mgr manager.Manager) error {
if !utildiscovery.DiscoverGVK(controllerKruiseKindWS) || !utilfeature.DefaultFeatureGate.Enabled(features.WorkloadSpread) {
return nil
}
return add(mgr, newReconciler(mgr))
}
// add adds a new Controller to mgr with r as the reconcile.Reconciler
func add(mgr manager.Manager, r reconcile.Reconciler) error {
// Create a new controller
c, err := controller.New(controllerName, mgr, controller.Options{
Reconciler: r, MaxConcurrentReconciles: concurrentReconciles, CacheSyncTimeout: util.GetControllerCacheSyncTimeout(),
RateLimiter: ratelimiter.DefaultControllerRateLimiter()})
if err != nil {
return err
}
// Watch WorkloadSpread
err = c.Watch(source.Kind(mgr.GetCache(), &appsv1alpha1.WorkloadSpread{}, &handler.TypedEnqueueRequestForObject[*appsv1alpha1.WorkloadSpread]{}))
if err != nil {
return err
}
// Watch for changes to Pods that have a specific annotation
err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}, &podEventHandler{}))
if err != nil {
return err
}
// Watch for replica changes to CloneSet
err = c.Watch(source.Kind(mgr.GetCache(), client.Object(&appsv1alpha1.CloneSet{}), &workloadEventHandler{Reader: mgr.GetCache()}))
if err != nil {
return err
}
// Watch for replica changes to Deployment
err = c.Watch(source.Kind(mgr.GetCache(), client.Object(&appsv1.Deployment{}), &workloadEventHandler{Reader: mgr.GetCache()}))
if err != nil {
return err
}
// Watch for replica changes to ReplicaSet
err = c.Watch(source.Kind(mgr.GetCache(), client.Object(&appsv1.ReplicaSet{}), &workloadEventHandler{Reader: mgr.GetCache()}))
if err != nil {
return err
}
// Watch for parallelism changes to Job
err = c.Watch(source.Kind(mgr.GetCache(), client.Object(&batchv1.Job{}), &workloadEventHandler{Reader: mgr.GetCache()}))
if err != nil {
return err
}
// Watch for replicas changes to custom workloads in the whitelist
whiteList, err := configuration.GetWSWatchCustomWorkloadWhiteList(mgr.GetClient())
if err != nil {
return err
}
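// An illustrative whitelist entry, as it might appear in the kruise-configuration ConfigMap
// (the exact key and schema are defined in pkg/util/configuration; this is a sketch, not the
// authoritative format):
//   {"workloads":[{"Group":"apps.example.com","Version":"v1","Kind":"GameServerSet","replicasPath":"spec.replicas"}]}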
if len(whiteList.Workloads) > 0 {
workloadHandler := &workloadEventHandler{Reader: mgr.GetClient()}
for _, workload := range whiteList.Workloads {
if _, err := ctrlUtil.AddWatcherDynamically(mgr, c, workloadHandler, workload.GroupVersionKind, "WorkloadSpread"); err != nil {
return err
}
}
}
return nil
}
// newReconciler returns a new reconcile.Reconciler
func newReconciler(mgr manager.Manager) reconcile.Reconciler {
cli := utilclient.NewClientFromManager(mgr, controllerName)
return &ReconcileWorkloadSpread{
Client: cli,
scheme: mgr.GetScheme(),
recorder: mgr.GetEventRecorderFor(controllerName),
controllerFinder: controllerfinder.Finder,
}
}
var _ reconcile.Reconciler = &ReconcileWorkloadSpread{}
// ReconcileWorkloadSpread reconciles a WorkloadSpread object
type ReconcileWorkloadSpread struct {
client.Client
scheme *runtime.Scheme
recorder record.EventRecorder
controllerFinder *controllerfinder.ControllerFinder
}
// +kubebuilder:rbac:groups=apps.kruise.io,resources=workloadspreads,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups=apps.kruise.io,resources=workloadspreads/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=apps.kruise.io,resources=workloadspreads/finalizers,verbs=update
// +kubebuilder:rbac:groups=apps.kruise.io,resources=clonesets,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=replicasets,verbs=get;list;watch
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
func (r *ReconcileWorkloadSpread) Reconcile(_ context.Context, req reconcile.Request) (reconcile.Result, error) {
ws := &appsv1alpha1.WorkloadSpread{}
err := r.Get(context.TODO(), req.NamespacedName, ws)
if (err != nil && errors.IsNotFound(err)) || (err == nil && !ws.DeletionTimestamp.IsZero()) {
// delete cache if this workloadSpread has been deleted
if cacheErr := util.GlobalCache.Delete(&appsv1alpha1.WorkloadSpread{
TypeMeta: metav1.TypeMeta{
APIVersion: "apps.kruise.io/v1alpha1",
Kind: "WorkloadSpread",
},
ObjectMeta: metav1.ObjectMeta{
Namespace: req.Namespace,
Name: req.Name,
},
}); cacheErr != nil {
klog.ErrorS(cacheErr, "Failed to delete workloadSpread cache after deletion", "workloadSpread", req)
}
return reconcile.Result{}, nil
} else if err != nil {
// Error reading the object - requeue the request.
return reconcile.Result{}, err
}
startTime := time.Now()
klog.V(3).InfoS("Began to process WorkloadSpread", "workloadSpread", klog.KObj(ws))
err = r.syncWorkloadSpread(ws)
klog.V(3).InfoS("Finished syncing WorkloadSpread", "workloadSpread", klog.KObj(ws), "cost", time.Since(startTime))
return reconcile.Result{RequeueAfter: durationStore.Pop(getWorkloadSpreadKey(ws))}, err
}
func (r *ReconcileWorkloadSpread) getPodJob(ref *appsv1alpha1.TargetReference, namespace string) ([]*corev1.Pod, int32, error) {
ok, err := wsutil.VerifyGroupKind(ref, controllerKindJob.Kind, []string{controllerKindJob.Group})
if err != nil || !ok {
return nil, 0, err
}
job := &batchv1.Job{}
err = r.Get(context.TODO(), client.ObjectKey{Namespace: namespace, Name: ref.Name}, job)
if err != nil {
// a NotFound error is ok here.
if errors.IsNotFound(err) {
klog.V(3).InfoS("Could not find Job", "job", klog.KRef(namespace, ref.Name))
return nil, 0, nil
}
return nil, 0, err
}
labelSelector, err := util.ValidatedLabelSelectorAsSelector(job.Spec.Selector)
if err != nil {
klog.ErrorS(err, "Failed to get labelSelector")
return nil, 0, err
}
podList := &corev1.PodList{}
listOption := &client.ListOptions{
Namespace: namespace,
LabelSelector: labelSelector,
FieldSelector: fields.SelectorFromSet(fields.Set{fieldindex.IndexNameForOwnerRefUID: string(job.UID)}),
}
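// The field selector matches on the owner-reference UID index registered in fieldindex, so
// (assuming the index is registered with the manager's cache) only Pods owned by this Job are
// returned, with the filtering done inside the informer cache rather than by the apiserver.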
err = r.List(context.TODO(), podList, listOption)
if err != nil {
return nil, 0, err
}
matchedPods := make([]*corev1.Pod, 0, len(podList.Items))
for i := range podList.Items {
matchedPods = append(matchedPods, &podList.Items[i])
}
return matchedPods, *(job.Spec.Parallelism), nil
}
func (r *ReconcileWorkloadSpread) getReplicasPathList(ws *appsv1alpha1.WorkloadSpread) ([]string, error) {
if ws.Spec.TargetReference == nil {
return nil, nil
}
if ws.Spec.TargetFilter != nil && len(ws.Spec.TargetFilter.ReplicasPathList) > 0 {
return ws.Spec.TargetFilter.ReplicasPathList, nil
}
whiteList, err := configuration.GetWSWatchCustomWorkloadWhiteList(r.Client)
if err != nil {
return nil, err
}
gv, err := schema.ParseGroupVersion(ws.Spec.TargetReference.APIVersion)
if err != nil {
return nil, err
}
for _, wl := range whiteList.Workloads {
if wl.GroupVersion() != gv || wl.GroupVersionKind.Kind != ws.Spec.TargetReference.Kind {
continue
}
klog.V(5).InfoS("found replicas path in whitelist", "path", wl.ReplicasPath, "workloadSpread", klog.KObj(ws))
return []string{wl.ReplicasPath}, nil
}
return nil, nil
}
// getPodsForWorkloadSpread returns the Pods managed by the WorkloadSpread object.
// It returns two values:
// 1. the matched Pods for the WorkloadSpread
// 2. the workload replicas
func (r *ReconcileWorkloadSpread) getPodsForWorkloadSpread(ws *appsv1alpha1.WorkloadSpread) ([]*corev1.Pod, int32, error) {
if ws.Spec.TargetReference == nil {
return nil, 0, nil
}
var pods []*corev1.Pod
var workloadReplicas int32
var err error
targetRef := ws.Spec.TargetReference
switch targetRef.Kind {
case controllerKindJob.Kind:
pods, workloadReplicas, err = r.getPodJob(targetRef, ws.Namespace)
default:
pods, workloadReplicas, err = r.controllerFinder.GetPodsForRef(targetRef.APIVersion, targetRef.Kind, ws.Namespace, targetRef.Name, false)
}
if err != nil {
klog.ErrorS(err, "WorkloadSpread handled targetReference failed", "workloadSpread", klog.KObj(ws))
return nil, 0, err
}
workloadReplicas, pods, err = r.filterWorkload(ws, pods, workloadReplicas)
if err != nil {
klog.ErrorS(err, "Filter workload failed", "workloadSpread", klog.KObj(ws))
return nil, 0, err
}
return pods, workloadReplicas, err
}
func (r *ReconcileWorkloadSpread) filterWorkload(ws *appsv1alpha1.WorkloadSpread, pods []*corev1.Pod, replicas int32) (int32, []*corev1.Pod, error) {
klog.V(5).InfoS("before workload filtering", "pods", len(pods), "replicas", replicas, "workloadSpread", klog.KObj(ws))
replicasPathList, err := r.getReplicasPathList(ws)
if err != nil {
return replicas, pods, err
}
var filteredReplicas int32
if len(replicasPathList) > 0 {
// a replicas path list is configured somewhere, so it overrides the plain replicas value
targetRef := ws.Spec.TargetReference
wl, err := r.controllerFinder.GetControllerAsUnstructured(controllerfinder.ControllerReference{
APIVersion: targetRef.APIVersion,
Kind: targetRef.Kind,
Name: targetRef.Name,
}, ws.Namespace)
if err != nil {
return replicas, pods, client.IgnoreNotFound(err)
}
for _, replicasPath := range replicasPathList {
n, err := wsutil.GetReplicasFromObject(wl, replicasPath)
if err != nil {
return replicas, pods, err
}
filteredReplicas += n
}
klog.V(4).InfoS("replicas after filtering", "replicas", filteredReplicas,
"replicasPathList", replicasPathList, "workloadSpread", klog.KObj(ws))
} else {
filteredReplicas = replicas
klog.V(4).InfoS("replicas not filtered", "workloadSpread", klog.KObj(ws))
}
var filteredPods []*corev1.Pod
if ws.Spec.TargetFilter != nil && ws.Spec.TargetFilter.Selector != nil {
for _, pod := range pods {
selected, err := wsutil.IsPodSelected(ws.Spec.TargetFilter, pod.Labels)
if err != nil {
return replicas, pods, err
}
if selected {
filteredPods = append(filteredPods, pod)
}
}
klog.V(4).InfoS("pods after filtering", "pods", len(filteredPods), "selector", ws.Spec.TargetFilter.Selector)
} else {
filteredPods = pods
}
return filteredReplicas, filteredPods, nil
}
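// An illustrative TargetFilter corresponding to the logic above (field names follow
// ws.Spec.TargetFilter as used here; the values are hypothetical):
//   targetFilter:
//     selector:
//       matchLabels:
//         role: player
//     replicasPathList:
//     - spec.replicas
// With such a filter, only Pods labeled role=player are counted, and replicas is summed from spec.replicas.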
// syncWorkloadSpread is the main logic of the WorkloadSpread controller. First, we get the Pods of the workload
// managed by the WorkloadSpread and classify them into their corresponding subsets. Second, we set each Pod's
// deletion-cost annotation by comparing the number of Pods in a subset with the subset's maxReplicas, and then
// consider rescheduling failed Pods. Lastly, we update the WorkloadSpread's status and clean up schedule-failed
// Pods. The controller collaborates with the webhook to maintain the WorkloadSpread status: the controller is
// responsible for calculating the real status, while the webhook mainly counts missingReplicas and records the
// creation or deletion entry of each Pod into the maps.
func (r *ReconcileWorkloadSpread) syncWorkloadSpread(ws *appsv1alpha1.WorkloadSpread) error {
if ws.Spec.TargetReference == nil {
klog.InfoS("WorkloadSpread has no target reference", "workloadSpread", klog.KObj(ws))
return nil
}
pods, workloadReplicas, err := r.getPodsForWorkloadSpread(ws)
if err != nil {
klog.ErrorS(err, "WorkloadSpread got matched pods failed", "workloadSpread", klog.KObj(ws))
return err
}
if len(pods) == 0 {
klog.InfoS("WorkloadSpread had no matched pods", "workloadSpread", klog.KObj(ws), "targetWorkloadReplicas", workloadReplicas)
}
// group Pods by pod-revision and subset
versionedPodMap, subsetPodMap, err := r.groupVersionedPods(ws, pods, workloadReplicas)
if err != nil {
return err
}
// update deletion-cost for each subset
err = r.updateDeletionCost(ws, versionedPodMap, workloadReplicas)
if err != nil {
return err
}
// calculate status and reschedule
status, scheduleFailedPodMap := r.calculateWorkloadSpreadStatus(ws, versionedPodMap, subsetPodMap, workloadReplicas)
if status == nil {
return nil
}
// update status
err = r.UpdateWorkloadSpreadStatus(ws, status)
if err != nil {
return err
}
// clean up unschedulable Pods
return r.cleanupUnscheduledPods(ws, scheduleFailedPodMap)
}
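// getInjectWorkloadSpreadFromPod parses the matched-workloadspread annotation injected by the webhook.
// An illustrative annotation value (the actual key is wsutil.MatchedWorkloadSpreadSubsetAnnotations, and
// the JSON field names follow wsutil.InjectWorkloadSpread; this sketch assumes its JSON tags):
//   {"name":"ws-demo","subset":"subset-a","uid":"generated-uid"}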
func getInjectWorkloadSpreadFromPod(pod *corev1.Pod) *wsutil.InjectWorkloadSpread {
injectStr, exist := pod.GetAnnotations()[wsutil.MatchedWorkloadSpreadSubsetAnnotations]
if !exist {
return nil
}
injectWS := &wsutil.InjectWorkloadSpread{}
err := json.Unmarshal([]byte(injectStr), injectWS)
if err != nil {
klog.ErrorS(err, "Failed to unmarshal JSON from Pod", "JSON", injectStr, "pod", klog.KObj(pod))
return nil
}
return injectWS
}
// groupVersionedPods will group pods by pod version and subset
func (r *ReconcileWorkloadSpread) groupVersionedPods(ws *appsv1alpha1.WorkloadSpread, allPods []*corev1.Pod, replicas int32) (map[string]map[string][]*corev1.Pod, map[string][]*corev1.Pod, error) {
versionedPods := map[string][]*corev1.Pod{}
for _, pod := range allPods {
version := wsutil.GetPodVersion(pod)
versionedPods[version] = append(versionedPods[version], pod)
}
subsetPodMap := map[string][]*corev1.Pod{}
versionedPodMap := map[string]map[string][]*corev1.Pod{}
// group pods by version
for version, pods := range versionedPods {
// group pods by subset
podMap, err := r.groupPodBySubset(ws, pods, replicas)
if err != nil {
return nil, nil, err
}
for subset, ps := range podMap {
subsetPodMap[subset] = append(subsetPodMap[subset], ps...)
}
versionedPodMap[version] = podMap
}
return versionedPodMap, subsetPodMap, nil
}
// groupPodBySubset returns a map, the key is the name of subset and the value represents the Pods of the corresponding subset.
func (r *ReconcileWorkloadSpread) groupPodBySubset(ws *appsv1alpha1.WorkloadSpread, pods []*corev1.Pod, replicas int32) (map[string][]*corev1.Pod, error) {
podMap := make(map[string][]*corev1.Pod, len(ws.Spec.Subsets)+1)
podMap[FakeSubsetName] = []*corev1.Pod{}
subsetMissingReplicas := make(map[string]int)
for _, subset := range ws.Spec.Subsets {
podMap[subset.Name] = []*corev1.Pod{}
subsetMissingReplicas[subset.Name], _ = intstr.GetScaledValueFromIntOrPercent(
intstr.ValueOrDefault(subset.MaxReplicas, intstr.FromInt32(math.MaxInt32)), int(replicas), true)
}
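// For example, with replicas=10 and maxReplicas="25%", GetScaledValueFromIntOrPercent rounds up to 3;
// a nil maxReplicas defaults to math.MaxInt32, i.e. the subset is effectively unbounded.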
// count managed pods for each subset
for i := range pods {
injectWS := getInjectWorkloadSpreadFromPod(pods[i])
if isNotMatchedWS(injectWS, ws) {
continue
}
if _, exist := podMap[injectWS.Subset]; !exist {
continue
}
subsetMissingReplicas[injectWS.Subset]--
}
for i := range pods {
subsetName, err := r.getSuitableSubsetNameForPod(ws, pods[i], subsetMissingReplicas)
if err != nil {
return nil, err
}
if _, exist := podMap[subsetName]; exist {
podMap[subsetName] = append(podMap[subsetName], pods[i])
} else {
// for the case where the pod's original subset has been deleted.
podMap[FakeSubsetName] = append(podMap[FakeSubsetName], pods[i])
}
}
return podMap, nil
}
// getSuitableSubsetNameForPod returns (FakeSubsetName, nil) if no suitable subset is found for the pod
func (r *ReconcileWorkloadSpread) getSuitableSubsetNameForPod(ws *appsv1alpha1.WorkloadSpread, pod *corev1.Pod, subsetMissingReplicas map[string]int) (string, error) {
injectWS := getInjectWorkloadSpreadFromPod(pod)
if isNotMatchedWS(injectWS, ws) {
// process the pods that were created before workloadSpread
matchedSubset, err := r.getAndUpdateSuitableSubsetName(ws, pod, subsetMissingReplicas)
klog.V(3).InfoS("no subset injected to pod, find a suitable one", "pod", klog.KObj(pod), "workloadSpread", klog.KObj(ws), "matchedSubset", matchedSubset)
if err != nil {
return "", err
} else if matchedSubset == nil {
return FakeSubsetName, nil
}
return matchedSubset.Name, nil
}
return injectWS.Subset, nil
}
// getAndUpdateSuitableSubsetName returns a suitable subset for a pod that was created before the WorkloadSpread.
// It returns (nil, nil) if there is no suitable subset for the pod.
func (r *ReconcileWorkloadSpread) getAndUpdateSuitableSubsetName(ws *appsv1alpha1.WorkloadSpread, pod *corev1.Pod, subsetMissingReplicas map[string]int) (*appsv1alpha1.WorkloadSpreadSubset, error) {
if len(pod.Spec.NodeName) == 0 {
return nil, nil
}
node := &corev1.Node{}
if err := r.Get(context.TODO(), types.NamespacedName{Name: pod.Spec.NodeName}, node); err != nil {
if errors.IsNotFound(err) {
return nil, nil
}
return nil, err
}
var maxPreferredScore int64 = -1
var favoriteSubset *appsv1alpha1.WorkloadSpreadSubset
for i := range ws.Spec.Subsets {
subset := &ws.Spec.Subsets[i]
// check whether this pod was scheduled to a node that matches a subset of the WorkloadSpread
matched, preferredScore, err := matchesSubset(pod, node, subset, subsetMissingReplicas[subset.Name])
if err != nil {
// the requiredSelectorTerm field was validated at the webhook stage, so this error should not occur.
// We do not return this error because it is non-transient.
klog.ErrorS(err, "Unexpected error occurred when matching pod with subset, please check requiredSelectorTerm field of subset in WorkloadSpread",
"pod", klog.KObj(pod), "subsetName", subset.Name, "workloadSpread", klog.KObj(ws))
}
klog.V(4).InfoS("preferred score for subset", "pod", klog.KObj(pod), "subsetName", subset.Name, "workloadSpread", klog.KObj(ws), "preferredScore", preferredScore, "node", node.Name)
// select the most preferred subset for the pod by subset.PreferredNodeSelectorTerms
if matched && preferredScore > maxPreferredScore {
favoriteSubset = subset
maxPreferredScore = preferredScore
}
}
if favoriteSubset != nil {
if err := r.patchFavoriteSubsetMetadataToPod(pod, ws, favoriteSubset); err != nil {
return nil, err
}
subsetMissingReplicas[favoriteSubset.Name]--
return favoriteSubset, nil
}
return nil, nil
}
// patchFavoriteSubsetMetadataToPod patches MatchedWorkloadSpreadSubsetAnnotations to the pod,
// and selects labels/annotations from favoriteSubset.patch, then patches them to the pod as well.
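// A hypothetical patch body produced below, for a subset whose patch adds a label
// (the annotation key placeholder stands in for wsutil.MatchedWorkloadSpreadSubsetAnnotations):
//   {"metadata":{"labels":{"zone":"zone-a"},"annotations":{"<matched-workloadspread-key>":"{\"name\":\"ws-demo\",\"subset\":\"subset-a\"}"}}}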
func (r *ReconcileWorkloadSpread) patchFavoriteSubsetMetadataToPod(pod *corev1.Pod, ws *appsv1alpha1.WorkloadSpread, favoriteSubset *appsv1alpha1.WorkloadSpreadSubset) error {
patchMetadata := make(map[string]interface{})
// decode favoriteSubset.patch.raw and add their labels and annotations to the patch
if favoriteSubset.Patch.Raw != nil && !strings.EqualFold(ws.Annotations[IgnorePatchExistingPodsAnnotation], "true") {
patchField := map[string]interface{}{}
if err := json.Unmarshal(favoriteSubset.Patch.Raw, &patchField); err == nil {
if metadata, ok := patchField["metadata"].(map[string]interface{}); ok && metadata != nil {
patchMetadata = metadata
}
}
}
injectWS, _ := json.Marshal(&wsutil.InjectWorkloadSpread{
Name: ws.Name,
Subset: favoriteSubset.Name,
})
if annotations, ok := patchMetadata["annotations"].(map[string]interface{}); ok && annotations != nil {
annotations[wsutil.MatchedWorkloadSpreadSubsetAnnotations] = string(injectWS)
} else {
patchMetadata["annotations"] = map[string]interface{}{
wsutil.MatchedWorkloadSpreadSubsetAnnotations: string(injectWS),
}
}
patch, _ := json.Marshal(map[string]interface{}{
"metadata": patchMetadata,
})
if err := r.Patch(context.TODO(), pod, client.RawPatch(types.StrategicMergePatchType, patch)); err != nil {
klog.ErrorS(err, `Failed to patch "matched-workloadspread" annotation for pod`,
"pod", klog.KObj(pod), "annotationValue", fmt.Sprintf("{Name: %s, Subset: %s}", ws.Name, favoriteSubset.Name))
return err
}
return nil
}
// calculateWorkloadSpreadStatus returns two values:
// 1. the current WorkloadSpreadStatus
// 2. a map whose key is the subset name and whose value is the schedule-failed Pods belonging to that subset.
func (r *ReconcileWorkloadSpread) calculateWorkloadSpreadStatus(ws *appsv1alpha1.WorkloadSpread,
versionedPodMap map[string]map[string][]*corev1.Pod, subsetPodMap map[string][]*corev1.Pod,
workloadReplicas int32) (*appsv1alpha1.WorkloadSpreadStatus, map[string][]*corev1.Pod) {
status := appsv1alpha1.WorkloadSpreadStatus{}
// set the generation in the returned status
status.ObservedGeneration = ws.Generation
// status.ObservedWorkloadReplicas = workloadReplicas
status.VersionedSubsetStatuses = make(map[string][]appsv1alpha1.WorkloadSpreadSubsetStatus, len(versionedPodMap))
// overall subset statuses
var scheduleFailedPodMap map[string][]*corev1.Pod
status.SubsetStatuses, scheduleFailedPodMap = r.calculateWorkloadSpreadSubsetStatuses(ws, ws.Status.SubsetStatuses, subsetPodMap, workloadReplicas)
// versioned subset statuses calculated by observed pods
for version, podMap := range versionedPodMap {
status.VersionedSubsetStatuses[version], _ = r.calculateWorkloadSpreadSubsetStatuses(ws, ws.Status.VersionedSubsetStatuses[version], podMap, workloadReplicas)
}
// Consider this case:
// A Pod has been created and processed by webhook, but the Pod is not cached by controller.
// We have to keep the subsetStatus for this version even though there is no Pod belonging to it.
for version := range ws.Status.VersionedSubsetStatuses {
if _, exist := versionedPodMap[version]; exist {
continue
}
versionSubsetStatuses, _ := r.calculateWorkloadSpreadSubsetStatuses(ws, ws.Status.VersionedSubsetStatuses[version], nil, workloadReplicas)
if !isEmptySubsetStatuses(versionSubsetStatuses) {
status.VersionedSubsetStatuses[version] = versionSubsetStatuses
}
}
return &status, scheduleFailedPodMap
}
func isEmptySubsetStatuses(statuses []appsv1alpha1.WorkloadSpreadSubsetStatus) bool {
replicas, creating, deleting := 0, 0, 0
for _, subset := range statuses {
replicas += int(subset.Replicas)
creating += len(subset.CreatingPods)
deleting += len(subset.DeletingPods)
}
return replicas+creating+deleting == 0
}
func (r *ReconcileWorkloadSpread) calculateWorkloadSpreadSubsetStatuses(ws *appsv1alpha1.WorkloadSpread,
oldSubsetStatuses []appsv1alpha1.WorkloadSpreadSubsetStatus, podMap map[string][]*corev1.Pod, workloadReplicas int32,
) ([]appsv1alpha1.WorkloadSpreadSubsetStatus, map[string][]*corev1.Pod) {
subsetStatuses := make([]appsv1alpha1.WorkloadSpreadSubsetStatus, len(ws.Spec.Subsets))
scheduleFailedPodMap := make(map[string][]*corev1.Pod)
// Use a map from subset name to old subset status, because the user could reorder the spec's subsets
// to change subset priority. We guarantee that operation works by using the subset name to match each
// subset with its entry in the old status.
oldSubsetStatusMap := make(map[string]*appsv1alpha1.WorkloadSpreadSubsetStatus, len(oldSubsetStatuses))
for i := range oldSubsetStatuses {
oldSubsetStatusMap[oldSubsetStatuses[i].Name] = &oldSubsetStatuses[i]
}
var rescheduleCriticalSeconds int32
if ws.Spec.ScheduleStrategy.Type == appsv1alpha1.AdaptiveWorkloadSpreadScheduleStrategyType &&
ws.Spec.ScheduleStrategy.Adaptive != nil &&
ws.Spec.ScheduleStrategy.Adaptive.RescheduleCriticalSeconds != nil {
rescheduleCriticalSeconds = *ws.Spec.ScheduleStrategy.Adaptive.RescheduleCriticalSeconds
}
for i := 0; i < len(ws.Spec.Subsets); i++ {
subset := &ws.Spec.Subsets[i]
// calculate subset status
subsetStatus := r.calculateWorkloadSpreadSubsetStatus(ws, podMap[subset.Name], subset,
oldSubsetStatusMap[subset.Name], workloadReplicas)
if subsetStatus == nil {
return nil, nil
}
// don't reschedule the last subset.
if rescheduleCriticalSeconds > 0 {
if i != len(ws.Spec.Subsets)-1 {
pods := r.rescheduleSubset(ws, podMap[subset.Name], subsetStatus, oldSubsetStatusMap[subset.Name])
scheduleFailedPodMap[subset.Name] = pods
} else {
oldCondition := GetWorkloadSpreadSubsetCondition(oldSubsetStatusMap[subset.Name], appsv1alpha1.SubsetSchedulable)
if oldCondition != nil {
setWorkloadSpreadSubsetCondition(subsetStatus, oldCondition.DeepCopy())
}
setWorkloadSpreadSubsetCondition(subsetStatus, NewWorkloadSpreadSubsetCondition(appsv1alpha1.SubsetSchedulable, corev1.ConditionTrue, "", ""))
}
} else {
removeWorkloadSpreadSubsetCondition(subsetStatus, appsv1alpha1.SubsetSchedulable)
}
subsetStatuses[i] = *subsetStatus
}
return subsetStatuses, scheduleFailedPodMap
}
// calculateWorkloadSpreadSubsetStatus returns the current subsetStatus for subset.
func (r *ReconcileWorkloadSpread) calculateWorkloadSpreadSubsetStatus(ws *appsv1alpha1.WorkloadSpread,
pods []*corev1.Pod,
subset *appsv1alpha1.WorkloadSpreadSubset,
oldSubsetStatus *appsv1alpha1.WorkloadSpreadSubsetStatus,
workloadReplicas int32) *appsv1alpha1.WorkloadSpreadSubsetStatus {
// current subsetStatus in this reconcile
subsetStatus := &appsv1alpha1.WorkloadSpreadSubsetStatus{}
subsetStatus.Name = subset.Name
subsetStatus.CreatingPods = make(map[string]metav1.Time)
subsetStatus.DeletingPods = make(map[string]metav1.Time)
var err error
var subsetMaxReplicas int
if subset.MaxReplicas == nil {
// MaxReplicas is nil, which means there is no limit for subset replicas, using -1 to represent it.
subsetMaxReplicas = -1
} else {
subsetMaxReplicas, err = intstr.GetScaledValueFromIntOrPercent(subset.MaxReplicas, int(workloadReplicas), true)
if err != nil || subsetMaxReplicas < 0 {
klog.ErrorS(err, "Failed to get maxReplicas value from subset of WorkloadSpread", "subsetName", subset.Name, "workloadSpread", klog.KObj(ws))
return nil
}
}
// initialize missingReplicas to subsetMaxReplicas
subsetStatus.MissingReplicas = int32(subsetMaxReplicas)
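// For example, with subsetMaxReplicas=5 and 3 active Pods observed below, MissingReplicas ends at 2.
// When MaxReplicas is nil, MissingReplicas stays -1, which consumers treat as "no limit".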
currentTime := time.Now()
var oldCreatingPods map[string]metav1.Time
var oldDeletingPods map[string]metav1.Time
if oldSubsetStatus != nil {
// make a deep copy because we may need to remove some element later and compare old status with current status.
oldCreatingPods = make(map[string]metav1.Time, len(oldSubsetStatus.CreatingPods))
for k, v := range oldSubsetStatus.CreatingPods {
oldCreatingPods[k] = v
}
oldDeletingPods = oldSubsetStatus.DeletingPods
}
var active int32
for _, pod := range pods {
// remove this Pod from creatingPods map because this Pod has been created.
injectWS := getInjectWorkloadSpreadFromPod(pod)
if injectWS != nil && injectWS.UID != "" {
// Deployment and other native k8s workloads have not generated the full pod.Name when the webhook
// mutates the Pod, so the webhook generates a UID to identify the Pod and stores it in the
// creatingPods map. The generated UID serves the same purpose as pod.Name.
delete(oldCreatingPods, injectWS.UID)
} else {
delete(oldCreatingPods, pod.Name)
}
// not active
if !kubecontroller.IsPodActive(pod) {
continue
}
active++
// count missingReplicas
if subsetStatus.MissingReplicas > 0 {
subsetStatus.MissingReplicas--
}
// oldDeletingPods records the Pods that the webhook marked for deletion.
if deleteTime, exist := oldDeletingPods[pod.Name]; exist {
expectedDeletion := deleteTime.Time.Add(DeletePodTimeout)
// deleting this Pod has timed out, so drop it from the deletingPods map, which means the deletion failed.
if expectedDeletion.Before(currentTime) {
r.recorder.Eventf(ws, corev1.EventTypeWarning,
"DeletePodFailed", "Pod %s/%s was expected to be deleted but it wasn't", ws.Namespace, pod.Name)
} else {
// not timed out yet; there may be some latency, so keep it in the deletingPods map.
subsetStatus.DeletingPods[pod.Name] = deleteTime
// missingReplicas + 1, assuming it has been deleted
if subsetStatus.MissingReplicas < int32(subsetMaxReplicas) {
subsetStatus.MissingReplicas++
}
// requeue the key so the entry is cleaned from the map once expectedDeletion reaches currentTime.
durationStore.Push(getWorkloadSpreadKey(ws), expectedDeletion.Sub(currentTime))
}
}
}
// record active replicas number
subsetStatus.Replicas = active
// oldCreatingPods now holds the remaining Pods that have not yet been observed by the controller.
for podID, createTime := range oldCreatingPods {
expectedCreation := createTime.Time.Add(CreatPodTimeout)
// creating this Pod has timed out
if expectedCreation.Before(currentTime) {
r.recorder.Eventf(ws, corev1.EventTypeWarning,
"CreatePodFailed", "Pod %s/%s was expected to be created but it wasn't", ws.Namespace, podID)
} else {
// not timed out yet; keep it in the creatingPods map.
subsetStatus.CreatingPods[podID] = createTime
// missingReplicas - 1, assuming it has been created
if subsetStatus.MissingReplicas > 0 {
subsetStatus.MissingReplicas--
}
// requeue the key so the entry is cleaned up once expectedCreation reaches currentTime.
durationStore.Push(getWorkloadSpreadKey(ws), expectedCreation.Sub(currentTime))
}
}
return subsetStatus
}
func (r *ReconcileWorkloadSpread) UpdateWorkloadSpreadStatus(ws *appsv1alpha1.WorkloadSpread,
status *appsv1alpha1.WorkloadSpreadStatus) error {
if apiequality.Semantic.DeepEqual(status, ws.Status) {
return nil
}
clone := ws.DeepCopy()
clone.Status = *status
err := r.writeWorkloadSpreadStatus(clone)
logStatusChanges(ws, status, err)
return err
}
func logStatusChanges(ws *appsv1alpha1.WorkloadSpread, status *appsv1alpha1.WorkloadSpreadStatus, err error) {
if err != nil {
klog.ErrorS(err, "Failed to update WorkloadSpread status", "workloadSpread", klog.KObj(ws), "status", status)
return
}
oldSubsetStatuses := ws.Status.SubsetStatuses
oldSubsetStatusMap := make(map[string]*appsv1alpha1.WorkloadSpreadSubsetStatus, len(oldSubsetStatuses))
for i := range oldSubsetStatuses {
oldSubsetStatusMap[oldSubsetStatuses[i].Name] = &oldSubsetStatuses[i]
}
var log string
for i, subset := range ws.Spec.Subsets {
oldStatus, ok := oldSubsetStatusMap[subset.Name]
if !ok {
oldStatus = &appsv1alpha1.WorkloadSpreadSubsetStatus{
Name: subset.Name,
}
}
newStatus := status.SubsetStatuses[i]
log += fmt.Sprintf(" (<subset name: %s>", subset.Name)
if oldStatus.Replicas != newStatus.Replicas {
log += fmt.Sprintf(" <Replicas: %d -> %d>", oldStatus.Replicas, newStatus.Replicas)
} else {
log += fmt.Sprintf(" <Replicas: %d>", newStatus.Replicas)
}
if oldStatus.MissingReplicas != newStatus.MissingReplicas {
log += fmt.Sprintf(" <missingReplicas: %d -> %d>", oldStatus.MissingReplicas, newStatus.MissingReplicas)
} else {
log += fmt.Sprintf(" <missingReplicas: %d>", newStatus.MissingReplicas)
}
if len(oldStatus.CreatingPods) != len(newStatus.CreatingPods) {
log += fmt.Sprintf(" <creatingPods length: %d -> %d>", len(oldStatus.CreatingPods), len(newStatus.CreatingPods))
} else {
log += fmt.Sprintf(" <creatingPods length: %d>", len(newStatus.CreatingPods))
}
if len(oldStatus.DeletingPods) != len(newStatus.DeletingPods) {
log += fmt.Sprintf(" <deletingPods length: %d -> %d>", len(oldStatus.DeletingPods), len(newStatus.DeletingPods))
} else {
log += fmt.Sprintf(" <deletingPods length: %d>", len(newStatus.DeletingPods))
}
log += ")"
}
klog.V(3).InfoS("WorkloadSpread status changed", "workloadSpread", klog.KObj(ws), "details", log)
}
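// writeWorkloadSpreadStatus updates the status under a per-UID lock and, on success, mirrors the
// object into util.GlobalCache so components sharing that cache (such as the webhook) can read the
// freshest status without waiting for an informer resync.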
func (r *ReconcileWorkloadSpread) writeWorkloadSpreadStatus(ws *appsv1alpha1.WorkloadSpread) error {
unlock := util.GlobalKeyedMutex.Lock(string(ws.GetUID()))
defer unlock()
// If this update fails, don't retry it. Allow the failure to get handled &
// retried in `processNextWorkItem()`.
err := r.Status().Update(context.TODO(), ws)
if err == nil {
if cacheErr := util.GlobalCache.Add(ws); cacheErr != nil {
klog.ErrorS(cacheErr, "Failed to update WorkloadSpread cache after update status", "workloadSpread", klog.KObj(ws))
}
}
return err
}
func getWorkloadSpreadKey(o metav1.Object) string {
return o.GetNamespace() + "/" + o.GetName()
}
func isNotMatchedWS(injectWS *wsutil.InjectWorkloadSpread, ws *appsv1alpha1.WorkloadSpread) bool {
return injectWS == nil || injectWS.Name != ws.Name || injectWS.Subset == ""
}