/* Copyright 2023 The Kruise Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package imagelistpulljob import ( "context" "fmt" "hash/fnv" "reflect" "time" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/rand" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" hashutil "k8s.io/kubernetes/pkg/util/hash" "k8s.io/kubernetes/pkg/util/slice" "k8s.io/utils/clock" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" appsv1alpha1 "github.com/openkruise/kruise/apis/apps/v1alpha1" "github.com/openkruise/kruise/pkg/features" "github.com/openkruise/kruise/pkg/util" utilclient "github.com/openkruise/kruise/pkg/util/client" utildiscovery "github.com/openkruise/kruise/pkg/util/discovery" "github.com/openkruise/kruise/pkg/util/expectations" utilfeature "github.com/openkruise/kruise/pkg/util/feature" "github.com/openkruise/kruise/pkg/util/fieldindex" ) var ( concurrentReconciles = 3 controllerKind = appsv1alpha1.SchemeGroupVersion.WithKind("ImageListPullJob") slowStartInitialBatchSize = 1 controllerName = "imagelistpulljob-controller" resourceVersionExpectations = expectations.NewResourceVersionExpectation() scaleExpectations = expectations.NewScaleExpectations() ) // Add creates a new ImageListPullJob Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. func Add(mgr manager.Manager) error { if !utildiscovery.DiscoverGVK(controllerKind) || !utilfeature.DefaultFeatureGate.Enabled(features.KruiseDaemon) || !utilfeature.DefaultFeatureGate.Enabled(features.ImagePullJobGate) { return nil } return add(mgr, newReconciler(mgr)) } // newReconciler returns a new reconcile.Reconciler func newReconciler(mgr manager.Manager) *ReconcileImageListPullJob { return &ReconcileImageListPullJob{ Client: utilclient.NewClientFromManager(mgr, controllerName), scheme: mgr.GetScheme(), clock: clock.RealClock{}, recorder: mgr.GetEventRecorderFor(controllerName), } } // add a new Controller to mgr with r as the reconcile.Reconciler func add(mgr manager.Manager, r *ReconcileImageListPullJob) error { // Create a new controller c, err := controller.New(controllerName, mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: concurrentReconciles, CacheSyncTimeout: util.GetControllerCacheSyncTimeout()}) if err != nil { return err } // Watch for changes to ImageListPullJob err = c.Watch(source.Kind(mgr.GetCache(), &appsv1alpha1.ImageListPullJob{}), &handler.EnqueueRequestForObject{}) if err != nil { return err } // Watch for changes to ImagePullJob // todo the imagelistpulljob(status) will not change if the pull job status does not change significantly (ex. number of failed nodeimage changes from 1 to 2) err = c.Watch(source.Kind(mgr.GetCache(), &appsv1alpha1.ImagePullJob{}), &imagePullJobEventHandler{ enqueueHandler: handler.EnqueueRequestForOwner( mgr.GetScheme(), mgr.GetRESTMapper(), &appsv1alpha1.ImageListPullJob{}, handler.OnlyControllerOwner()), }) if err != nil { return err } return nil } var _ reconcile.Reconciler = &ReconcileImageListPullJob{} // ReconcileImageListPullJob reconciles a ImageListPullJob object type ReconcileImageListPullJob struct { client.Client scheme *runtime.Scheme clock clock.Clock recorder record.EventRecorder } // +kubebuilder:rbac:groups=apps.kruise.io,resources=imagelistpulljobs,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=apps.kruise.io,resources=imagelistpulljobs/status,verbs=get;update;patch // +kubebuilder:rbac:groups=apps.kruise.io,resources=imagelistpulljobs/finalizers,verbs=update // Reconcile reads that state of the cluster for a ImageListPullJob object and makes changes based on the state read // and what is in the ImageListPullJob.Spec // Automatically generate RBAC rules to allow the Controller to read and write ImageListPullJob func (r *ReconcileImageListPullJob) Reconcile(_ context.Context, request reconcile.Request) (res reconcile.Result, err error) { klog.V(5).InfoS("Starting to process ImageListPullJob", "imageListPullJob", request) // 1.Fetch the ImageListPullJob instance job := &appsv1alpha1.ImageListPullJob{} err = r.Get(context.TODO(), request.NamespacedName, job) if err != nil { if errors.IsNotFound(err) { // Object not found, return. Created objects are automatically garbage collected. // For additional cleanup logic use finalizers. return reconcile.Result{}, nil } return reconcile.Result{}, err } hash, err := r.refreshJobTemplateHash(job) if err != nil { return reconcile.Result{}, fmt.Errorf("refresh job template hash error: %v", err) } // The Job has been finished if job.Status.CompletionTime != nil { var leftTime time.Duration if job.Spec.CompletionPolicy.TTLSecondsAfterFinished != nil { leftTime = time.Duration(*job.Spec.CompletionPolicy.TTLSecondsAfterFinished)*time.Second - time.Since(job.Status.CompletionTime.Time) if leftTime <= 0 { klog.InfoS("Deleting ImageListPullJob for ttlSecondsAfterFinished", "imageListPullJob", klog.KObj(job)) if err = r.Delete(context.TODO(), job); err != nil { return reconcile.Result{}, fmt.Errorf("delete ImageListPullJob error: %v", err) } return reconcile.Result{}, nil } } return reconcile.Result{RequeueAfter: leftTime}, nil } if scaleSatisfied, unsatisfiedDuration, scaleDirtyImagePullJobs := scaleExpectations.SatisfiedExpectations(request.String()); !scaleSatisfied { if unsatisfiedDuration >= expectations.ExpectationTimeout { klog.InfoS("Expectation unsatisfied overtime for ImageListPullJob", "imageListPullJob", request, "scaleDirtyImagePullJobs", scaleDirtyImagePullJobs, "overtime", unsatisfiedDuration) return reconcile.Result{}, nil } klog.V(4).InfoS("Not satisfied scale for ImageListPullJob", "imageListPullJob", request, "scaleDirtyImagePullJobs", scaleDirtyImagePullJobs) return reconcile.Result{RequeueAfter: expectations.ExpectationTimeout - unsatisfiedDuration}, nil } // 2. Get ImagePullJob owned by this job imagePullJobsMap, err := r.getOwnedImagePullJob(job) if err != nil { return reconcile.Result{}, fmt.Errorf("failed to get imagePullJob: %v", err) } // If resourceVersion expectations have not satisfied yet, just skip this reconcile for _, imagePullJob := range imagePullJobsMap { resourceVersionExpectations.Observe(imagePullJob) if isSatisfied, unsatisfiedDuration := resourceVersionExpectations.IsSatisfied(imagePullJob); !isSatisfied { if unsatisfiedDuration >= expectations.ExpectationTimeout { klog.InfoS("Expectation unsatisfied overtime for ImageListPullJob", "imageListPullJob", request, "timeout", unsatisfiedDuration) return reconcile.Result{}, nil } klog.V(4).InfoS("Not satisfied resourceVersion for ImageListPullJob", "imageListPullJob", request) return reconcile.Result{RequeueAfter: expectations.ExpectationTimeout - unsatisfiedDuration}, nil } } // 3. Calculate the new status for this job newStatus := r.calculateStatus(job, imagePullJobsMap) // 4. Compute ImagePullJobActions needToCreate, needToDelete := r.computeImagePullJobActions(job, imagePullJobsMap, hash) // 5. Sync ImagePullJob err = r.syncImagePullJob(job, needToCreate, needToDelete) if err != nil { return reconcile.Result{}, err } // 6. Update status if !util.IsJSONObjectEqual(&job.Status, newStatus) { if err = r.updateStatus(job, newStatus); err != nil { return reconcile.Result{}, fmt.Errorf("update ImageListPullJob status error: %v", err) } } return reconcile.Result{}, nil } func (r *ReconcileImageListPullJob) refreshJobTemplateHash(job *appsv1alpha1.ImageListPullJob) (string, error) { newHash := func(job *appsv1alpha1.ImageListPullJob) string { jobTemplateHasher := fnv.New32a() hashutil.DeepHashObject(jobTemplateHasher, job.Spec.ImagePullJobTemplate) return rand.SafeEncodeString(fmt.Sprint(jobTemplateHasher.Sum32())) }(job) oldHash := job.Labels[appsv1.ControllerRevisionHashLabelKey] if newHash == oldHash { return newHash, nil } emptyJob := &appsv1alpha1.ImageListPullJob{} emptyJob.SetName(job.Name) emptyJob.SetNamespace(job.Namespace) body := fmt.Sprintf(`{"metadata":{"labels":{"%s":"%s"}}}`, appsv1.ControllerRevisionHashLabelKey, newHash) return newHash, r.Patch(context.TODO(), emptyJob, client.RawPatch(types.MergePatchType, []byte(body))) } func (r *ReconcileImageListPullJob) updateStatus(job *appsv1alpha1.ImageListPullJob, newStatus *appsv1alpha1.ImageListPullJobStatus) error { return retry.RetryOnConflict(retry.DefaultBackoff, func() error { imageListPullJob := &appsv1alpha1.ImageListPullJob{} if err := r.Get(context.TODO(), types.NamespacedName{Namespace: job.Namespace, Name: job.Name}, imageListPullJob); err != nil { return err } imageListPullJob.Status = *newStatus return r.Status().Update(context.TODO(), imageListPullJob) }) } func (r *ReconcileImageListPullJob) computeImagePullJobActions(job *appsv1alpha1.ImageListPullJob, imagePullJobs map[string]*appsv1alpha1.ImagePullJob, hash string) ([]*appsv1alpha1.ImagePullJob, []*appsv1alpha1.ImagePullJob) { var needToDelete, needToCreate []*appsv1alpha1.ImagePullJob //1. need to create images, needToDelete := r.filterImagesAndImagePullJobs(job, imagePullJobs, hash) needToCreate = r.newImagePullJobs(job, images, hash) // some images delete from ImageListPullJob.Spec.Images for image, imagePullJob := range imagePullJobs { if !slice.ContainsString(job.Spec.Images, image, nil) { needToDelete = append(needToDelete, imagePullJob) } } return needToCreate, needToDelete } func (r *ReconcileImageListPullJob) calculateStatus(job *appsv1alpha1.ImageListPullJob, imagePullJobs map[string]*appsv1alpha1.ImagePullJob) *appsv1alpha1.ImageListPullJobStatus { var active, completed, succeeded int32 // record the failed image status var failedImageStatuses []*appsv1alpha1.FailedImageStatus for _, imagePullJob := range imagePullJobs { if imagePullJob.Status.StartTime == nil { continue } if imagePullJob.Status.Active > 0 { active = active + 1 } if imagePullJob.Status.Failed > 0 { failedImagePullJobStatus := &appsv1alpha1.FailedImageStatus{ ImagePullJob: imagePullJob.Name, Name: imagePullJob.Spec.Image, Message: fmt.Sprintf("Please check for details which nodes failed by 'kubectl get ImagePullJob %s'.", imagePullJob.Name), } failedImageStatuses = append(failedImageStatuses, failedImagePullJobStatus) } if imagePullJob.Status.Desired == (imagePullJob.Status.Failed + imagePullJob.Status.Succeeded) { completed = completed + 1 } if imagePullJob.Status.Desired == imagePullJob.Status.Succeeded { succeeded = succeeded + 1 } } // 4. status newStatus := &appsv1alpha1.ImageListPullJobStatus{ Desired: int32(len(job.Spec.Images)), Active: active, Completed: completed, Succeeded: succeeded, StartTime: job.Status.StartTime, FailedImageStatuses: failedImageStatuses, } now := metav1.NewTime(r.clock.Now()) if newStatus.StartTime == nil { newStatus.StartTime = &now } if job.Spec.CompletionPolicy.Type != appsv1alpha1.Never && newStatus.Desired == newStatus.Completed { newStatus.CompletionTime = &now } return newStatus } func (r *ReconcileImageListPullJob) syncImagePullJob(job *appsv1alpha1.ImageListPullJob, needToCreate, needToDelete []*appsv1alpha1.ImagePullJob) error { //1. manage creating var errs []error if len(needToCreate) > 0 { var createdNum int var createdErr error createdNum, createdErr = util.SlowStartBatch(len(needToCreate), slowStartInitialBatchSize, func(idx int) error { imagePullJob := needToCreate[idx] key := types.NamespacedName{Namespace: job.Namespace, Name: job.Name}.String() scaleExpectations.ExpectScale(key, expectations.Create, imagePullJob.Spec.Image) err := r.Create(context.TODO(), imagePullJob) if err != nil { scaleExpectations.ObserveScale(key, expectations.Create, imagePullJob.Spec.Image) } return err }) if createdErr == nil { r.recorder.Eventf(job, corev1.EventTypeNormal, "Successful create ImagePullJob", "Create %d ImagePullJob", createdNum) } else { errs = append(errs, createdErr) } } //2. manage deleting if len(needToDelete) > 0 { var deleteErrs []error for _, imagePullJob := range needToDelete { key := types.NamespacedName{Namespace: job.Namespace, Name: job.Name}.String() scaleExpectations.ExpectScale(key, expectations.Delete, imagePullJob.Spec.Image) if err := r.Delete(context.TODO(), imagePullJob); err != nil { scaleExpectations.ObserveScale(key, expectations.Delete, imagePullJob.Spec.Image) deleteErrs = append(deleteErrs, fmt.Errorf("fail to delete ImagePullJob (%s/%s) for : %s", imagePullJob.Namespace, imagePullJob.Name, err)) } } if len(deleteErrs) > 0 { errs = append(errs, deleteErrs...) } else { r.recorder.Eventf(job, corev1.EventTypeNormal, "Successful delete ImagePullJob", "Delete %d ImagePullJob", len(needToDelete)) } } return utilerrors.NewAggregate(errs) } func (r *ReconcileImageListPullJob) filterImagesAndImagePullJobs(job *appsv1alpha1.ImageListPullJob, imagePullJobs map[string]*appsv1alpha1.ImagePullJob, hash string) ([]string, []*appsv1alpha1.ImagePullJob) { var images, imagesInCurrentImagePullJob []string var needToDelete []*appsv1alpha1.ImagePullJob for image := range imagePullJobs { imagesInCurrentImagePullJob = append(imagesInCurrentImagePullJob, image) } for _, image := range job.Spec.Images { // should create imagePullJob for new image if len(imagePullJobs) <= 0 || !slice.ContainsString(imagesInCurrentImagePullJob, image, nil) { images = append(images, image) continue } imagePullJob, ok := imagePullJobs[image] if !ok { klog.InfoS("Could not found imagePullJob for image name", "imageName", image) continue } // should create new imagePullJob if the template is changed. if !isConsistentVersion(imagePullJob, &job.Spec.ImagePullJobTemplate, hash) { images = append(images, image) // should delete old imagepulljob needToDelete = append(needToDelete, imagePullJob) } } return images, needToDelete } func (r *ReconcileImageListPullJob) newImagePullJobs(job *appsv1alpha1.ImageListPullJob, images []string, hash string) []*appsv1alpha1.ImagePullJob { var needToCreate []*appsv1alpha1.ImagePullJob if len(images) <= 0 { return needToCreate } for _, image := range images { imagePullJob := &appsv1alpha1.ImagePullJob{ ObjectMeta: metav1.ObjectMeta{ Namespace: job.Namespace, GenerateName: fmt.Sprintf("%s-", job.Name), Labels: map[string]string{ appsv1.ControllerRevisionHashLabelKey: hash, }, Annotations: make(map[string]string), OwnerReferences: []metav1.OwnerReference{ *metav1.NewControllerRef(job, controllerKind), }, }, Spec: appsv1alpha1.ImagePullJobSpec{ Image: image, ImagePullJobTemplate: job.Spec.ImagePullJobTemplate, }, } needToCreate = append(needToCreate, imagePullJob) } return needToCreate } func (r *ReconcileImageListPullJob) getOwnedImagePullJob(job *appsv1alpha1.ImageListPullJob) (map[string]*appsv1alpha1.ImagePullJob, error) { opts := &client.ListOptions{ Namespace: job.Namespace, FieldSelector: fields.SelectorFromSet(fields.Set{fieldindex.IndexNameForOwnerRefUID: string(job.UID)}), } imagePullJobList := &appsv1alpha1.ImagePullJobList{} err := r.List(context.TODO(), imagePullJobList, opts, utilclient.DisableDeepCopy) if err != nil { return nil, err } imagePullJobsMap := make(map[string]*appsv1alpha1.ImagePullJob) for i := range imagePullJobList.Items { imagePullJob := imagePullJobList.Items[i] if imagePullJob.DeletionTimestamp.IsZero() { imagePullJobsMap[imagePullJob.Spec.Image] = &imagePullJob } } return imagePullJobsMap, nil } func isConsistentVersion(oldImagePullJob *appsv1alpha1.ImagePullJob, newTemplate *appsv1alpha1.ImagePullJobTemplate, hash string) bool { oldHash, exists := oldImagePullJob.Labels[appsv1.ControllerRevisionHashLabelKey] if oldHash == hash { return true } if !exists && reflect.DeepEqual(oldImagePullJob.Spec.ImagePullJobTemplate, *newTemplate) { return true } klog.V(4).InfoS("ImagePullJob specification changed", "imagePullJob", klog.KObj(oldImagePullJob)) return false }