diff --git a/pkg/controllers/execution/execution_controller.go b/pkg/controllers/execution/execution_controller.go index bb60ec883..7c0ee9ca3 100644 --- a/pkg/controllers/execution/execution_controller.go +++ b/pkg/controllers/execution/execution_controller.go @@ -212,7 +212,9 @@ func (c *Controller) cleanupPolicyClaimMetadata(ctx context.Context, work *workv util.RemoveLabels(workload, util.ManagedResourceLabels...) util.RemoveAnnotations(workload, util.ManagedResourceAnnotations...) - if err := c.ObjectWatcher.Update(ctx, cluster.Name, workload, clusterObj); err != nil { + operationResult, err := c.ObjectWatcher.Update(ctx, cluster.Name, workload, clusterObj) + metrics.CountUpdateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), cluster.Name, string(operationResult)) + if err != nil { klog.Errorf("Failed to update metadata in the given member cluster %v, err is %v", cluster.Name, err) return err } @@ -232,6 +234,7 @@ func (c *Controller) tryDeleteWorkload(ctx context.Context, clusterName string, } err = c.ObjectWatcher.Delete(ctx, clusterName, workload) + metrics.CountDeleteResourceFromCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName) if err != nil { klog.Errorf("Failed to delete resource in the given member cluster %v, err is %v", clusterName, err) return err @@ -312,13 +315,15 @@ func (c *Controller) tryCreateOrUpdateWorkload(ctx context.Context, clusterName return err } err = c.ObjectWatcher.Create(ctx, clusterName, workload) + metrics.CountCreateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName, false) if err != nil { return err } return nil } - err = c.ObjectWatcher.Update(ctx, clusterName, workload, clusterObj) + operationResult, err := c.ObjectWatcher.Update(ctx, clusterName, workload, clusterObj) + metrics.CountUpdateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName, string(operationResult)) if err != nil { return err } diff --git a/pkg/controllers/status/work_status_controller.go b/pkg/controllers/status/work_status_controller.go index af02fab27..de2983bbb 100644 --- a/pkg/controllers/status/work_status_controller.go +++ b/pkg/controllers/status/work_status_controller.go @@ -255,8 +255,8 @@ func (c *WorkStatusController) updateResource(ctx context.Context, observedObj * } if needUpdate { - updateErr := c.ObjectWatcher.Update(ctx, clusterName, desiredObj, observedObj) - metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName) + operationResult, updateErr := c.ObjectWatcher.Update(ctx, clusterName, desiredObj, observedObj) + metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName, string(operationResult)) if updateErr != nil { klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr) return updateErr @@ -299,7 +299,6 @@ func (c *WorkStatusController) handleDeleteEvent(ctx context.Context, key keys.F } reCreateErr := c.recreateResourceIfNeeded(ctx, work, key) - metrics.CountRecreateResourceToCluster(reCreateErr, key.GroupVersion().String(), key.Kind, key.Cluster) if reCreateErr != nil { c.updateAppliedCondition(ctx, work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error()) return reCreateErr @@ -321,6 +320,7 @@ func (c *WorkStatusController) recreateResourceIfNeeded(ctx context.Context, wor manifest.GetName() == workloadKey.Name { klog.Infof("Recreating resource(%s).", workloadKey.String()) err := c.ObjectWatcher.Create(ctx, workloadKey.Cluster, manifest) + metrics.CountCreateResourceToCluster(err, workloadKey.GroupVersion().String(), workloadKey.Kind, workloadKey.Cluster, true) if err != nil { c.eventf(manifest, corev1.EventTypeWarning, events.EventReasonSyncWorkloadFailed, "Failed to create or update resource(%s/%s) in member cluster(%s): %v", manifest.GetNamespace(), manifest.GetName(), workloadKey.Cluster, err) return err diff --git a/pkg/metrics/resource.go b/pkg/metrics/resource.go index a4c0caa99..4b7a7f037 100644 --- a/pkg/metrics/resource.go +++ b/pkg/metrics/resource.go @@ -17,6 +17,7 @@ limitations under the License. package metrics import ( + "strconv" "time" "github.com/prometheus/client_golang/prometheus" @@ -30,8 +31,9 @@ const ( policyApplyAttemptsMetricsName = "policy_apply_attempts_total" syncWorkDurationMetricsName = "binding_sync_work_duration_seconds" syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds" - recreateResourceToCluster = "recreate_resource_to_cluster" + createResourceToCluster = "create_resource_to_cluster" updateResourceToCluster = "update_resource_to_cluster" + deleteResourceFromCluster = "delete_resource_from_cluster" policyPreemptionMetricsName = "policy_preemption_total" cronFederatedHPADurationMetricsName = "cronfederatedhpa_process_duration_seconds" cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds" @@ -69,14 +71,19 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 12), }, []string{"result"}) - recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: recreateResourceToCluster, - Help: "Number of recreating operation of the resource to a target member cluster. By the result, 'error' means a resource recreated failed. Otherwise 'success'. Cluster means the target member cluster.", - }, []string{"result", "apiversion", "kind", "cluster"}) + createResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: createResourceToCluster, + Help: "Number of creation operations against a target member cluster. The 'result' label indicates outcome ('success' or 'error'), 'recreate' indicates whether the operation is recreated (true/false). Labels 'apiversion', 'kind', and 'cluster' specify the resource type, API version, and target cluster respectively.", + }, []string{"result", "apiversion", "kind", "cluster", "recreate"}) - updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{ + updateResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: updateResourceToCluster, - Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster.", + Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster. operationResult means the result of the update operation.", + }, []string{"result", "apiversion", "kind", "cluster", "operationResult"}) + + deleteResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: deleteResourceFromCluster, + Help: "Number of deletion operations against a target member cluster. The 'result' label indicates outcome ('success' or 'error'). Labels 'apiversion', 'kind', and 'cluster' specify the resource's API version, type, and source cluster respectively.", }, []string{"result", "apiversion", "kind", "cluster"}) policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ @@ -130,14 +137,19 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) { syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start)) } -// CountRecreateResourceToCluster records the number of recreating operation of the resource to a target member cluster. -func CountRecreateResourceToCluster(err error, apiVersion, kind, cluster string) { - recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc() +// CountCreateResourceToCluster records the number of creation operations of the resource for a target member cluster. +func CountCreateResourceToCluster(err error, apiVersion, kind, cluster string, recreate bool) { + createResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster, strconv.FormatBool(recreate)).Inc() } // CountUpdateResourceToCluster records the number of updating operation of the resource to a target member cluster. -func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string) { - updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc() +func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string, operationResult string) { + updateResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster, operationResult).Inc() +} + +// CountDeleteResourceFromCluster records the number of deletion operations of the resource from a target member cluster. +func CountDeleteResourceFromCluster(err error, apiVersion, kind, cluster string) { + deleteResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc() } // CountPolicyPreemption records the numbers of policy preemption. @@ -173,8 +185,9 @@ func ResourceCollectors() []prometheus.Collector { policyApplyAttempts, syncWorkDurationHistogram, syncWorkloadDurationHistogram, - recreateResourceWhenSyncWorkStatus, - updateResourceWhenSyncWorkStatus, + createResourceWhenSyncWork, + updateResourceWhenSyncWork, + deleteResourceWhenSyncWork, policyPreemptionCounter, cronFederatedHPADurationHistogram, cronFederatedHPARuleDurationHistogram, diff --git a/pkg/util/objectwatcher/objectwatcher.go b/pkg/util/objectwatcher/objectwatcher.go index 47a98560e..6b059c684 100644 --- a/pkg/util/objectwatcher/objectwatcher.go +++ b/pkg/util/objectwatcher/objectwatcher.go @@ -39,10 +39,22 @@ import ( "github.com/karmada-io/karmada/pkg/util/restmapper" ) +// OperationResult is the action result of an Update call. +type OperationResult string + +const ( + // OperationResultNone means that the update operation was not performed and the resource has not been changed. + OperationResultNone OperationResult = "none" + // OperationResultUnchanged means that the update operation was performed but the resource did not change. + OperationResultUnchanged OperationResult = "unchanged" + // OperationResultUpdated means that an existing resource is updated. + OperationResultUpdated OperationResult = "updated" +) + // ObjectWatcher manages operations for object dispatched to member clusters. type ObjectWatcher interface { Create(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error - Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) error + Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) (operationResult OperationResult, err error) Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error NeedsUpdate(clusterName string, desiredObj, clusterObj *unstructured.Unstructured) (bool, error) } @@ -138,43 +150,48 @@ func (o *objectWatcherImpl) retainClusterFields(desired, observed *unstructured. return desired, nil } -func (o *objectWatcherImpl) Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) error { +func (o *objectWatcherImpl) Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) (OperationResult, error) { updateAllowed := o.allowUpdate(clusterName, desireObj, clusterObj) if !updateAllowed { // The existing resource is not managed by Karmada, and no conflict resolution found, avoid updating the existing resource by default. - return fmt.Errorf("resource(kind=%s, %s/%s) already exists in the cluster %v and the %s strategy value is empty, Karmada will not manage this resource", + return OperationResultNone, fmt.Errorf("resource(kind=%s, %s/%s) already exists in the cluster %v and the %s strategy value is empty, Karmada will not manage this resource", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, workv1alpha2.ResourceConflictResolutionAnnotation) } dynamicClusterClient, err := o.ClusterClientSetFunc(clusterName, o.KubeClientSet) if err != nil { klog.Errorf("Failed to build dynamic cluster client for cluster %s, err: %v.", clusterName, err) - return err + return OperationResultNone, err } gvr, err := restmapper.GetGroupVersionResource(o.RESTMapper, desireObj.GroupVersionKind()) if err != nil { klog.Errorf("Failed to update the resource(kind=%s, %s/%s) in the cluster %s as mapping GVK to GVR failed: %v", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err) - return err + return OperationResultNone, err } desireObj, err = o.retainClusterFields(desireObj, clusterObj) if err != nil { klog.Errorf("Failed to retain fields for resource(kind=%s, %s/%s) in cluster %s: %v", clusterObj.GetKind(), clusterObj.GetNamespace(), clusterObj.GetName(), clusterName, err) - return err + return OperationResultNone, err } resource, err := dynamicClusterClient.DynamicClientSet.Resource(gvr).Namespace(desireObj.GetNamespace()).Update(ctx, desireObj, metav1.UpdateOptions{}) if err != nil { klog.Errorf("Failed to update resource(kind=%s, %s/%s) in cluster %s, err: %v.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err) - return err + return OperationResultNone, err } - klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s).", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName) - // record version o.recordVersion(resource, clusterName) - return nil + + if clusterObj.GetResourceVersion() == resource.GetResourceVersion() { + klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s) but the cluster object was not changed.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName) + return OperationResultUnchanged, nil + } + + klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s).", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName) + return OperationResultUpdated, nil } func (o *objectWatcherImpl) Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error { @@ -242,21 +259,13 @@ func (o *objectWatcherImpl) genObjectKey(obj *unstructured.Unstructured) string func (o *objectWatcherImpl) recordVersion(clusterObj *unstructured.Unstructured, clusterName string) { objVersion := lifted.ObjectVersion(clusterObj) objectKey := o.genObjectKey(clusterObj) - if o.isClusterVersionRecordExist(clusterName) { - o.updateVersionRecord(clusterName, objectKey, objVersion) - } else { - o.addVersionRecord(clusterName, objectKey, objVersion) + + o.Lock.Lock() + defer o.Lock.Unlock() + if o.VersionRecord[clusterName] == nil { + o.VersionRecord[clusterName] = make(map[string]string) } -} - -// isClusterVersionRecordExist checks if the version record map of given member cluster exist -func (o *objectWatcherImpl) isClusterVersionRecordExist(clusterName string) bool { - o.Lock.RLock() - defer o.Lock.RUnlock() - - _, exist := o.VersionRecord[clusterName] - - return exist + o.VersionRecord[clusterName][objectKey] = objVersion } // getVersionRecord will return the recorded version of given resource(if exist) @@ -268,20 +277,6 @@ func (o *objectWatcherImpl) getVersionRecord(clusterName, resourceName string) ( return version, exist } -// addVersionRecord will add new version record of given resource -func (o *objectWatcherImpl) addVersionRecord(clusterName, resourceName, version string) { - o.Lock.Lock() - defer o.Lock.Unlock() - o.VersionRecord[clusterName] = map[string]string{resourceName: version} -} - -// updateVersionRecord will update the recorded version of given resource -func (o *objectWatcherImpl) updateVersionRecord(clusterName, resourceName, version string) { - o.Lock.Lock() - defer o.Lock.Unlock() - o.VersionRecord[clusterName][resourceName] = version -} - // deleteVersionRecord will delete the recorded version of given resource func (o *objectWatcherImpl) deleteVersionRecord(clusterName, resourceName string) { o.Lock.Lock()