Merge pull request #6148 from ctripcloud/object-watcher-metrics

update object watcher metrics and simplify cluster object version recording
This commit is contained in:
karmada-bot 2025-03-10 14:12:31 +08:00 committed by GitHub
commit 178f193ec1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 70 additions and 57 deletions

View File

@ -212,7 +212,9 @@ func (c *Controller) cleanupPolicyClaimMetadata(ctx context.Context, work *workv
util.RemoveLabels(workload, util.ManagedResourceLabels...) util.RemoveLabels(workload, util.ManagedResourceLabels...)
util.RemoveAnnotations(workload, util.ManagedResourceAnnotations...) util.RemoveAnnotations(workload, util.ManagedResourceAnnotations...)
if err := c.ObjectWatcher.Update(ctx, cluster.Name, workload, clusterObj); err != nil { operationResult, err := c.ObjectWatcher.Update(ctx, cluster.Name, workload, clusterObj)
metrics.CountUpdateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), cluster.Name, string(operationResult))
if err != nil {
klog.Errorf("Failed to update metadata in the given member cluster %v, err is %v", cluster.Name, err) klog.Errorf("Failed to update metadata in the given member cluster %v, err is %v", cluster.Name, err)
return err return err
} }
@ -232,6 +234,7 @@ func (c *Controller) tryDeleteWorkload(ctx context.Context, clusterName string,
} }
err = c.ObjectWatcher.Delete(ctx, clusterName, workload) err = c.ObjectWatcher.Delete(ctx, clusterName, workload)
metrics.CountDeleteResourceFromCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName)
if err != nil { if err != nil {
klog.Errorf("Failed to delete resource in the given member cluster %v, err is %v", clusterName, err) klog.Errorf("Failed to delete resource in the given member cluster %v, err is %v", clusterName, err)
return err return err
@ -312,13 +315,15 @@ func (c *Controller) tryCreateOrUpdateWorkload(ctx context.Context, clusterName
return err return err
} }
err = c.ObjectWatcher.Create(ctx, clusterName, workload) err = c.ObjectWatcher.Create(ctx, clusterName, workload)
metrics.CountCreateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName, false)
if err != nil { if err != nil {
return err return err
} }
return nil return nil
} }
err = c.ObjectWatcher.Update(ctx, clusterName, workload, clusterObj) operationResult, err := c.ObjectWatcher.Update(ctx, clusterName, workload, clusterObj)
metrics.CountUpdateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName, string(operationResult))
if err != nil { if err != nil {
return err return err
} }

View File

@ -255,8 +255,8 @@ func (c *WorkStatusController) updateResource(ctx context.Context, observedObj *
} }
if needUpdate { if needUpdate {
updateErr := c.ObjectWatcher.Update(ctx, clusterName, desiredObj, observedObj) operationResult, updateErr := c.ObjectWatcher.Update(ctx, clusterName, desiredObj, observedObj)
metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName) metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName, string(operationResult))
if updateErr != nil { if updateErr != nil {
klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr) klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr)
return updateErr return updateErr
@ -299,7 +299,6 @@ func (c *WorkStatusController) handleDeleteEvent(ctx context.Context, key keys.F
} }
reCreateErr := c.recreateResourceIfNeeded(ctx, work, key) reCreateErr := c.recreateResourceIfNeeded(ctx, work, key)
metrics.CountRecreateResourceToCluster(reCreateErr, key.GroupVersion().String(), key.Kind, key.Cluster)
if reCreateErr != nil { if reCreateErr != nil {
c.updateAppliedCondition(ctx, work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error()) c.updateAppliedCondition(ctx, work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error())
return reCreateErr return reCreateErr
@ -321,6 +320,7 @@ func (c *WorkStatusController) recreateResourceIfNeeded(ctx context.Context, wor
manifest.GetName() == workloadKey.Name { manifest.GetName() == workloadKey.Name {
klog.Infof("Recreating resource(%s).", workloadKey.String()) klog.Infof("Recreating resource(%s).", workloadKey.String())
err := c.ObjectWatcher.Create(ctx, workloadKey.Cluster, manifest) err := c.ObjectWatcher.Create(ctx, workloadKey.Cluster, manifest)
metrics.CountCreateResourceToCluster(err, workloadKey.GroupVersion().String(), workloadKey.Kind, workloadKey.Cluster, true)
if err != nil { if err != nil {
c.eventf(manifest, corev1.EventTypeWarning, events.EventReasonSyncWorkloadFailed, "Failed to create or update resource(%s/%s) in member cluster(%s): %v", manifest.GetNamespace(), manifest.GetName(), workloadKey.Cluster, err) c.eventf(manifest, corev1.EventTypeWarning, events.EventReasonSyncWorkloadFailed, "Failed to create or update resource(%s/%s) in member cluster(%s): %v", manifest.GetNamespace(), manifest.GetName(), workloadKey.Cluster, err)
return err return err

View File

@ -17,6 +17,7 @@ limitations under the License.
package metrics package metrics
import ( import (
"strconv"
"time" "time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
@ -30,8 +31,9 @@ const (
policyApplyAttemptsMetricsName = "policy_apply_attempts_total" policyApplyAttemptsMetricsName = "policy_apply_attempts_total"
syncWorkDurationMetricsName = "binding_sync_work_duration_seconds" syncWorkDurationMetricsName = "binding_sync_work_duration_seconds"
syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds" syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds"
recreateResourceToCluster = "recreate_resource_to_cluster" createResourceToCluster = "create_resource_to_cluster"
updateResourceToCluster = "update_resource_to_cluster" updateResourceToCluster = "update_resource_to_cluster"
deleteResourceFromCluster = "delete_resource_from_cluster"
policyPreemptionMetricsName = "policy_preemption_total" policyPreemptionMetricsName = "policy_preemption_total"
cronFederatedHPADurationMetricsName = "cronfederatedhpa_process_duration_seconds" cronFederatedHPADurationMetricsName = "cronfederatedhpa_process_duration_seconds"
cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds" cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
@ -69,14 +71,19 @@ var (
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12), Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
}, []string{"result"}) }, []string{"result"})
recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{ createResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: recreateResourceToCluster, Name: createResourceToCluster,
Help: "Number of recreating operation of the resource to a target member cluster. By the result, 'error' means a resource recreated failed. Otherwise 'success'. Cluster means the target member cluster.", Help: "Number of creation operations against a target member cluster. The 'result' label indicates outcome ('success' or 'error'), 'recreate' indicates whether the operation is recreated (true/false). Labels 'apiversion', 'kind', and 'cluster' specify the resource type, API version, and target cluster respectively.",
}, []string{"result", "apiversion", "kind", "cluster"}) }, []string{"result", "apiversion", "kind", "cluster", "recreate"})
updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{ updateResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: updateResourceToCluster, Name: updateResourceToCluster,
Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster.", Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster. operationResult means the result of the update operation.",
}, []string{"result", "apiversion", "kind", "cluster", "operationResult"})
deleteResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: deleteResourceFromCluster,
Help: "Number of deletion operations against a target member cluster. The 'result' label indicates outcome ('success' or 'error'). Labels 'apiversion', 'kind', and 'cluster' specify the resource's API version, type, and source cluster respectively.",
}, []string{"result", "apiversion", "kind", "cluster"}) }, []string{"result", "apiversion", "kind", "cluster"})
policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
@ -130,14 +137,19 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start)) syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
} }
// CountRecreateResourceToCluster records the number of recreating operation of the resource to a target member cluster. // CountCreateResourceToCluster records the number of creation operations of the resource for a target member cluster.
func CountRecreateResourceToCluster(err error, apiVersion, kind, cluster string) { func CountCreateResourceToCluster(err error, apiVersion, kind, cluster string, recreate bool) {
recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc() createResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster, strconv.FormatBool(recreate)).Inc()
} }
// CountUpdateResourceToCluster records the number of updating operation of the resource to a target member cluster. // CountUpdateResourceToCluster records the number of updating operation of the resource to a target member cluster.
func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string) { func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string, operationResult string) {
updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc() updateResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster, operationResult).Inc()
}
// CountDeleteResourceFromCluster records the number of deletion operations of the resource from a target member cluster.
func CountDeleteResourceFromCluster(err error, apiVersion, kind, cluster string) {
deleteResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
} }
// CountPolicyPreemption records the numbers of policy preemption. // CountPolicyPreemption records the numbers of policy preemption.
@ -173,8 +185,9 @@ func ResourceCollectors() []prometheus.Collector {
policyApplyAttempts, policyApplyAttempts,
syncWorkDurationHistogram, syncWorkDurationHistogram,
syncWorkloadDurationHistogram, syncWorkloadDurationHistogram,
recreateResourceWhenSyncWorkStatus, createResourceWhenSyncWork,
updateResourceWhenSyncWorkStatus, updateResourceWhenSyncWork,
deleteResourceWhenSyncWork,
policyPreemptionCounter, policyPreemptionCounter,
cronFederatedHPADurationHistogram, cronFederatedHPADurationHistogram,
cronFederatedHPARuleDurationHistogram, cronFederatedHPARuleDurationHistogram,

View File

@ -39,10 +39,22 @@ import (
"github.com/karmada-io/karmada/pkg/util/restmapper" "github.com/karmada-io/karmada/pkg/util/restmapper"
) )
// OperationResult is the action result of an Update call.
type OperationResult string
const (
// OperationResultNone means that the update operation was not performed and the resource has not been changed.
OperationResultNone OperationResult = "none"
// OperationResultUnchanged means that the update operation was performed but the resource did not change.
OperationResultUnchanged OperationResult = "unchanged"
// OperationResultUpdated means that an existing resource is updated.
OperationResultUpdated OperationResult = "updated"
)
// ObjectWatcher manages operations for object dispatched to member clusters. // ObjectWatcher manages operations for object dispatched to member clusters.
type ObjectWatcher interface { type ObjectWatcher interface {
Create(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error Create(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error
Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) error Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) (operationResult OperationResult, err error)
Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error
NeedsUpdate(clusterName string, desiredObj, clusterObj *unstructured.Unstructured) (bool, error) NeedsUpdate(clusterName string, desiredObj, clusterObj *unstructured.Unstructured) (bool, error)
} }
@ -138,43 +150,48 @@ func (o *objectWatcherImpl) retainClusterFields(desired, observed *unstructured.
return desired, nil return desired, nil
} }
func (o *objectWatcherImpl) Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) error { func (o *objectWatcherImpl) Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) (OperationResult, error) {
updateAllowed := o.allowUpdate(clusterName, desireObj, clusterObj) updateAllowed := o.allowUpdate(clusterName, desireObj, clusterObj)
if !updateAllowed { if !updateAllowed {
// The existing resource is not managed by Karmada, and no conflict resolution found, avoid updating the existing resource by default. // The existing resource is not managed by Karmada, and no conflict resolution found, avoid updating the existing resource by default.
return fmt.Errorf("resource(kind=%s, %s/%s) already exists in the cluster %v and the %s strategy value is empty, Karmada will not manage this resource", return OperationResultNone, fmt.Errorf("resource(kind=%s, %s/%s) already exists in the cluster %v and the %s strategy value is empty, Karmada will not manage this resource",
desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, workv1alpha2.ResourceConflictResolutionAnnotation) desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, workv1alpha2.ResourceConflictResolutionAnnotation)
} }
dynamicClusterClient, err := o.ClusterClientSetFunc(clusterName, o.KubeClientSet) dynamicClusterClient, err := o.ClusterClientSetFunc(clusterName, o.KubeClientSet)
if err != nil { if err != nil {
klog.Errorf("Failed to build dynamic cluster client for cluster %s, err: %v.", clusterName, err) klog.Errorf("Failed to build dynamic cluster client for cluster %s, err: %v.", clusterName, err)
return err return OperationResultNone, err
} }
gvr, err := restmapper.GetGroupVersionResource(o.RESTMapper, desireObj.GroupVersionKind()) gvr, err := restmapper.GetGroupVersionResource(o.RESTMapper, desireObj.GroupVersionKind())
if err != nil { if err != nil {
klog.Errorf("Failed to update the resource(kind=%s, %s/%s) in the cluster %s as mapping GVK to GVR failed: %v", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err) klog.Errorf("Failed to update the resource(kind=%s, %s/%s) in the cluster %s as mapping GVK to GVR failed: %v", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err)
return err return OperationResultNone, err
} }
desireObj, err = o.retainClusterFields(desireObj, clusterObj) desireObj, err = o.retainClusterFields(desireObj, clusterObj)
if err != nil { if err != nil {
klog.Errorf("Failed to retain fields for resource(kind=%s, %s/%s) in cluster %s: %v", clusterObj.GetKind(), clusterObj.GetNamespace(), clusterObj.GetName(), clusterName, err) klog.Errorf("Failed to retain fields for resource(kind=%s, %s/%s) in cluster %s: %v", clusterObj.GetKind(), clusterObj.GetNamespace(), clusterObj.GetName(), clusterName, err)
return err return OperationResultNone, err
} }
resource, err := dynamicClusterClient.DynamicClientSet.Resource(gvr).Namespace(desireObj.GetNamespace()).Update(ctx, desireObj, metav1.UpdateOptions{}) resource, err := dynamicClusterClient.DynamicClientSet.Resource(gvr).Namespace(desireObj.GetNamespace()).Update(ctx, desireObj, metav1.UpdateOptions{})
if err != nil { if err != nil {
klog.Errorf("Failed to update resource(kind=%s, %s/%s) in cluster %s, err: %v.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err) klog.Errorf("Failed to update resource(kind=%s, %s/%s) in cluster %s, err: %v.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err)
return err return OperationResultNone, err
} }
klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s).", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName)
// record version // record version
o.recordVersion(resource, clusterName) o.recordVersion(resource, clusterName)
return nil
if clusterObj.GetResourceVersion() == resource.GetResourceVersion() {
klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s) but the cluster object was not changed.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName)
return OperationResultUnchanged, nil
}
klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s).", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName)
return OperationResultUpdated, nil
} }
func (o *objectWatcherImpl) Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error { func (o *objectWatcherImpl) Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error {
@ -242,21 +259,13 @@ func (o *objectWatcherImpl) genObjectKey(obj *unstructured.Unstructured) string
func (o *objectWatcherImpl) recordVersion(clusterObj *unstructured.Unstructured, clusterName string) { func (o *objectWatcherImpl) recordVersion(clusterObj *unstructured.Unstructured, clusterName string) {
objVersion := lifted.ObjectVersion(clusterObj) objVersion := lifted.ObjectVersion(clusterObj)
objectKey := o.genObjectKey(clusterObj) objectKey := o.genObjectKey(clusterObj)
if o.isClusterVersionRecordExist(clusterName) {
o.updateVersionRecord(clusterName, objectKey, objVersion) o.Lock.Lock()
} else { defer o.Lock.Unlock()
o.addVersionRecord(clusterName, objectKey, objVersion) if o.VersionRecord[clusterName] == nil {
o.VersionRecord[clusterName] = make(map[string]string)
} }
} o.VersionRecord[clusterName][objectKey] = objVersion
// isClusterVersionRecordExist checks if the version record map of given member cluster exist
func (o *objectWatcherImpl) isClusterVersionRecordExist(clusterName string) bool {
o.Lock.RLock()
defer o.Lock.RUnlock()
_, exist := o.VersionRecord[clusterName]
return exist
} }
// getVersionRecord will return the recorded version of given resource(if exist) // getVersionRecord will return the recorded version of given resource(if exist)
@ -268,20 +277,6 @@ func (o *objectWatcherImpl) getVersionRecord(clusterName, resourceName string) (
return version, exist return version, exist
} }
// addVersionRecord will add new version record of given resource
func (o *objectWatcherImpl) addVersionRecord(clusterName, resourceName, version string) {
o.Lock.Lock()
defer o.Lock.Unlock()
o.VersionRecord[clusterName] = map[string]string{resourceName: version}
}
// updateVersionRecord will update the recorded version of given resource
func (o *objectWatcherImpl) updateVersionRecord(clusterName, resourceName, version string) {
o.Lock.Lock()
defer o.Lock.Unlock()
o.VersionRecord[clusterName][resourceName] = version
}
// deleteVersionRecord will delete the recorded version of given resource // deleteVersionRecord will delete the recorded version of given resource
func (o *objectWatcherImpl) deleteVersionRecord(clusterName, resourceName string) { func (o *objectWatcherImpl) deleteVersionRecord(clusterName, resourceName string) {
o.Lock.Lock() o.Lock.Lock()