update object watcher metrics and simplify cluster object version recording

Signed-off-by: zach593 <zach_li@outlook.com>
This commit is contained in:
zach593 2025-03-09 00:37:55 +08:00
parent 3b6c0e0fa2
commit d226ce11f3
4 changed files with 70 additions and 57 deletions

View File

@ -212,7 +212,9 @@ func (c *Controller) cleanupPolicyClaimMetadata(ctx context.Context, work *workv
util.RemoveLabels(workload, util.ManagedResourceLabels...)
util.RemoveAnnotations(workload, util.ManagedResourceAnnotations...)
if err := c.ObjectWatcher.Update(ctx, cluster.Name, workload, clusterObj); err != nil {
operationResult, err := c.ObjectWatcher.Update(ctx, cluster.Name, workload, clusterObj)
metrics.CountUpdateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), cluster.Name, string(operationResult))
if err != nil {
klog.Errorf("Failed to update metadata in the given member cluster %v, err is %v", cluster.Name, err)
return err
}
@ -232,6 +234,7 @@ func (c *Controller) tryDeleteWorkload(ctx context.Context, clusterName string,
}
err = c.ObjectWatcher.Delete(ctx, clusterName, workload)
metrics.CountDeleteResourceFromCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName)
if err != nil {
klog.Errorf("Failed to delete resource in the given member cluster %v, err is %v", clusterName, err)
return err
@ -312,13 +315,15 @@ func (c *Controller) tryCreateOrUpdateWorkload(ctx context.Context, clusterName
return err
}
err = c.ObjectWatcher.Create(ctx, clusterName, workload)
metrics.CountCreateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName, false)
if err != nil {
return err
}
return nil
}
err = c.ObjectWatcher.Update(ctx, clusterName, workload, clusterObj)
operationResult, err := c.ObjectWatcher.Update(ctx, clusterName, workload, clusterObj)
metrics.CountUpdateResourceToCluster(err, workload.GetAPIVersion(), workload.GetKind(), clusterName, string(operationResult))
if err != nil {
return err
}

View File

@ -255,8 +255,8 @@ func (c *WorkStatusController) updateResource(ctx context.Context, observedObj *
}
if needUpdate {
updateErr := c.ObjectWatcher.Update(ctx, clusterName, desiredObj, observedObj)
metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName)
operationResult, updateErr := c.ObjectWatcher.Update(ctx, clusterName, desiredObj, observedObj)
metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName, string(operationResult))
if updateErr != nil {
klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr)
return updateErr
@ -299,7 +299,6 @@ func (c *WorkStatusController) handleDeleteEvent(ctx context.Context, key keys.F
}
reCreateErr := c.recreateResourceIfNeeded(ctx, work, key)
metrics.CountRecreateResourceToCluster(reCreateErr, key.GroupVersion().String(), key.Kind, key.Cluster)
if reCreateErr != nil {
c.updateAppliedCondition(ctx, work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error())
return reCreateErr
@ -321,6 +320,7 @@ func (c *WorkStatusController) recreateResourceIfNeeded(ctx context.Context, wor
manifest.GetName() == workloadKey.Name {
klog.Infof("Recreating resource(%s).", workloadKey.String())
err := c.ObjectWatcher.Create(ctx, workloadKey.Cluster, manifest)
metrics.CountCreateResourceToCluster(err, workloadKey.GroupVersion().String(), workloadKey.Kind, workloadKey.Cluster, true)
if err != nil {
c.eventf(manifest, corev1.EventTypeWarning, events.EventReasonSyncWorkloadFailed, "Failed to create or update resource(%s/%s) in member cluster(%s): %v", manifest.GetNamespace(), manifest.GetName(), workloadKey.Cluster, err)
return err

View File

@ -17,6 +17,7 @@ limitations under the License.
package metrics
import (
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
@ -30,8 +31,9 @@ const (
policyApplyAttemptsMetricsName = "policy_apply_attempts_total"
syncWorkDurationMetricsName = "binding_sync_work_duration_seconds"
syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds"
recreateResourceToCluster = "recreate_resource_to_cluster"
createResourceToCluster = "create_resource_to_cluster"
updateResourceToCluster = "update_resource_to_cluster"
deleteResourceFromCluster = "delete_resource_from_cluster"
policyPreemptionMetricsName = "policy_preemption_total"
cronFederatedHPADurationMetricsName = "cronfederatedhpa_process_duration_seconds"
cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
@ -69,14 +71,19 @@ var (
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
}, []string{"result"})
recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: recreateResourceToCluster,
Help: "Number of recreating operation of the resource to a target member cluster. By the result, 'error' means a resource recreated failed. Otherwise 'success'. Cluster means the target member cluster.",
}, []string{"result", "apiversion", "kind", "cluster"})
createResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: createResourceToCluster,
Help: "Number of creation operations against a target member cluster. The 'result' label indicates outcome ('success' or 'error'), 'recreate' indicates whether the operation is recreated (true/false). Labels 'apiversion', 'kind', and 'cluster' specify the resource type, API version, and target cluster respectively.",
}, []string{"result", "apiversion", "kind", "cluster", "recreate"})
updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
updateResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: updateResourceToCluster,
Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster.",
Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster. operationResult means the result of the update operation.",
}, []string{"result", "apiversion", "kind", "cluster", "operationResult"})
deleteResourceWhenSyncWork = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: deleteResourceFromCluster,
Help: "Number of deletion operations against a target member cluster. The 'result' label indicates outcome ('success' or 'error'). Labels 'apiversion', 'kind', and 'cluster' specify the resource's API version, type, and source cluster respectively.",
}, []string{"result", "apiversion", "kind", "cluster"})
policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
@ -130,14 +137,19 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
}
// CountRecreateResourceToCluster records the number of recreating operation of the resource to a target member cluster.
func CountRecreateResourceToCluster(err error, apiVersion, kind, cluster string) {
recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
// CountCreateResourceToCluster records the number of creation operations of the resource for a target member cluster.
func CountCreateResourceToCluster(err error, apiVersion, kind, cluster string, recreate bool) {
createResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster, strconv.FormatBool(recreate)).Inc()
}
// CountUpdateResourceToCluster records the number of updating operation of the resource to a target member cluster.
func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string) {
updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string, operationResult string) {
updateResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster, operationResult).Inc()
}
// CountDeleteResourceFromCluster records the number of deletion operations of the resource from a target member cluster.
func CountDeleteResourceFromCluster(err error, apiVersion, kind, cluster string) {
deleteResourceWhenSyncWork.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
}
// CountPolicyPreemption records the numbers of policy preemption.
@ -173,8 +185,9 @@ func ResourceCollectors() []prometheus.Collector {
policyApplyAttempts,
syncWorkDurationHistogram,
syncWorkloadDurationHistogram,
recreateResourceWhenSyncWorkStatus,
updateResourceWhenSyncWorkStatus,
createResourceWhenSyncWork,
updateResourceWhenSyncWork,
deleteResourceWhenSyncWork,
policyPreemptionCounter,
cronFederatedHPADurationHistogram,
cronFederatedHPARuleDurationHistogram,

View File

@ -39,10 +39,22 @@ import (
"github.com/karmada-io/karmada/pkg/util/restmapper"
)
// OperationResult is the action result of an Update call.
type OperationResult string
const (
// OperationResultNone means that the update operation was not performed and the resource has not been changed.
OperationResultNone OperationResult = "none"
// OperationResultUnchanged means that the update operation was performed but the resource did not change.
OperationResultUnchanged OperationResult = "unchanged"
// OperationResultUpdated means that an existing resource is updated.
OperationResultUpdated OperationResult = "updated"
)
// ObjectWatcher manages operations for object dispatched to member clusters.
type ObjectWatcher interface {
Create(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error
Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) error
Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) (operationResult OperationResult, err error)
Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error
NeedsUpdate(clusterName string, desiredObj, clusterObj *unstructured.Unstructured) (bool, error)
}
@ -138,43 +150,48 @@ func (o *objectWatcherImpl) retainClusterFields(desired, observed *unstructured.
return desired, nil
}
func (o *objectWatcherImpl) Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) error {
func (o *objectWatcherImpl) Update(ctx context.Context, clusterName string, desireObj, clusterObj *unstructured.Unstructured) (OperationResult, error) {
updateAllowed := o.allowUpdate(clusterName, desireObj, clusterObj)
if !updateAllowed {
// The existing resource is not managed by Karmada, and no conflict resolution found, avoid updating the existing resource by default.
return fmt.Errorf("resource(kind=%s, %s/%s) already exists in the cluster %v and the %s strategy value is empty, Karmada will not manage this resource",
return OperationResultNone, fmt.Errorf("resource(kind=%s, %s/%s) already exists in the cluster %v and the %s strategy value is empty, Karmada will not manage this resource",
desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, workv1alpha2.ResourceConflictResolutionAnnotation)
}
dynamicClusterClient, err := o.ClusterClientSetFunc(clusterName, o.KubeClientSet)
if err != nil {
klog.Errorf("Failed to build dynamic cluster client for cluster %s, err: %v.", clusterName, err)
return err
return OperationResultNone, err
}
gvr, err := restmapper.GetGroupVersionResource(o.RESTMapper, desireObj.GroupVersionKind())
if err != nil {
klog.Errorf("Failed to update the resource(kind=%s, %s/%s) in the cluster %s as mapping GVK to GVR failed: %v", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err)
return err
return OperationResultNone, err
}
desireObj, err = o.retainClusterFields(desireObj, clusterObj)
if err != nil {
klog.Errorf("Failed to retain fields for resource(kind=%s, %s/%s) in cluster %s: %v", clusterObj.GetKind(), clusterObj.GetNamespace(), clusterObj.GetName(), clusterName, err)
return err
return OperationResultNone, err
}
resource, err := dynamicClusterClient.DynamicClientSet.Resource(gvr).Namespace(desireObj.GetNamespace()).Update(ctx, desireObj, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update resource(kind=%s, %s/%s) in cluster %s, err: %v.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName, err)
return err
return OperationResultNone, err
}
klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s).", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName)
// record version
o.recordVersion(resource, clusterName)
return nil
if clusterObj.GetResourceVersion() == resource.GetResourceVersion() {
klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s) but the cluster object was not changed.", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName)
return OperationResultUnchanged, nil
}
klog.Infof("Updated the resource(kind=%s, %s/%s) on cluster(%s).", desireObj.GetKind(), desireObj.GetNamespace(), desireObj.GetName(), clusterName)
return OperationResultUpdated, nil
}
func (o *objectWatcherImpl) Delete(ctx context.Context, clusterName string, desireObj *unstructured.Unstructured) error {
@ -242,21 +259,13 @@ func (o *objectWatcherImpl) genObjectKey(obj *unstructured.Unstructured) string
func (o *objectWatcherImpl) recordVersion(clusterObj *unstructured.Unstructured, clusterName string) {
objVersion := lifted.ObjectVersion(clusterObj)
objectKey := o.genObjectKey(clusterObj)
if o.isClusterVersionRecordExist(clusterName) {
o.updateVersionRecord(clusterName, objectKey, objVersion)
} else {
o.addVersionRecord(clusterName, objectKey, objVersion)
o.Lock.Lock()
defer o.Lock.Unlock()
if o.VersionRecord[clusterName] == nil {
o.VersionRecord[clusterName] = make(map[string]string)
}
}
// isClusterVersionRecordExist checks if the version record map of given member cluster exist
func (o *objectWatcherImpl) isClusterVersionRecordExist(clusterName string) bool {
o.Lock.RLock()
defer o.Lock.RUnlock()
_, exist := o.VersionRecord[clusterName]
return exist
o.VersionRecord[clusterName][objectKey] = objVersion
}
// getVersionRecord will return the recorded version of given resource(if exist)
@ -268,20 +277,6 @@ func (o *objectWatcherImpl) getVersionRecord(clusterName, resourceName string) (
return version, exist
}
// addVersionRecord will add new version record of given resource
func (o *objectWatcherImpl) addVersionRecord(clusterName, resourceName, version string) {
o.Lock.Lock()
defer o.Lock.Unlock()
o.VersionRecord[clusterName] = map[string]string{resourceName: version}
}
// updateVersionRecord will update the recorded version of given resource
func (o *objectWatcherImpl) updateVersionRecord(clusterName, resourceName, version string) {
o.Lock.Lock()
defer o.Lock.Unlock()
o.VersionRecord[clusterName][resourceName] = version
}
// deleteVersionRecord will delete the recorded version of given resource
func (o *objectWatcherImpl) deleteVersionRecord(clusterName, resourceName string) {
o.Lock.Lock()