Merge pull request #5247 from chaosi-zju/metrics
add metrics for recreate/update resource event when sync work status
commit c38f169916

@@ -41,6 +41,7 @@ import (
 	workv1alpha1 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha1"
 	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 	"github.com/karmada-io/karmada/pkg/events"
+	"github.com/karmada-io/karmada/pkg/metrics"
 	"github.com/karmada-io/karmada/pkg/resourceinterpreter"
 	"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
 	"github.com/karmada-io/karmada/pkg/util"

@@ -239,9 +240,11 @@ func (c *WorkStatusController) syncWorkStatus(key util.QueueKey) error {
 	}
 
 	if needUpdate {
-		if err := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj); err != nil {
-			klog.Errorf("Updating %s failed: %v", fedKey.String(), err)
-			return err
+		updateErr := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj)
+		metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName)
+		if updateErr != nil {
+			klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr)
+			return updateErr
 		}
 		// We can't return even after a success updates, because that might lose the chance to collect status.
 		// Not all updates are real, they might be no change, in that case there will be no more event for this update,
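
The hunk above restructures the error handling so the counter is incremented on both the success and failure paths: the error is captured into updateErr, counted, and only then propagated. A minimal self-contained sketch of that pattern, assuming utilmetrics.GetResultByError maps a nil error to "success" and anything else to "error"; resultByError and countUpdate below are illustrative stand-ins, not names from the Karmada tree:

```go
package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// updateCounter mirrors the shape of updateResourceWhenSyncWorkStatus:
// one time series per result/apiversion/kind/cluster combination.
var updateCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "update_resource_to_cluster",
	Help: "Number of update operations of a resource to a target member cluster.",
}, []string{"result", "apiversion", "kind", "cluster"})

// resultByError stands in for utilmetrics.GetResultByError (assumed behavior):
// a nil error maps to "success", anything else to "error".
func resultByError(err error) string {
	if err != nil {
		return "error"
	}
	return "success"
}

// countUpdate plays the role of metrics.CountUpdateResourceToCluster.
func countUpdate(err error, apiVersion, kind, cluster string) {
	updateCounter.WithLabelValues(resultByError(err), apiVersion, kind, cluster).Inc()
}

func main() {
	prometheus.MustRegister(updateCounter)

	// Capture the error first, count unconditionally, then propagate:
	// the control flow the syncWorkStatus hunk switches to.
	updateErr := errors.New("connection refused")
	countUpdate(updateErr, "apps/v1", "Deployment", "member1")
	if updateErr != nil {
		fmt.Println("update failed but was still counted:", updateErr)
	}
	countUpdate(nil, "apps/v1", "Deployment", "member2")
}
```
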
@@ -283,6 +286,7 @@ func (c *WorkStatusController) handleDeleteEvent(key keys.FederatedKey) error {
 	}
 
 	reCreateErr := c.recreateResourceIfNeeded(work, key)
+	metrics.CountRecreateResourceToCluster(reCreateErr, key.GroupVersion().String(), key.Kind, key.Cluster)
 	if reCreateErr != nil {
 		c.updateAppliedCondition(work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error())
 		return reCreateErr
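
For the apiversion label recorded here, key.GroupVersion() presumably yields an apimachinery schema.GroupVersion, whose String() renders "group/version" for named groups and a bare "version" for the core group. A quick sketch of that rendering (the sample values are illustrative):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/runtime/schema"
)

func main() {
	// Named API groups render as "group/version" ...
	fmt.Println(schema.GroupVersion{Group: "apps", Version: "v1"}.String()) // apps/v1
	// ... while the core group renders as a bare version, so the apiversion
	// label of a recreated ConfigMap would be just "v1".
	fmt.Println(schema.GroupVersion{Version: "v1"}.String()) // v1
}
```
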
@@ -30,6 +30,8 @@ const (
 	policyApplyAttemptsMetricsName          = "policy_apply_attempts_total"
 	syncWorkDurationMetricsName             = "binding_sync_work_duration_seconds"
 	syncWorkloadDurationMetricsName         = "work_sync_workload_duration_seconds"
+	recreateResourceToCluster               = "recreate_resource_to_cluster"
+	updateResourceToCluster                 = "update_resource_to_cluster"
 	policyPreemptionMetricsName             = "policy_preemption_total"
 	cronFederatedHPADurationMetricsName     = "cronfederatedhpa_process_duration_seconds"
 	cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"

@@ -67,6 +69,16 @@ var (
 		Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
 	}, []string{"result"})
 
+	recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: recreateResourceToCluster,
+		Help: "Number of recreating operation of the resource to a target member cluster. By the result, 'error' means a resource recreated failed. Otherwise 'success'. Cluster means the target member cluster.",
+	}, []string{"result", "apiversion", "kind", "cluster"})
+
+	updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: updateResourceToCluster,
+		Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster.",
+	}, []string{"result", "apiversion", "kind", "cluster"})
+
 	policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: policyPreemptionMetricsName,
 		Help: "Number of preemption for the resource template. By the result, 'error' means a resource template failed to be preempted by other propagation policies. Otherwise 'success'.",
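
Each of the two new CounterVecs keys its samples on four labels, so every result/apiversion/kind/cluster combination becomes its own time series. A hedged sketch of recording and reading back one such series with client_golang's testutil helper; only the metric name and label set are taken from the diff, the rest is illustrative:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// recreate mirrors recreateResourceWhenSyncWorkStatus from the diff.
	recreate := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "recreate_resource_to_cluster",
		Help: "Number of recreate operations of a resource to a target member cluster.",
	}, []string{"result", "apiversion", "kind", "cluster"})

	// One failed recreation of a Deployment on cluster member1.
	recreate.WithLabelValues("error", "apps/v1", "Deployment", "member1").Inc()

	// testutil.ToFloat64 reads back exactly one series, handy in unit tests.
	fmt.Println(testutil.ToFloat64(
		recreate.WithLabelValues("error", "apps/v1", "Deployment", "member1"))) // 1
}
```
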
@@ -118,6 +130,16 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
 	syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
 }
 
+// CountRecreateResourceToCluster records the number of recreating operation of the resource to a target member cluster.
+func CountRecreateResourceToCluster(err error, apiVersion, kind, cluster string) {
+	recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
+}
+
+// CountUpdateResourceToCluster records the number of updating operation of the resource to a target member cluster.
+func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string) {
+	updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
+}
+
 // CountPolicyPreemption records the numbers of policy preemption.
 func CountPolicyPreemption(err error) {
 	policyPreemptionCounter.WithLabelValues(utilmetrics.GetResultByError(err)).Inc()

@@ -151,6 +173,8 @@ func ResourceCollectors() []prometheus.Collector {
 		policyApplyAttempts,
 		syncWorkDurationHistogram,
 		syncWorkloadDurationHistogram,
+		recreateResourceWhenSyncWorkStatus,
+		updateResourceWhenSyncWorkStatus,
 		policyPreemptionCounter,
 		cronFederatedHPADurationHistogram,
 		cronFederatedHPARuleDurationHistogram,
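
Appending the two collectors to ResourceCollectors() is what gets them exported: whoever registers that list exposes the new counters on its metrics endpoint. A minimal sketch of such wiring; resourceCollectors here is an illustrative stand-in for the real function, and the registry and port are assumptions:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// resourceCollectors stands in for metrics.ResourceCollectors().
func resourceCollectors() []prometheus.Collector {
	return []prometheus.Collector{
		prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "update_resource_to_cluster",
			Help: "Number of update operations of a resource to a target member cluster.",
		}, []string{"result", "apiversion", "kind", "cluster"}),
	}
}

func main() {
	// MustRegister is variadic, so the whole collector list registers in one call.
	registry := prometheus.NewRegistry()
	registry.MustRegister(resourceCollectors()...)

	// Expose the registry; scraping /metrics now includes the new counters.
	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}
```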