Merge pull request #3867 from whitewindmills/preempt-metrics-events

feat: add metrics and events for policy preemption
This commit is contained in:
karmada-bot 2023-08-02 15:33:45 +08:00 committed by GitHub
commit 946fc72fad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 58 additions and 6 deletions

View File

@ -1,13 +1,16 @@
package detector
import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"
policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
"github.com/karmada-io/karmada/pkg/events"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/metrics"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/helper"
"github.com/karmada-io/karmada/pkg/util/names"
@ -74,7 +77,7 @@ func (d *ResourceDetector) handleClusterPropagationPolicyPreemption(policy *poli
}
// preemptPropagationPolicy preempts resource template that is claimed by PropagationPolicy.
func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) error {
func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) (err error) {
rtLabels := resourceTemplate.GetLabels()
claimedPolicyNamespace := util.GetLabelValue(rtLabels, policyv1alpha1.PropagationPolicyNamespaceLabel)
claimedPolicyName := util.GetLabelValue(rtLabels, policyv1alpha1.PropagationPolicyNameLabel)
@ -104,7 +107,18 @@ func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructur
return nil
}
if err := d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
defer func() {
metrics.CountPolicyPreemption(err)
if err != nil {
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeWarning, events.EventReasonPreemptPolicyFailed,
"Propagation policy(%s/%s) failed to preempt propagation policy(%s/%s): %v", policy.Namespace, policy.Name, claimedPolicyNamespace, claimedPolicyName, err)
return
}
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeNormal, events.EventReasonPreemptPolicySucceed,
"Propagation policy(%s/%s) preempted propagation policy(%s/%s) successfully", policy.Namespace, policy.Name, claimedPolicyNamespace, claimedPolicyName)
}()
if err = d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
klog.Errorf("Failed to claim new propagation policy(%s/%s) on resource template(%s, kind=%s, %s): %v.", policy.Namespace, policy.Name,
resourceTemplate.GetAPIVersion(), resourceTemplate.GetKind(), names.NamespacedKey(resourceTemplate.GetNamespace(), resourceTemplate.GetName()), err)
return err
@ -115,13 +129,24 @@ func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructur
}
// preemptClusterPropagationPolicyDirectly directly preempts resource template claimed by ClusterPropagationPolicy regardless of priority.
func (d *ResourceDetector) preemptClusterPropagationPolicyDirectly(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) error {
func (d *ResourceDetector) preemptClusterPropagationPolicyDirectly(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) (err error) {
claimedPolicyName := util.GetLabelValue(resourceTemplate.GetLabels(), policyv1alpha1.ClusterPropagationPolicyLabel)
if claimedPolicyName == "" {
return nil
}
if err := d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
defer func() {
metrics.CountPolicyPreemption(err)
if err != nil {
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeWarning, events.EventReasonPreemptPolicyFailed,
"Propagation policy(%s/%s) failed to preempt cluster propagation policy(%s): %v", policy.Namespace, policy.Name, claimedPolicyName, err)
return
}
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeNormal, events.EventReasonPreemptPolicySucceed,
"Propagation policy(%s/%s) preempted cluster propagation policy(%s) successfully", policy.Namespace, policy.Name, claimedPolicyName)
}()
if err = d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
klog.Errorf("Failed to claim new propagation policy(%s/%s) on resource template(%s, kind=%s, %s) directly: %v.", policy.Namespace, policy.Name,
resourceTemplate.GetAPIVersion(), resourceTemplate.GetKind(), names.NamespacedKey(resourceTemplate.GetNamespace(), resourceTemplate.GetName()), err)
return err
@ -132,7 +157,7 @@ func (d *ResourceDetector) preemptClusterPropagationPolicyDirectly(resourceTempl
}
// preemptClusterPropagationPolicy preempts resource template that is claimed by ClusterPropagationPolicy.
func (d *ResourceDetector) preemptClusterPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.ClusterPropagationPolicy) error {
func (d *ResourceDetector) preemptClusterPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.ClusterPropagationPolicy) (err error) {
claimedPolicyName := util.GetLabelValue(resourceTemplate.GetLabels(), policyv1alpha1.ClusterPropagationPolicyLabel)
if claimedPolicyName == "" {
return nil
@ -160,7 +185,18 @@ func (d *ResourceDetector) preemptClusterPropagationPolicy(resourceTemplate *uns
return nil
}
if err := d.ClaimClusterPolicyForObject(resourceTemplate, policy.Name); err != nil {
defer func() {
metrics.CountPolicyPreemption(err)
if err != nil {
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeWarning, events.EventReasonPreemptPolicyFailed,
"Cluster propagation policy(%s) failed to preempt cluster propagation policy(%s): %v", policy.Name, claimedPolicyName, err)
return
}
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeNormal, events.EventReasonPreemptPolicySucceed,
"Cluster propagation policy(%s) preempted cluster propagation policy(%s) successfully", policy.Name, claimedPolicyName)
}()
if err = d.ClaimClusterPolicyForObject(resourceTemplate, policy.Name); err != nil {
klog.Errorf("Failed to claim new cluster propagation policy(%s) on resource template(%s, kind=%s, %s): %v.", policy.Name,
resourceTemplate.GetAPIVersion(), resourceTemplate.GetKind(), names.NamespacedKey(resourceTemplate.GetNamespace(), resourceTemplate.GetName()), err)
return err

View File

@ -100,6 +100,10 @@ const (
EventReasonGetDependenciesSucceed = "GetDependenciesSucceed"
// EventReasonGetDependenciesFailed indicates get dependencies of resource template failed.
EventReasonGetDependenciesFailed = "GetDependenciesFailed"
// EventReasonPreemptPolicySucceed indicates that policy preemption of the resource template succeeded.
EventReasonPreemptPolicySucceed = "PreemptPolicySucceed"
// EventReasonPreemptPolicyFailed indicates policy preemption of resource template failed.
EventReasonPreemptPolicyFailed = "PreemptPolicyFailed"
)
// Define events for ServiceImport objects.

View File

@ -14,6 +14,7 @@ const (
policyApplyAttemptsMetricsName = "policy_apply_attempts_total"
syncWorkDurationMetricsName = "binding_sync_work_duration_seconds"
syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds"
policyPreemptionMetricsName = "policy_preemption_total"
)
var (
@ -45,6 +46,11 @@ var (
Help: "Duration in seconds to sync the workload to a target cluster. By the result, 'error' means a work failed to sync workloads. Otherwise 'success'.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
}, []string{"result"})
policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: policyPreemptionMetricsName,
Help: "Number of preemption for the resource template. By the result, 'error' means a resource template failed to be preempted by other propagation policies. Otherwise 'success'.",
}, []string{"result"})
)
// ObserveFindMatchedPolicyLatency records the duration for the resource finding a matched policy.
@ -68,6 +74,11 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
}
// CountPolicyPreemption records the numbers of policy preemption.
func CountPolicyPreemption(err error) {
policyPreemptionCounter.WithLabelValues(utilmetrics.GetResultByError(err)).Inc()
}
// ResourceCollectors returns the collectors about resources.
func ResourceCollectors() []prometheus.Collector {
return []prometheus.Collector{
@ -76,6 +87,7 @@ func ResourceCollectors() []prometheus.Collector {
policyApplyAttempts,
syncWorkDurationHistogram,
syncWorkloadDurationHistogram,
policyPreemptionCounter,
}
}