add cluster success threshold
Signed-off-by: dddddai <dddwq@foxmail.com>
commit 1ebc680d16 (parent 801d18767b)
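In short: the diff threads a new ClusterSuccessThreshold option (a metav1.Duration, defaulting to 30s via --cluster-success-threshold) from the command-line options through the controller context into ClusterStatusController and its clusterConditionStore, which previously only debounced the Ready-to-NotReady direction with the failure threshold. A minimal sketch of the resulting rule, restated from the clusterConditionStore hunk further down (function and parameter names here are abbreviations of mine, not literal code from the change; assumed imports: "time" and metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"):

    // adjustedCondition sketches how the store now chooses which debounce window applies.
    func adjustedCondition(observed, current metav1.Condition, flipStart time.Time,
        successThreshold, failureThreshold time.Duration) metav1.Condition {
        threshold := failureThreshold // Ready -> NotReady: hold the old "Ready" for a while
        if observed.Status == metav1.ConditionTrue {
            threshold = successThreshold // NotReady -> Ready: hold the old "NotReady" for a while
        }
        // only True <-> not-True transitions are debounced (Unknown -> False passes through)
        flipped := (observed.Status == metav1.ConditionTrue) != (current.Status == metav1.ConditionTrue)
        if flipped && time.Now().Before(flipStart.Add(threshold)) {
            return current // retain the stored status until the window has passed
        }
        return observed
    }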
@@ -185,6 +185,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
         ClusterStatusUpdateFrequency:      opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         ClusterAPIQPS:                     opts.ClusterAPIQPS,
@@ -223,6 +224,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
         ClusterStatusUpdateFrequency:      ctx.Opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              ctx.Opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: ctx.Opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           ctx.Opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           ctx.Opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           ctx.Opts.ClusterCacheSyncTimeout,
         RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
@@ -38,6 +38,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterAPIQPS is the QPS to use while talking with cluster kube-apiserver.
@@ -100,6 +102,7 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
         "Specifies the expiration period of a cluster lease.")
     fs.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
         "Specifies the cluster lease renew interval fraction.")
+    fs.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
     fs.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
     fs.Float32Var(&o.ClusterAPIQPS, "cluster-api-qps", 40.0, "QPS to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
     fs.IntVar(&o.ClusterAPIBurst, "cluster-api-burst", 60, "Burst to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
@@ -233,6 +233,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
         ClusterStatusUpdateFrequency:      opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
@@ -493,6 +494,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
         FailoverEvictionTimeout:           opts.FailoverEvictionTimeout,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         ClusterAPIQPS:                     opts.ClusterAPIQPS,
@@ -46,6 +46,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterMonitorPeriod represents cluster-controller monitoring period, i.e. how often does
@@ -136,6 +138,7 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
         "Specifies the expiration period of a cluster lease.")
     flags.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
         "Specifies the cluster lease renew interval fraction.")
+    flags.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
     flags.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
     flags.DurationVar(&o.ClusterMonitorPeriod.Duration, "cluster-monitor-period", 5*time.Second,
         "Specifies how often karmada-controller-manager monitors cluster health status.")
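Both option sets register the new flag with the same 30-second default as the existing --cluster-failure-threshold. For programmatic construction, for example in a test, the equivalent would be roughly the following sketch (only the field names and types come from the hunks above; the options import alias and surrounding code are assumed; assumed imports: "time" and metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"):

    opts := options.Options{
        ClusterSuccessThreshold: metav1.Duration{Duration: 30 * time.Second}, // mirrors --cluster-success-threshold
        ClusterFailureThreshold: metav1.Duration{Duration: 30 * time.Second}, // mirrors --cluster-failure-threshold
    }
    _ = opts // silence the unused-variable check in this isolated sketch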
@@ -40,6 +40,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterCacheSyncTimeout is the timeout period waiting for cluster cache to sync.
@@ -19,6 +19,8 @@ type clusterData struct {
 
 type clusterConditionStore struct {
     clusterDataMap sync.Map
+    // successThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    successThreshold time.Duration
     // failureThreshold is the duration of failure for the cluster to be considered unhealthy.
     failureThreshold time.Duration
 }
@@ -46,9 +48,19 @@ func (c *clusterConditionStore) thresholdAdjustedReadyCondition(cluster *cluster
         }
         c.update(cluster.Name, saved)
     }
-    if observedReadyCondition.Status != metav1.ConditionTrue &&
-        curReadyCondition.Status == metav1.ConditionTrue &&
-        now.Before(saved.thresholdStartTime.Add(c.failureThreshold)) {
+
+    var threshold time.Duration
+    if observedReadyCondition.Status == metav1.ConditionTrue {
+        threshold = c.successThreshold
+    } else {
+        threshold = c.failureThreshold
+    }
+
+    // we only care about true/not true
+    // for unknown->false, just return the observed ready condition
+    if ((observedReadyCondition.Status == metav1.ConditionTrue && curReadyCondition.Status != metav1.ConditionTrue) ||
+        (observedReadyCondition.Status != metav1.ConditionTrue && curReadyCondition.Status == metav1.ConditionTrue)) &&
+        now.Before(saved.thresholdStartTime.Add(threshold)) {
         // retain old status until threshold exceeded to avoid network unstable problems.
         return curReadyCondition
     }
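The debounce is now symmetric: a Ready cluster that starts failing keeps its Ready condition for up to failureThreshold, and a NotReady cluster whose probes start succeeding keeps its NotReady condition for up to successThreshold. With the 30s defaults, for example, if a NotReady cluster first reports a successful probe around time t, thresholdStartTime is reset to roughly t and the stored NotReady condition keeps being returned until t+30s; only after that does the observed Ready condition propagate. The unknown-to-false case deliberately bypasses the window, as the comment in the hunk notes. The test changes below exercise both sides of this behaviour.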
@@ -11,6 +11,7 @@ import (
 )
 
 func TestThresholdAdjustedReadyCondition(t *testing.T) {
+    clusterSuccessThreshold := 30 * time.Second
     clusterFailureThreshold := 30 * time.Second
 
     tests := []struct {
@@ -65,7 +66,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster becomes not ready but still not reach threshold",
+            name: "cluster becomes not ready but still not reach failure threshold",
             clusterData: &clusterData{
                 readyCondition:     metav1.ConditionFalse,
                 thresholdStartTime: time.Now().Add(-clusterFailureThreshold / 2),
@@ -84,7 +85,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster becomes not ready and reaches threshold",
+            name: "cluster becomes not ready and reaches failure threshold",
             clusterData: &clusterData{
                 readyCondition:     metav1.ConditionFalse,
                 thresholdStartTime: time.Now().Add(-clusterFailureThreshold),
@@ -122,10 +123,29 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster recovers",
+            name: "cluster recovers but still not reach success threshold",
             clusterData: &clusterData{
-                readyCondition:     metav1.ConditionFalse,
-                thresholdStartTime: time.Now().Add(-3 * clusterFailureThreshold),
+                readyCondition:     metav1.ConditionTrue,
+                thresholdStartTime: time.Now().Add(-clusterSuccessThreshold / 2),
             },
             currentCondition: &metav1.Condition{
                 Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionFalse,
+            },
+            observedCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionTrue,
+            },
+            expectedCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionFalse,
+            },
+        },
+        {
+            name: "cluster recovers and reaches success threshold",
+            clusterData: &clusterData{
+                readyCondition:     metav1.ConditionTrue,
+                thresholdStartTime: time.Now().Add(-clusterSuccessThreshold),
+            },
+            currentCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
@@ -145,6 +165,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
             cache := clusterConditionStore{
+                successThreshold: clusterSuccessThreshold,
                 failureThreshold: clusterFailureThreshold,
             }
 
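The renamed and newly added cases can be run on their own with go test -run TestThresholdAdjustedReadyCondition from the package directory containing this test (the path is not shown in this diff).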
@@ -77,6 +77,8 @@ type ClusterStatusController struct {
     ClusterLeaseRenewIntervalFraction float64
     // ClusterLeaseControllers store clusters and their corresponding lease controllers.
     ClusterLeaseControllers sync.Map
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // clusterConditionCache stores the condition status of each cluster.
@@ -117,6 +119,7 @@ func (c *ClusterStatusController) Reconcile(ctx context.Context, req controllerr
 // SetupWithManager creates a controller and register to controller manager.
 func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager) error {
     c.clusterConditionCache = clusterConditionStore{
+        successThreshold: c.ClusterSuccessThreshold.Duration,
         failureThreshold: c.ClusterFailureThreshold.Duration,
     }
     return controllerruntime.NewControllerManagedBy(mgr).For(&clusterv1alpha1.Cluster{}).WithEventFilter(c.PredicateFunc).WithOptions(controller.Options{
@@ -149,7 +152,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
     }
 
     // skip collecting cluster status if not ready
-    if online && healthy {
+    if online && healthy && readyCondition.Status == metav1.ConditionTrue {
         // get or create informer for pods and nodes in member cluster
         clusterInformerManager, err := c.buildInformerForCluster(cluster)
         if err != nil {
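Assuming readyCondition here is the value returned by the condition cache's thresholdAdjustedReadyCondition (the cache wired up in SetupWithManager above), status collection and informer (re)building for a recovering cluster are now deferred until the success-threshold window has elapsed, not merely until the first successful health probe.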
@@ -433,7 +436,7 @@ func getNodeSummary(nodes []*corev1.Node) *clusterv1alpha1.NodeSummary {
         }
     }
 
-    var nodeSummary = &clusterv1alpha1.NodeSummary{}
+    nodeSummary := &clusterv1alpha1.NodeSummary{}
     nodeSummary.TotalNum = int32(totalNum)
     nodeSummary.ReadyNum = int32(readyNum)
 
@@ -445,7 +448,7 @@ func getResourceSummary(nodes []*corev1.Node, pods []*corev1.Pod) *clusterv1alph
     allocating := getAllocatingResource(pods)
     allocated := getAllocatedResource(pods)
 
-    var resourceSummary = &clusterv1alpha1.ResourceSummary{}
+    resourceSummary := &clusterv1alpha1.ResourceSummary{}
     resourceSummary.Allocatable = allocatable
     resourceSummary.Allocating = allocating
     resourceSummary.Allocated = allocated