Compare commits

...

3 Commits

Author SHA1 Message Date
Husni Alhamdani dc2befb68d
Merge 65668934a8 into 321cc1d498 2025-07-20 15:20:59 +02:00
Husni Alhamdani 65668934a8 feat: rediscluster observability
Signed-off-by: Husni Alhamdani <dhanielluis@gmail.com>
2025-07-10 23:38:42 +07:00
Husni Alhamdani ee08504864 fix: rediscluster observability
Signed-off-by: Husni Alhamdani <dhanielluis@gmail.com>
2025-07-10 21:58:59 +07:00
4 changed files with 77 additions and 1 deletions

View File

@ -41,9 +41,21 @@ Total number of rediscluster rebalance operations. Type: Counter.
### rediscluster_remove_follower_attempt
Number of attempts to remove a follower. Type: Counter.
### rediscluster_repair_disconnected_attempt
Number of attempts to repair Redis cluster masters that are disconnected from the cluster. Type: Counter.
### rediscluster_repair_failed
Number of times repairing disconnected Redis cluster masters failed. Type: Counter.
### rediscluster_replicas_size_desired
Total desired number of rediscluster replicas. Type: Gauge.
### rediscluster_reset_attempt
Number of times to reset a Redis cluster. Type: Counter.
### rediscluster_reset_failed
Number of times resetting a Redis cluster failed. Type: Counter.
### rediscluster_reshard_total
Total number of rediscluster reshard operations. Type: Counter.

View File

@ -66,7 +66,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
}
return intctrlutil.Reconciled()
}
monitoring.RedisReplicationSkipReconcile.WithLabelValues(instance.Namespace, instance.Name).Set(0)
monitoring.RedisClusterSkipReconcile.WithLabelValues(instance.Namespace, instance.Name).Set(0)
if common.IsSkipReconcile(ctx, instance) {
monitoring.RedisClusterSkipReconcile.WithLabelValues(instance.Namespace, instance.Name).Set(1)
return intctrlutil.Reconciled()
@ -230,7 +230,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
}
logger.Info("healthy leader count does not match desired; attempting to repair disconnected masters")
monitoring.RedisClusterRepairDisconnectedAttempt.WithLabelValues(instance.Namespace, instance.Name).Inc()
if err = k8sutils.RepairDisconnectedMasters(ctx, r.K8sClient, instance); err != nil {
monitoring.RedisClusterRepairDisconnectedFailed.WithLabelValues(instance.Namespace, instance.Name).Inc()
logger.Error(err, "failed to repair disconnected masters")
}
@ -256,7 +258,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
}
if int(totalReplicas) > 1 && unhealthyNodeCount >= int(totalReplicas)-1 {
logger.Info("unhealthy nodes exist after attempting to repair disconnected masters; starting failover")
monitoring.RedisClusterResetAttempt.WithLabelValues(instance.Namespace, instance.Name).Inc()
if err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, instance); err != nil {
monitoring.RedisClusterResetFailed.WithLabelValues(instance.Namespace, instance.Name).Inc()
return intctrlutil.RequeueE(ctx, err, "")
}
}

View File

@ -32,5 +32,9 @@ func RegisterRedisClusterMetrics() {
RedisClusterRebalanceTotal,
RedisClusterRemoveFollowerAttempt,
RedisClusterReshardTotal,
RedisClusterRepairDisconnectedAttempt,
RedisClusterRepairDisconnectedFailed,
RedisClusterResetAttempt,
RedisClusterResetFailed,
)
}

View File

@ -48,6 +48,30 @@ var RedisClusterDescription = map[string]MetricDescription{
Type: "Counter",
labels: []string{"namespace", "instance"},
},
"RedisClusterRepairDisconnectedAttempt": {
Name: "rediscluster_repair_disconnected_attempt",
Help: "Number of times to repair a Redis cluster disconnected from the cluster.",
Type: "Counter",
labels: []string{"namespace", "instance"},
},
"RedisClusterRepairFailed": {
Name: "rediscluster_repair_failed",
Help: "Number of times to repair a Redis cluster failed.",
Type: "Counter",
labels: []string{"namespace", "instance"},
},
"RedisClusterResetAttempt": {
Name: "rediscluster_reset_attempt",
Help: "Number of times to reset a Redis cluster.",
Type: "Counter",
labels: []string{"namespace", "instance"},
},
"RedisClusterResetFailed": {
Name: "rediscluster_reset_failed",
Help: "Number of times to reset a Redis cluster failed.",
Type: "Counter",
labels: []string{"namespace", "instance"},
},
}
var (
@ -114,6 +138,38 @@ var (
},
RedisClusterDescription["RedisClusterAddingNodeAttempt"].labels,
)
RedisClusterRepairDisconnectedAttempt = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: RedisClusterDescription["RedisClusterRepairDisconnectedAttempt"].Name,
Help: RedisClusterDescription["RedisClusterRepairDisconnectedAttempt"].Help,
},
RedisClusterDescription["RedisClusterRepairDisconnectedAttempt"].labels,
)
// RedisClusterRepairDisconnectedFailed counts failed attempts to repair
// disconnected masters of a Redis cluster.
//
// Its description is stored in RedisClusterDescription under the key
// "RedisClusterRepairFailed". Indexing the map with a key that does not
// exist yields a zero-value MetricDescription (empty Name and Help, nil
// labels), so the key used here must match the map entry exactly.
RedisClusterRepairDisconnectedFailed = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: RedisClusterDescription["RedisClusterRepairFailed"].Name,
		Help: RedisClusterDescription["RedisClusterRepairFailed"].Help,
	},
	RedisClusterDescription["RedisClusterRepairFailed"].labels,
)
RedisClusterResetAttempt = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: RedisClusterDescription["RedisClusterResetAttempt"].Name,
Help: RedisClusterDescription["RedisClusterResetAttempt"].Help,
},
RedisClusterDescription["RedisClusterResetAttempt"].labels,
)
RedisClusterResetFailed = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: RedisClusterDescription["RedisClusterResetFailed"].Name,
Help: RedisClusterDescription["RedisClusterResetFailed"].Help,
},
RedisClusterDescription["RedisClusterResetFailed"].labels,
)
)
// ListMetrics will create a slice with the metrics available in metricDescription