diff --git a/manager/metrics/metrics.go b/manager/metrics/metrics.go index 7f4fc3076..59984cae0 100644 --- a/manager/metrics/metrics.go +++ b/manager/metrics/metrics.go @@ -39,6 +39,13 @@ var ( Help: "Gauge of the number of peer.", }, []string{"version", "commit"}) + SearchSchedulerClusterFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{ + Namespace: types.MetricsNamespace, + Subsystem: types.SchedulerMetricsName, + Name: "search_scheduler_cluster_failure_total", + Help: "Counter of the number of failed of searching scheduler cluster.", + }, []string{"version", "commit"}) + VersionGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: types.MetricsNamespace, Subsystem: types.ManagerMetricsName, diff --git a/manager/rpcserver/rpcserver.go b/manager/rpcserver/rpcserver.go index e66ae9b7f..98beff585 100644 --- a/manager/rpcserver/rpcserver.go +++ b/manager/rpcserver/rpcserver.go @@ -544,20 +544,28 @@ func (s *Server) ListSchedulers(ctx context.Context, req *managerv1.ListSchedule if err := s.db.WithContext(ctx).Preload("SecurityGroup.SecurityRules").Preload("SeedPeerClusters.SeedPeers", "state = ?", "active").Preload("Schedulers", "state = ?", "active").Find(&schedulerClusters).Error; err != nil { return nil, status.Error(codes.Unknown, err.Error()) } + log.Debugf("list scheduler clusters %v with hostInfo %#v", getSchedulerClusterNames(schedulerClusters), req.HostInfo) // Search optimal scheduler clusters. - log.Debugf("list scheduler clusters %v with hostInfo %#v", getSchedulerClusterNames(schedulerClusters), req.HostInfo) - schedulerClusters, err := s.searcher.FindSchedulerClusters(ctx, schedulerClusters, req) + // If the searcher cannot find a candidate scheduler cluster, + // return all scheduler clusters. 
+ var ( + candidateSchedulerClusters []model.SchedulerCluster + err error + ) + candidateSchedulerClusters, err = s.searcher.FindSchedulerClusters(ctx, schedulerClusters, req) if err != nil { + candidateSchedulerClusters = schedulerClusters + log.Error(err) - return nil, status.Error(codes.NotFound, "scheduler cluster not found") + metrics.SearchSchedulerClusterFailureCount.WithLabelValues(req.Version, req.Commit).Inc() } log.Debugf("find matching scheduler cluster %v", getSchedulerClusterNames(schedulerClusters)) schedulers := []model.Scheduler{} - for _, schedulerCluster := range schedulerClusters { - for _, scheduler := range schedulerCluster.Schedulers { - scheduler.SchedulerCluster = schedulerCluster + for _, candidateSchedulerCluster := range candidateSchedulerClusters { + for _, scheduler := range candidateSchedulerCluster.Schedulers { + scheduler.SchedulerCluster = candidateSchedulerCluster schedulers = append(schedulers, scheduler) } }