dragonfly/scheduler/scheduling/scheduling.go

983 lines
38 KiB
Go

/*
* Copyright 2020 The Dragonfly Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//go:generate mockgen -destination mocks/scheduling_mock.go -source scheduling.go -package mocks
package scheduling
import (
"context"
"fmt"
"time"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/durationpb"
"google.golang.org/protobuf/types/known/timestamppb"
commonv1 "d7y.io/api/v2/pkg/apis/common/v1"
commonv2 "d7y.io/api/v2/pkg/apis/common/v2"
schedulerv1 "d7y.io/api/v2/pkg/apis/scheduler/v1"
schedulerv2 "d7y.io/api/v2/pkg/apis/scheduler/v2"
"d7y.io/dragonfly/v2/pkg/container/set"
"d7y.io/dragonfly/v2/pkg/types"
"d7y.io/dragonfly/v2/scheduler/config"
"d7y.io/dragonfly/v2/scheduler/resource/persistentcache"
"d7y.io/dragonfly/v2/scheduler/resource/standard"
"d7y.io/dragonfly/v2/scheduler/scheduling/evaluator"
)
// Scheduling is the scheduling contract of the scheduler. It covers scheduling and
// parent lookup for standard peers (split between the v1 and v2 grpc versions) and
// parent/host lookup for persistent cache tasks.
type Scheduling interface {
// ScheduleCandidateParents schedules candidate parents to the normal peer to download the task.
// Used only in v2 version of the grpc.
ScheduleCandidateParents(context.Context, *standard.Peer, set.SafeSet[string]) error
// ScheduleParentAndCandidateParents schedules a parent and candidate parents to the normal peer to download the task.
// Used only in v1 version of the grpc.
ScheduleParentAndCandidateParents(context.Context, *standard.Peer, set.SafeSet[string])
// FindCandidateParents finds candidate parents for the peer to download the task.
// The boolean result reports whether any candidate parent was found.
// Used only in v2 version of the grpc.
FindCandidateParents(context.Context, *standard.Peer, set.SafeSet[string]) ([]*standard.Peer, bool)
// FindParentAndCandidateParents finds a parent and candidate parents for the peer to download the task.
// The boolean result reports whether any candidate parent was found.
// Used only in v1 version of the grpc.
FindParentAndCandidateParents(context.Context, *standard.Peer, set.SafeSet[string]) ([]*standard.Peer, bool)
// FindSuccessParent finds a succeeded parent for the peer to download the task.
// The boolean result reports whether a parent was found.
FindSuccessParent(context.Context, *standard.Peer, set.SafeSet[string]) (*standard.Peer, bool)
// FindReplicatePersistentCacheHosts finds replicate persistent cache hosts for the peer to replicate the task. It compares the task's
// current persistent replica count with the persistent replica count configured on the task and tries to find enough parents to cover
// the difference. The function returns the cached replicate parents, the replicate hosts without cache and the found flag.
FindReplicatePersistentCacheHosts(context.Context, *persistentcache.Task, set.SafeSet[string]) ([]*persistentcache.Peer, []*persistentcache.Host, bool)
// FindCandidatePersistentCacheParents finds candidate persistent cache parents for the peer to download the task.
// The boolean result reports whether any candidate parent was found.
FindCandidatePersistentCacheParents(context.Context, *persistentcache.Peer, set.SafeSet[string]) ([]*persistentcache.Peer, bool)
}
// scheduling is the Scheduling implementation returned by New.
type scheduling struct {
// Evaluator interface, used to rank candidate parents and to detect bad parents.
evaluator evaluator.Evaluator
// Scheduler configuration; the retry limits and the retry interval are read from it.
config *config.SchedulerConfig
// Persistent cache resource, giving access to the persistent cache task, peer and host managers.
persistentCacheResource persistentcache.Resource
// Scheduler dynamic configuration, consulted for the candidate parent limit and the filter parent limit.
dynconfig config.DynconfigInterface
}
// New returns a Scheduling backed by the scheduling struct. The evaluator is
// created from cfg.Algorithm and pluginDir; cfg, persistentCacheResource and
// dynconfig are stored as-is for use by the scheduling methods.
func New(cfg *config.SchedulerConfig, persistentCacheResource persistentcache.Resource, dynconfig config.DynconfigInterface, pluginDir string) Scheduling {
	s := new(scheduling)
	s.evaluator = evaluator.New(cfg.Algorithm, pluginDir)
	s.config = cfg
	s.persistentCacheResource = persistentCacheResource
	s.dynconfig = dynconfig

	return s
}
// ScheduleCandidateParents schedules candidate parents to the normal peer.
// Used only in v2 version of the grpc.
//
// The method retries until one of the following outcomes is reached:
//   - ctx is done: ctx.Err() is returned.
//   - The task can go back-to-source and either the peer's NeedBackToSource flag
//     is set or the number of retries reached RetryBackToSourceLimit: a
//     NeedBackToSourceResponse is sent on the peer's announce stream.
//   - The number of retries reached RetryLimit: a FailedPrecondition status is returned.
//   - Candidate parents are found: a NormalTaskResponse is sent and the edges
//     from the candidate parents to the peer are added.
//
// Between two attempts the method waits for RetryInterval; the wait ends early
// when ctx is done so that a cancelled call does not stay blocked.
func (s *scheduling) ScheduleCandidateParents(ctx context.Context, peer *standard.Peer, blocklist set.SafeSet[string]) error {
	// sendNeedBackToSource sends NeedBackToSourceResponse carrying description to
	// the peer. reason is only used to complete the log line.
	sendNeedBackToSource := func(description, reason string) error {
		stream, loaded := peer.LoadAnnouncePeerStream()
		if !loaded {
			peer.Log.Error("load stream failed")
			return status.Error(codes.FailedPrecondition, "load stream failed")
		}

		// Send NeedBackToSourceResponse to peer.
		peer.Log.Infof("send NeedBackToSourceResponse, because of %s", reason)
		if err := stream.Send(&schedulerv2.AnnouncePeerResponse{
			Response: &schedulerv2.AnnouncePeerResponse_NeedBackToSourceResponse{
				NeedBackToSourceResponse: &schedulerv2.NeedBackToSourceResponse{
					Description: &description,
				},
			},
		}); err != nil {
			peer.Log.Error(err)
			return status.Error(codes.FailedPrecondition, err.Error())
		}

		return nil
	}

	var n int
	for {
		select {
		case <-ctx.Done():
			peer.Log.Infof("context was done")
			return ctx.Err()
		default:
		}

		// Scheduling will send NeedBackToSourceResponse to peer.
		//
		// Condition 1: Peer's NeedBackToSource is true.
		// Condition 2: Scheduling exceeds the RetryBackToSourceLimit.
		if peer.Task.CanBackToSource() {
			// Check condition 1:
			// Peer's NeedBackToSource is true.
			if needBackToSource := peer.NeedBackToSource.Load(); needBackToSource {
				description := fmt.Sprintf("peer's NeedBackToSource is %t", needBackToSource)
				return sendNeedBackToSource(description, description)
			}

			// Check condition 2:
			// The number of retry scheduling is greater than or equal to RetryBackToSourceLimit.
			if n >= s.config.RetryBackToSourceLimit {
				description := "scheduling exceeded RetryBackToSourceLimit"
				return sendNeedBackToSource(description, fmt.Sprintf("%s %d", description, s.config.RetryBackToSourceLimit))
			}
		}

		// Scheduling will return schedule failed.
		//
		// Condition 1: Scheduling exceeds the RetryLimit.
		if n >= s.config.RetryLimit {
			peer.Log.Errorf("scheduling failed, because of scheduling exceeded RetryLimit %d", s.config.RetryLimit)
			return status.Error(codes.FailedPrecondition, "scheduling exceeded RetryLimit")
		}

		// Scheduling will send NormalTaskResponse to peer.
		//
		// Condition 1: Scheduling can find candidate parents.
		//
		// The peer's current in-edges are removed first, before new parents are looked up.
		if err := peer.Task.DeletePeerInEdges(peer.ID); err != nil {
			peer.Log.Error(err)
			return status.Error(codes.Internal, err.Error())
		}

		// Find candidate parents.
		candidateParents, found := s.FindCandidateParents(ctx, peer, blocklist)
		if !found {
			n++
			peer.Log.Infof("scheduling failed in %d times, because of candidate parents not found", n)

			// Wait to avoid hot looping, but stop waiting as soon as the context is done.
			timer := time.NewTimer(s.config.RetryInterval)
			select {
			case <-ctx.Done():
				timer.Stop()
				peer.Log.Infof("context was done")
				return ctx.Err()
			case <-timer.C:
			}

			continue
		}

		// Load AnnouncePeerStream from peer.
		stream, loaded := peer.LoadAnnouncePeerStream()
		if !loaded {
			if err := peer.Task.DeletePeerInEdges(peer.ID); err != nil {
				err = fmt.Errorf("peer deletes inedges failed: %w", err)
				peer.Log.Error(err)
				return status.Error(codes.Internal, err.Error())
			}

			peer.Log.Error("load stream failed")
			return status.Error(codes.FailedPrecondition, "load stream failed")
		}

		// Send NormalTaskResponse to peer.
		peer.Log.Info("send NormalTaskResponse")
		if err := stream.Send(&schedulerv2.AnnouncePeerResponse{
			Response: constructSuccessNormalTaskResponse(candidateParents),
		}); err != nil {
			peer.Log.Error(err)
			return status.Error(codes.FailedPrecondition, err.Error())
		}

		// Add edge from parent to peer. A failure on one edge is only logged,
		// the remaining parents are still processed.
		for _, candidateParent := range candidateParents {
			if err := peer.Task.AddPeerEdge(candidateParent, peer); err != nil {
				err = fmt.Errorf("peer adds edge failed: %w", err)
				peer.Log.Warn(err)
				continue
			}
		}

		peer.Log.Infof("scheduling success in %d times", n+1)
		return nil
	}
}
// ScheduleParentAndCandidateParents schedules a parent and candidate parents to a peer.
// Used only in v1 version of the grpc.
//
// The method retries until one of the following outcomes is reached; every
// outcome is reported to the peer through its ReportPieceResult stream and the
// log, nothing is returned to the caller:
//   - ctx is done.
//   - The task can go back-to-source and either the peer's NeedBackToSource flag
//     is set or the number of retries reached RetryBackToSourceLimit:
//     Code_SchedNeedBackSource is sent and the peer FSM receives
//     PeerEventDownloadBackToSource.
//   - The number of retries reached RetryLimit: Code_SchedTaskStatusError is sent.
//   - Candidate parents are found: a PeerPacket with the first candidate as main
//     peer is sent and the edges from the candidates to the peer are added.
//
// Between two attempts the method waits for RetryInterval; the wait ends early
// when ctx is done so that a cancelled call does not stay blocked.
func (s *scheduling) ScheduleParentAndCandidateParents(ctx context.Context, peer *standard.Peer, blocklist set.SafeSet[string]) {
	// sendNeedBackSource sends Code_SchedNeedBackSource to the peer, logs the
	// reason given by format/args and fires the back-to-source FSM events.
	sendNeedBackSource := func(format string, args ...any) {
		stream, loaded := peer.LoadReportPieceResultStream()
		if !loaded {
			peer.Log.Error("load stream failed")
			return
		}

		// Send Code_SchedNeedBackSource to peer.
		if err := stream.Send(&schedulerv1.PeerPacket{Code: commonv1.Code_SchedNeedBackSource}); err != nil {
			peer.Log.Error(err)
			return
		}
		peer.Log.Infof(format, args...)

		if err := peer.FSM.Event(ctx, standard.PeerEventDownloadBackToSource); err != nil {
			peer.Log.Error(fmt.Errorf("peer fsm event failed: %w", err))
			return
		}

		// If the task state is TaskStateFailed,
		// peer back-to-source and reset task state to TaskStateRunning.
		if peer.Task.FSM.Is(standard.TaskStateFailed) {
			if err := peer.Task.FSM.Event(ctx, standard.TaskEventDownload); err != nil {
				peer.Task.Log.Error(fmt.Errorf("task fsm event failed: %w", err))
			}
		}
	}

	// waitRetry pauses for RetryInterval to avoid hot looping. It reports false
	// when ctx is done before the interval elapsed.
	waitRetry := func() bool {
		timer := time.NewTimer(s.config.RetryInterval)
		defer timer.Stop()

		select {
		case <-ctx.Done():
			peer.Log.Infof("context was done")
			return false
		case <-timer.C:
			return true
		}
	}

	var n int
	for {
		select {
		case <-ctx.Done():
			peer.Log.Infof("context was done")
			return
		default:
		}

		// Scheduling will send Code_SchedNeedBackSource to peer.
		//
		// Condition 1: Peer's NeedBackToSource is true.
		// Condition 2: Scheduling exceeds the RetryBackToSourceLimit.
		if peer.Task.CanBackToSource() {
			// Check condition 1:
			// Peer's NeedBackToSource is true.
			if needBackToSource := peer.NeedBackToSource.Load(); needBackToSource {
				sendNeedBackSource("send Code_SchedNeedBackSource to peer, because of peer's NeedBackToSource is %t", needBackToSource)
				return
			}

			// Check condition 2:
			// The number of retry scheduling is greater than or equal to RetryBackToSourceLimit.
			if n >= s.config.RetryBackToSourceLimit {
				sendNeedBackSource("send Code_SchedNeedBackSource to peer, because of scheduling exceeded RetryBackToSourceLimit %d", s.config.RetryBackToSourceLimit)
				return
			}
		}

		// Scheduling will send Code_SchedTaskStatusError to peer.
		//
		// Condition 1: Scheduling exceeds the RetryLimit.
		if n >= s.config.RetryLimit {
			stream, loaded := peer.LoadReportPieceResultStream()
			if !loaded {
				peer.Log.Error("load stream failed")
				return
			}

			// Send Code_SchedTaskStatusError to peer.
			if err := stream.Send(&schedulerv1.PeerPacket{Code: commonv1.Code_SchedTaskStatusError}); err != nil {
				peer.Log.Error(err)
				return
			}

			peer.Log.Errorf("send Code_SchedTaskStatusError to peer, because of scheduling exceeded RetryLimit %d", s.config.RetryLimit)
			return
		}

		// Scheduling will send PeerPacket to peer.
		//
		// Condition 1: Scheduling can find candidate parents.
		//
		// The peer's current in-edges are removed first, before new parents are looked up.
		if err := peer.Task.DeletePeerInEdges(peer.ID); err != nil {
			n++
			peer.Log.Error(fmt.Errorf("scheduling failed in %d times, because of %w", n, err))

			if !waitRetry() {
				return
			}
			continue
		}

		// Find candidate parents.
		//
		// NOTE(review): the Scheduling interface documents FindCandidateParents as
		// v2-only and FindParentAndCandidateParents as its v1 counterpart. This v1
		// path calls FindCandidateParents, which additionally accepts peers in
		// PeerStateReceivedNormal — confirm this is intended.
		candidateParents, found := s.FindCandidateParents(ctx, peer, blocklist)
		if !found {
			n++
			peer.Log.Infof("scheduling failed in %d times, because of candidate parents not found", n)

			if !waitRetry() {
				return
			}
			continue
		}

		// Load ReportPieceResultStream from peer.
		stream, loaded := peer.LoadReportPieceResultStream()
		if !loaded {
			n++
			peer.Log.Errorf("scheduling failed in %d times, because of loading peer stream failed", n)

			if err := peer.Task.DeletePeerInEdges(peer.ID); err != nil {
				peer.Log.Error(fmt.Errorf("peer deletes inedges failed: %w", err))
			}
			return
		}

		// Send PeerPacket to peer: the first candidate is the main peer,
		// the remaining ones are sent as candidate peers.
		peer.Log.Info("send PeerPacket to peer")
		if err := stream.Send(constructSuccessPeerPacket(peer, candidateParents[0], candidateParents[1:])); err != nil {
			n++
			peer.Log.Error(fmt.Errorf("send PeerPacket to peer failed in %d times, because of %w", n, err))

			if err := peer.Task.DeletePeerInEdges(peer.ID); err != nil {
				peer.Log.Error(fmt.Errorf("peer deletes inedges failed: %w", err))
			}
			return
		}

		// Add edge from parent to peer. A failure on one edge is only logged,
		// the remaining parents are still processed.
		for _, candidateParent := range candidateParents {
			if err := peer.Task.AddPeerEdge(candidateParent, peer); err != nil {
				err = fmt.Errorf("peer adds edge failed: %w", err)
				peer.Log.Debug(err)
				continue
			}
		}

		peer.Log.Infof("scheduling success in %d times", n+1)
		return
	}
}
// FindCandidateParents looks up the candidate parents a peer can download from.
//
// It returns an empty slice and false when the peer is neither in
// PeerStateReceivedNormal nor in PeerStateRunning, or when no parent passes the
// filter. Otherwise the filtered parents are ordered by the evaluator, cut down
// to the candidate parent limit and returned together with true.
func (s *scheduling) FindCandidateParents(ctx context.Context, peer *standard.Peer, blocklist set.SafeSet[string]) ([]*standard.Peer, bool) {
	// A peer in any other state (PeerStateBackToSource included) is considered
	// already scheduled, so only these two states are rescheduled.
	reschedulable := peer.FSM.Is(standard.PeerStateReceivedNormal) || peer.FSM.Is(standard.PeerStateRunning)
	if !reschedulable {
		peer.Log.Infof("peer state is %s, can not schedule parent", peer.FSM.Current())
		return []*standard.Peer{}, false
	}

	// Collect the parents that are allowed to serve this peer.
	parents := s.filterCandidateParents(peer, blocklist)
	if len(parents) == 0 {
		peer.Log.Info("can not find candidate parents")
		return []*standard.Peer{}, false
	}

	// Let the evaluator order the parents by score.
	parents = s.evaluator.EvaluateParents(parents, peer, uint32(peer.Task.TotalPieceCount.Load()))

	// The dynamic configuration overrides the default limit only when it can be
	// loaded and carries a positive value.
	limit := config.DefaultSchedulerCandidateParentLimit
	clusterConfig, err := s.dynconfig.GetSchedulerClusterConfig()
	if err == nil && clusterConfig.CandidateParentLimit > 0 {
		limit = int(clusterConfig.CandidateParentLimit)
	}

	if limit < len(parents) {
		parents = parents[:limit]
	}

	var parentIDs []string
	for i := range parents {
		parentIDs = append(parentIDs, parents[i].ID)
	}

	peer.Log.Infof("scheduling candidate parents is %#v", parentIDs)
	return parents, true
}
// FindParentAndCandidateParents looks up a parent and candidate parents for the peer.
//
// It returns an empty slice and false when the peer is not in PeerStateRunning
// or when no parent passes the filter. Otherwise the filtered parents are
// ordered by the evaluator, cut down to the candidate parent limit and returned
// together with true.
func (s *scheduling) FindParentAndCandidateParents(ctx context.Context, peer *standard.Peer, blocklist set.SafeSet[string]) ([]*standard.Peer, bool) {
	// A peer in any other state (PeerStateBackToSource included) is considered
	// already scheduled, so only running peers are rescheduled.
	if running := peer.FSM.Is(standard.PeerStateRunning); !running {
		peer.Log.Infof("peer state is %s, can not schedule parent", peer.FSM.Current())
		return []*standard.Peer{}, false
	}

	// Collect the parents that are allowed to serve this peer.
	parents := s.filterCandidateParents(peer, blocklist)
	if len(parents) == 0 {
		peer.Log.Info("can not find candidate parents")
		return []*standard.Peer{}, false
	}

	// Let the evaluator order the parents by score.
	parents = s.evaluator.EvaluateParents(parents, peer, uint32(peer.Task.TotalPieceCount.Load()))

	// The dynamic configuration overrides the default limit only when it can be
	// loaded and carries a positive value.
	limit := config.DefaultSchedulerCandidateParentLimit
	clusterConfig, err := s.dynconfig.GetSchedulerClusterConfig()
	if err == nil && clusterConfig.CandidateParentLimit > 0 {
		limit = int(clusterConfig.CandidateParentLimit)
	}

	if limit < len(parents) {
		parents = parents[:limit]
	}

	var parentIDs []string
	for i := range parents {
		parentIDs = append(parentIDs, parents[i].ID)
	}

	peer.Log.Infof("scheduling candidate parents is %#v", parentIDs)
	return parents, true
}
// FindSuccessParent finds success parent for the peer.
//
// Among the parents that pass filterCandidateParents, only those in
// PeerStateSucceeded are kept; they are ordered by the evaluator and the first
// one is returned together with true.
//
// It returns nil and false when the peer is not in PeerStateRunning, when no
// parent passes the filter, or when none of the filtered parents has succeeded.
func (s *scheduling) FindSuccessParent(ctx context.Context, peer *standard.Peer, blocklist set.SafeSet[string]) (*standard.Peer, bool) {
	// Only PeerStateRunning peers need to be rescheduled,
	// and other states including the PeerStateBackToSource indicate that
	// they have been scheduled.
	if !peer.FSM.Is(standard.PeerStateRunning) {
		peer.Log.Infof("peer state is %s, can not schedule parent", peer.FSM.Current())
		return nil, false
	}

	// Find the candidate parent that can be scheduled.
	candidateParents := s.filterCandidateParents(peer, blocklist)
	if len(candidateParents) == 0 {
		peer.Log.Info("can not find candidate parents")
		return nil, false
	}

	var successParents []*standard.Peer
	for _, candidateParent := range candidateParents {
		if candidateParent.FSM.Is(standard.PeerStateSucceeded) {
			successParents = append(successParents, candidateParent)
		}
	}

	// The filter also lets through parents that have not succeeded yet, so the
	// list can be empty here; indexing it below would panic.
	if len(successParents) == 0 {
		peer.Log.Info("can not find success parent")
		return nil, false
	}

	// Sort success parents by evaluation score.
	taskTotalPieceCount := peer.Task.TotalPieceCount.Load()
	successParents = s.evaluator.EvaluateParents(successParents, peer, uint32(taskTotalPieceCount))

	peer.Log.Infof("scheduling success parent is %s", successParents[0].ID)
	return successParents[0], true
}
// filterCandidateParents draws random peers of the peer's task (at most the
// filter parent limit) and keeps the ones that may act as a parent of the peer.
// Every rejected peer is reported in the debug log together with the reason.
func (s *scheduling) filterCandidateParents(peer *standard.Peer, blocklist set.SafeSet[string]) []*standard.Peer {
	// The dynamic configuration overrides the default limit only when it can be
	// loaded and carries a positive value.
	limit := config.DefaultSchedulerFilterParentLimit
	clusterConfig, err := s.dynconfig.GetSchedulerClusterConfig()
	if err == nil && clusterConfig.FilterParentLimit > 0 {
		limit = int(clusterConfig.FilterParentLimit)
	}

	var (
		selected    []*standard.Peer
		selectedIDs []string
	)

	for _, parent := range peer.Task.LoadRandomPeers(uint(limit)) {
		// Skip parents listed in the blocklist.
		if blocklist.Contains(parent.ID) {
			peer.Log.Debugf("parent %s host %s is not selected because it is in blocklist", parent.ID, parent.Host.ID)
			continue
		}

		// Skip parents whose host has sharing disabled.
		if parent.Host.DisableShared {
			peer.Log.Debugf("parent %s host %s is not selected because it is disable shared", parent.ID, parent.Host.ID)
			continue
		}

		// A parent must live on another host than the peer, because dfdaemon
		// cannot handle the situation where two tasks are downloading and
		// downloading each other.
		if parent.Host.ID == peer.Host.ID {
			peer.Log.Debugf("parent %s host %s is the same as peer host", parent.ID, parent.Host.ID)
			continue
		}

		// Skip parents whose vertex can not be found in the dag.
		inDegree, err := peer.Task.PeerInDegree(parent.ID)
		if err != nil {
			peer.Log.Debugf("can not find parent %s host %s vertex in dag", parent.ID, parent.Host.ID)
			continue
		}

		// A parent is usable when at least one of these holds:
		// Condition 1: Parent has parent.
		// Condition 2: Parent has been back-to-source.
		// Condition 3: Parent has been succeeded.
		// Condition 4: Parent is seed peer (its host is not a normal host).
		usable := parent.Host.Type != types.HostTypeNormal ||
			inDegree != 0 ||
			parent.FSM.Is(standard.PeerStateBackToSource) ||
			parent.FSM.Is(standard.PeerStateSucceeded)
		if !usable {
			peer.Log.Debugf("parent %s host %s is not selected, because its download state is %d %d %s",
				parent.ID, parent.Host.ID, inDegree, int(parent.Host.Type), parent.FSM.Current())
			continue
		}

		// Skip parents the evaluator classifies as bad.
		if s.evaluator.IsBadParent(parent) {
			peer.Log.Debugf("parent %s host %s is not selected because it is bad node", parent.ID, parent.Host.ID)
			continue
		}

		// Skip parents whose host has no free upload left.
		if parent.Host.FreeUploadCount() <= 0 {
			peer.Log.Debugf("parent %s host %s is not selected because its free upload is empty, upload limit is %d, upload count is %d",
				parent.ID, parent.Host.ID, parent.Host.ConcurrentUploadLimit.Load(), parent.Host.ConcurrentUploadCount.Load())
			continue
		}

		// Skip parents for which the task refuses an edge to the peer.
		if !peer.Task.CanAddPeerEdge(parent.ID, peer.ID) {
			peer.Log.Debugf("can not add edge with parent %s host %s", parent.ID, parent.Host.ID)
			continue
		}

		selected = append(selected, parent)
		selectedIDs = append(selectedIDs, parent.ID)
	}

	peer.Log.Infof("filter candidate parents is %#v", selectedIDs)
	return selected
}
// FindReplicatePersistentCacheHosts works out where the missing persistent
// replicas of a task can be created.
//
// The number of missing replicas is the task's PersistentReplicaCount minus the
// current persistent replica count loaded from the task manager. Parents that
// already hold the task in cache are preferred; when they are not enough, hosts
// without cache are added. Hosts of the chosen cached parents and of the
// current persistent peers are added to blocklist so that they are not picked
// again.
//
// It returns the cached replicate parents, the replicate hosts without cache
// and a flag telling whether anything was found. The flag is false when loading
// from the managers fails, when the task already has enough replicas, or when
// neither a cached parent nor a host is available.
func (s *scheduling) FindReplicatePersistentCacheHosts(ctx context.Context, task *persistentcache.Task, blocklist set.SafeSet[string]) ([]*persistentcache.Peer, []*persistentcache.Host, bool) {
	current, err := s.persistentCacheResource.TaskManager().LoadCurrentPersistentReplicaCount(ctx, task.ID)
	if err != nil {
		task.Log.Error(fmt.Errorf("load current persistent replica count failed: %w", err))
		return nil, nil, false
	}

	missing := int(task.PersistentReplicaCount - current)
	if missing <= 0 {
		task.Log.Infof("persistent cache task %s has enough persistent replica count %d", task.ID, task.PersistentReplicaCount)
		return nil, nil, false
	}

	var (
		cachedReplicateParents   []*persistentcache.Peer
		cachedReplicateParentIDs []string
	)

	cached := s.filterCachedReplicatePersistentCacheParents(ctx, task, blocklist)

	// Enough cached parents: take the first `missing` of them, hosts without
	// cache are not needed.
	if len(cached) >= missing {
		for i := 0; i < missing; i++ {
			cachedReplicateParents = append(cachedReplicateParents, cached[i])
			cachedReplicateParentIDs = append(cachedReplicateParentIDs, cached[i].ID)
		}

		task.Log.Infof("find cached parents is %#v", cachedReplicateParentIDs)
		return cachedReplicateParents, nil, true
	}

	// Not enough cached parents: keep all of them and block their hosts for the
	// host lookup below.
	for _, parent := range cached {
		cachedReplicateParents = append(cachedReplicateParents, parent)
		cachedReplicateParentIDs = append(cachedReplicateParentIDs, parent.ID)
		blocklist.Add(parent.Host.ID)
	}

	// Hosts of the current persistent peers must not be scheduled again either.
	persistentPeers, err := s.persistentCacheResource.PeerManager().LoadPersistentAllByTaskID(ctx, task.ID)
	if err != nil {
		task.Log.Error(fmt.Errorf("load all persistent cache peers failed: %w", err))
		return nil, nil, false
	}

	for _, persistentPeer := range persistentPeers {
		blocklist.Add(persistentPeer.Host.ID)
	}

	// Replicas still to be placed on hosts without cache:
	// total persistent replica count - current persistent replica count - cached parents count.
	var (
		replicateHosts   []*persistentcache.Host
		replicateHostIDs []string
	)
	for _, host := range s.filterReplicatePersistentCacheHosts(ctx, task, missing-len(cached), blocklist) {
		replicateHosts = append(replicateHosts, host)
		replicateHostIDs = append(replicateHostIDs, host.ID)
	}

	if len(cachedReplicateParents)+len(replicateHosts) == 0 {
		task.Log.Info("can not find replicate hosts")
		return nil, nil, false
	}

	task.Log.Infof("find cached parents is %#v and hosts is %#v", cachedReplicateParentIDs, replicateHostIDs)
	return cachedReplicateParents, replicateHosts, true
}
// FindCandidatePersistentCacheParents looks up the persistent cache parents a
// peer can download the task from.
//
// When no parent passes the filter, the filter result and false are returned.
// Otherwise the parents are ordered by the evaluator, cut down to the candidate
// parent limit and returned together with true.
func (s *scheduling) FindCandidatePersistentCacheParents(ctx context.Context, peer *persistentcache.Peer, blocklist set.SafeSet[string]) ([]*persistentcache.Peer, bool) {
	// Collect the parents that are allowed to serve this peer.
	parents := s.filterCandidatePersistentCacheParents(ctx, peer, blocklist)
	if len(parents) == 0 {
		peer.Log.Info("can not find candidate persistent cache parents")
		return parents, false
	}

	// Let the evaluator order the parents by score.
	parents = s.evaluator.EvaluatePersistentCacheParents(parents, peer, peer.Task.TotalPieceCount)

	// The dynamic configuration overrides the default limit only when it can be
	// loaded and carries a positive value.
	limit := config.DefaultSchedulerCandidateParentLimit
	clusterConfig, err := s.dynconfig.GetSchedulerClusterConfig()
	if err == nil && clusterConfig.CandidateParentLimit > 0 {
		limit = int(clusterConfig.CandidateParentLimit)
	}

	if limit < len(parents) {
		parents = parents[:limit]
	}

	var parentIDs []string
	for i := range parents {
		parentIDs = append(parentIDs, parents[i].ID)
	}

	peer.Log.Infof("scheduling candidate persistent cache parents is %#v", parentIDs)
	return parents, true
}
// filterCandidatePersistentCacheParents loads every persistent cache peer of the
// peer's task and keeps the ones that may act as a parent of the peer. It
// returns nil when the peers can not be loaded.
func (s *scheduling) filterCandidatePersistentCacheParents(ctx context.Context, peer *persistentcache.Peer, blocklist set.SafeSet[string]) []*persistentcache.Peer {
	parents, err := s.persistentCacheResource.PeerManager().LoadAllByTaskID(ctx, peer.Task.ID)
	if err != nil {
		peer.Log.Error(fmt.Errorf("load all persistent cache parents failed: %w", err))
		return nil
	}

	var (
		selected    []*persistentcache.Peer
		selectedIDs []string
	)

	for _, parent := range parents {
		switch {
		case blocklist.Contains(parent.ID):
			// The parent is listed in the blocklist.
			peer.Log.Debugf("persistent cache parent %s host %s is not selected because it is in blocklist", parent.ID, parent.Host.ID)
		case parent.Host.ID == peer.Host.ID:
			// The parent lives on the same host as the peer.
			peer.Log.Debugf("persistent cache parent %s host %s is the same as peer host", parent.ID, parent.Host.ID)
		case s.evaluator.IsBadPersistentCacheParent(parent):
			// The evaluator classifies the parent as bad.
			peer.Log.Debugf("persistent cache parent %s host %s is not selected because it is bad node", parent.ID, parent.Host.ID)
		default:
			selected = append(selected, parent)
			selectedIDs = append(selectedIDs, parent.ID)
		}
	}

	peer.Log.Infof("filter candidate persistent cache parents is %#v", selectedIDs)
	return selected
}
// filterCachedReplicatePersistentCacheParents loads every persistent cache peer
// of the task and keeps the ones that can serve as a cached replicate parent:
// not in the blocklist, not persistent, in PeerStateSucceeded and on a host
// that has sharing enabled. It returns nil when the peers can not be loaded.
func (s *scheduling) filterCachedReplicatePersistentCacheParents(ctx context.Context, task *persistentcache.Task, blocklist set.SafeSet[string]) []*persistentcache.Peer {
	parents, err := s.persistentCacheResource.PeerManager().LoadAllByTaskID(ctx, task.ID)
	if err != nil {
		task.Log.Error(fmt.Errorf("load all persistent cache parents failed: %w", err))
		return nil
	}

	var (
		selected    []*persistentcache.Peer
		selectedIDs []string
	)

	for _, parent := range parents {
		switch {
		case blocklist.Contains(parent.ID):
			// The parent is listed in the blocklist.
			task.Log.Debugf("persistent cache parent %s host %s is not selected because it is in blocklist", parent.ID, parent.Host.ID)
		case parent.Persistent:
			// A persistent parent cannot be selected.
			task.Log.Debugf("persistent cache parent %s host %s is not selected because it is persistent", parent.ID, parent.Host.ID)
		case !parent.FSM.Is(persistentcache.PeerStateSucceeded):
			// A parent that has not succeeded cannot be selected.
			task.Log.Debugf("persistent cache parent %s host %s is not selected because its download state is %s", parent.ID, parent.Host.ID, parent.FSM.Current())
		case parent.Host.DisableShared:
			// A parent on a host with sharing disabled cannot be selected.
			task.Log.Debugf("persistent cache parent %s host %s is not selected because it is disable shared", parent.ID, parent.Host.ID)
		default:
			selected = append(selected, parent)
			selectedIDs = append(selectedIDs, parent.ID)
		}
	}

	task.Log.Infof("filter cached parents is %#v", selectedIDs)
	return selected
}
// filterReplicatePersistentCacheHosts asks the host manager for count random
// hosts outside the blocklist and keeps the ones that have sharing enabled and
// whose free disk space is at least the task's content length. It returns nil
// when the hosts can not be loaded.
func (s *scheduling) filterReplicatePersistentCacheHosts(ctx context.Context, task *persistentcache.Task, count int, blocklist set.SafeSet[string]) []*persistentcache.Host {
	hosts, err := s.persistentCacheResource.HostManager().LoadRandom(ctx, count, blocklist)
	if err != nil {
		task.Log.Error(fmt.Errorf("load all persistent cache hosts failed: %w", err))
		return nil
	}

	var (
		selected    []*persistentcache.Host
		selectedIDs []string
	)

	for i := range hosts {
		host := hosts[i]

		switch {
		case host.DisableShared:
			// A host with sharing disabled cannot be selected.
			task.Log.Debugf("persistent cache host %s is not selected because it is disable shared", host.ID)
		case host.Disk.Free < task.ContentLength:
			// A host without enough free disk space cannot be selected.
			task.Log.Debugf("persistent cache host %s is not selected because its free disk space is not enough, free disk is %d, content length is %d",
				host.ID, host.Disk.Free, task.ContentLength)
		default:
			selected = append(selected, host)
			selectedIDs = append(selectedIDs, host.ID)
		}
	}

	task.Log.Infof("filter hosts is %#v", selectedIDs)
	return selected
}
// constructSuccessNormalTaskResponse converts the candidate parents into the
// NormalTaskResponse sent to a peer on successful scheduling. Each parent is
// described with its own attributes, its task and its host (cpu, memory,
// network, disk and build information).
// Used only in v2 version of the grpc.
func constructSuccessNormalTaskResponse(candidateParents []*standard.Peer) *schedulerv2.AnnouncePeerResponse_NormalTaskResponse {
	var parents []*commonv2.Peer
	for _, candidateParent := range candidateParents {
		srcTask, srcHost := candidateParent.Task, candidateParent.Host

		// Task of the parent.
		task := &commonv2.Task{
			Id:                  srcTask.ID,
			Type:                srcTask.Type,
			Url:                 srcTask.URL,
			Tag:                 &srcTask.Tag,
			Application:         &srcTask.Application,
			FilteredQueryParams: srcTask.FilteredQueryParams,
			RequestHeader:       srcTask.Header,
			ContentLength:       uint64(srcTask.ContentLength.Load()),
			PieceCount:          uint32(srcTask.TotalPieceCount.Load()),
			SizeScope:           srcTask.SizeScope(),
			State:               srcTask.FSM.Current(),
			PeerCount:           uint32(srcTask.PeerCount()),
			CreatedAt:           timestamppb.New(srcTask.CreatedAt.Load()),
			UpdatedAt:           timestamppb.New(srcTask.UpdatedAt.Load()),
		}

		// The digest is only set when the task carries one.
		if srcTask.Digest != nil {
			dgst := srcTask.Digest.String()
			task.Digest = &dgst
		}

		// Host of the parent, assembled from its sub-messages.
		cpu := &commonv2.CPU{
			LogicalCount:   srcHost.CPU.LogicalCount,
			PhysicalCount:  srcHost.CPU.PhysicalCount,
			Percent:        srcHost.CPU.Percent,
			ProcessPercent: srcHost.CPU.ProcessPercent,
			Times: &commonv2.CPUTimes{
				User:      srcHost.CPU.Times.User,
				System:    srcHost.CPU.Times.System,
				Idle:      srcHost.CPU.Times.Idle,
				Nice:      srcHost.CPU.Times.Nice,
				Iowait:    srcHost.CPU.Times.Iowait,
				Irq:       srcHost.CPU.Times.Irq,
				Softirq:   srcHost.CPU.Times.Softirq,
				Steal:     srcHost.CPU.Times.Steal,
				Guest:     srcHost.CPU.Times.Guest,
				GuestNice: srcHost.CPU.Times.GuestNice,
			},
		}

		memory := &commonv2.Memory{
			Total:              srcHost.Memory.Total,
			Available:          srcHost.Memory.Available,
			Used:               srcHost.Memory.Used,
			UsedPercent:        srcHost.Memory.UsedPercent,
			ProcessUsedPercent: srcHost.Memory.ProcessUsedPercent,
			Free:               srcHost.Memory.Free,
		}

		network := &commonv2.Network{
			TcpConnectionCount:       srcHost.Network.TCPConnectionCount,
			UploadTcpConnectionCount: srcHost.Network.UploadTCPConnectionCount,
			Location:                 &srcHost.Network.Location,
			Idc:                      &srcHost.Network.IDC,
			DownloadRate:             srcHost.Network.DownloadRate,
			DownloadRateLimit:        srcHost.Network.DownloadRateLimit,
			UploadRate:               srcHost.Network.UploadRate,
			UploadRateLimit:          srcHost.Network.UploadRateLimit,
		}

		disk := &commonv2.Disk{
			Total:             srcHost.Disk.Total,
			Free:              srcHost.Disk.Free,
			Used:              srcHost.Disk.Used,
			UsedPercent:       srcHost.Disk.UsedPercent,
			InodesTotal:       srcHost.Disk.InodesTotal,
			InodesUsed:        srcHost.Disk.InodesUsed,
			InodesFree:        srcHost.Disk.InodesFree,
			InodesUsedPercent: srcHost.Disk.InodesUsedPercent,
			WriteBandwidth:    srcHost.Disk.WriteBandwidth,
			ReadBandwidth:     srcHost.Disk.ReadBandwidth,
		}

		build := &commonv2.Build{
			GitVersion: srcHost.Build.GitVersion,
			GitCommit:  &srcHost.Build.GitCommit,
			GoVersion:  &srcHost.Build.GoVersion,
			Platform:   &srcHost.Build.Platform,
		}

		host := &commonv2.Host{
			Id:              srcHost.ID,
			Type:            uint32(srcHost.Type),
			Hostname:        srcHost.Hostname,
			Ip:              srcHost.IP,
			Port:            srcHost.Port,
			DownloadPort:    srcHost.DownloadPort,
			Os:              srcHost.OS,
			Platform:        srcHost.Platform,
			PlatformFamily:  srcHost.PlatformFamily,
			PlatformVersion: srcHost.PlatformVersion,
			KernelVersion:   srcHost.KernelVersion,
			Cpu:             cpu,
			Memory:          memory,
			Network:         network,
			Disk:            disk,
			Build:           build,
		}

		// The parent itself.
		parent := &commonv2.Peer{
			Id:               candidateParent.ID,
			Priority:         candidateParent.Priority,
			Cost:             durationpb.New(candidateParent.Cost.Load()),
			State:            candidateParent.FSM.Current(),
			NeedBackToSource: candidateParent.NeedBackToSource.Load(),
			CreatedAt:        timestamppb.New(candidateParent.CreatedAt.Load()),
			UpdatedAt:        timestamppb.New(candidateParent.UpdatedAt.Load()),
			Task:             task,
			Host:             host,
		}

		// The range is only set when the parent carries one.
		if candidateParent.Range != nil {
			parent.Range = &commonv2.Range{
				Start:  uint64(candidateParent.Range.Start),
				Length: uint64(candidateParent.Range.Length),
			}
		}

		parents = append(parents, parent)
	}

	response := &schedulerv2.NormalTaskResponse{CandidateParents: parents}
	return &schedulerv2.AnnouncePeerResponse_NormalTaskResponse{NormalTaskResponse: response}
}
// constructSuccessPeerPacket builds the Code_Success PeerPacket sent to peer on
// successful scheduling: parent becomes the main peer and candidateParents are
// listed as candidate peers, each identified by host IP, host port and peer ID.
// Used only in v1 version of the grpc.
func constructSuccessPeerPacket(peer *standard.Peer, parent *standard.Peer, candidateParents []*standard.Peer) *schedulerv1.PeerPacket {
	// destPeer describes a standard peer as a packet destination.
	destPeer := func(p *standard.Peer) *schedulerv1.PeerPacket_DestPeer {
		return &schedulerv1.PeerPacket_DestPeer{
			Ip:      p.Host.IP,
			RpcPort: p.Host.Port,
			PeerId:  p.ID,
		}
	}

	var candidatePeers []*schedulerv1.PeerPacket_DestPeer
	for i := range candidateParents {
		candidatePeers = append(candidatePeers, destPeer(candidateParents[i]))
	}

	packet := &schedulerv1.PeerPacket{
		TaskId:         peer.Task.ID,
		SrcPid:         peer.ID,
		MainPeer:       destPeer(parent),
		CandidatePeers: candidatePeers,
		Code:           commonv1.Code_Success,
	}

	return packet
}