dragonfly/scheduler/core/service.go

/*
* Copyright 2020 The Dragonfly Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package core

import (
	"context"
	"sync"
	"time"

	"d7y.io/dragonfly/v2/internal/dfcodes"
	logger "d7y.io/dragonfly/v2/internal/dflog"
	"d7y.io/dragonfly/v2/internal/idgen"
	"d7y.io/dragonfly/v2/pkg/rpc/base"
	"d7y.io/dragonfly/v2/pkg/rpc/base/common"
	schedulerRPC "d7y.io/dragonfly/v2/pkg/rpc/scheduler"
	"d7y.io/dragonfly/v2/pkg/synclock"
	"d7y.io/dragonfly/v2/scheduler/config"
	"d7y.io/dragonfly/v2/scheduler/core/scheduler"
	"d7y.io/dragonfly/v2/scheduler/supervisor"
	"d7y.io/dragonfly/v2/scheduler/supervisor/cdn"
	"d7y.io/dragonfly/v2/scheduler/supervisor/host"
	"d7y.io/dragonfly/v2/scheduler/supervisor/peer"
	"d7y.io/dragonfly/v2/scheduler/supervisor/task"
	"github.com/pkg/errors"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"go.opentelemetry.io/otel/trace"
	"google.golang.org/grpc"
	"k8s.io/client-go/util/workqueue"
)
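
// SchedulerService wires the CDN, task, host and peer managers together with
// the scheduling algorithm and the event worker loop, and backs the
// scheduler's RPC handlers.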
type SchedulerService struct {
	// cdn mgr
	cdnManager supervisor.CDNMgr
	// task mgr
	taskManager supervisor.TaskMgr
	// host mgr
	hostManager supervisor.HostMgr
	// peer mgr
	peerManager supervisor.PeerMgr

	sched     scheduler.Scheduler
	worker    worker
	config    *config.SchedulerConfig
	monitor   *monitor
	startOnce sync.Once
	stopOnce  sync.Once
	done      chan struct{}
	wg        sync.WaitGroup
}
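
// NewSchedulerService constructs a SchedulerService from the given
// configuration. When openTel is true, the CDN gRPC client is instrumented
// with OpenTelemetry unary and stream interceptors.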
func NewSchedulerService(cfg *config.SchedulerConfig, dynConfig config.DynconfigInterface, openTel bool) (*SchedulerService, error) {
	hostManager := host.NewManager()
	peerManager := peer.NewManager(cfg.GC, hostManager)

	var opts []grpc.DialOption
	if openTel {
		opts = append(opts, grpc.WithChainUnaryInterceptor(otelgrpc.UnaryClientInterceptor()), grpc.WithChainStreamInterceptor(otelgrpc.StreamClientInterceptor()))
	}
	cdnClient, err := cdn.NewRefreshableCDNClient(dynConfig, opts)
	if err != nil {
		return nil, errors.Wrap(err, "new refreshable cdn client")
	}
	cdnManager, err := cdn.NewManager(cdnClient, peerManager, hostManager)
	if err != nil {
		return nil, errors.Wrap(err, "new cdn manager")
	}
	taskManager := task.NewManager(cfg.GC, peerManager)

	sched, err := scheduler.Get(cfg.Scheduler).Build(cfg, &scheduler.BuildOptions{
		PeerManager: peerManager,
	})
	if err != nil {
		return nil, errors.Wrapf(err, "build scheduler %v", cfg.Scheduler)
	}
	work := newEventLoopGroup(cfg.WorkerNum)
	downloadMonitor := newMonitor(cfg.OpenMonitor, peerManager)
	return &SchedulerService{
		cdnManager:  cdnManager,
		taskManager: taskManager,
		hostManager: hostManager,
		peerManager: peerManager,
		worker:      work,
		monitor:     downloadMonitor,
		sched:       sched,
		config:      cfg,
		done:        make(chan struct{}),
	}, nil
}
func (s *SchedulerService) Serve() {
	s.startOnce.Do(func() {
		s.wg.Add(3)
		wsdq := workqueue.NewNamedDelayingQueue("wait reSchedule parent")
		go s.runWorkerLoop(wsdq)
		go s.runReScheduleParentLoop(wsdq)
		go s.runMonitor()
		logger.Debugf("scheduler service started successfully")
	})
}
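
// runWorkerLoop starts the worker group with a state built from the
// scheduler, the peer and CDN managers and the delaying queue.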
func (s *SchedulerService) runWorkerLoop(wsdq workqueue.DelayingInterface) {
	defer s.wg.Done()
	s.worker.start(newState(s.sched, s.peerManager, s.cdnManager, wsdq))
}
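
// runReScheduleParentLoop takes peers that are waiting for a new parent off
// the delaying queue and re-submits them to the worker as
// reScheduleParentEvents, until the service is stopped.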
func (s *SchedulerService) runReScheduleParentLoop(wsdq workqueue.DelayingInterface) {
	defer s.wg.Done()
	for {
		select {
		case <-s.done:
			wsdq.ShutDown()
			return
		default:
			v, shutdown := wsdq.Get()
			if shutdown {
				// The queue has been shut down; a bare break would only leave
				// the select and spin on Get again, so return instead.
				return
			}
			peer := v.(*supervisor.Peer)
			wsdq.Done(v)
			if peer.IsDone() || peer.IsLeave() {
				logger.WithTaskAndPeerID(peer.Task.TaskID,
					peer.PeerID).Debugf("runReScheduleParentLoop: peer was dropped from waitScheduleParentPeerQueue because it is done or has left, "+
					"peer status is %s, isLeave %t", peer.GetStatus(), peer.IsLeave())
				continue
			}
			s.worker.send(reScheduleParentEvent{peer})
		}
	}
}
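
// runMonitor starts the download monitor, if one was configured.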
func (s *SchedulerService) runMonitor() {
	defer s.wg.Done()
	if s.monitor != nil {
		s.monitor.start(s.done)
	}
}
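
// Stop shuts the service down at most once: it closes the done channel so
// the background loops exit, then stops the worker group.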
func (s *SchedulerService) Stop() {
	s.stopOnce.Do(func() {
		close(s.done)
		if s.worker != nil {
			s.worker.stop()
		}
	})
}
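
// GenerateTaskID derives a task ID from the URL and its metadata. Under A/B
// testing the peer ID is also taken into account, so peers can be split
// between twin tasks.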
func (s *SchedulerService) GenerateTaskID(url string, meta *base.UrlMeta, peerID string) (taskID string) {
	if s.config.ABTest {
		return idgen.TwinsTaskID(url, meta, peerID)
	}
	return idgen.TaskID(url, meta)
}
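
// ScheduleParent asks the scheduler for a parent peer, returning an error
// when none is available.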
func (s *SchedulerService) ScheduleParent(peer *supervisor.Peer) (parent *supervisor.Peer, err error) {
	parent, _, hasParent := s.sched.ScheduleParent(peer)
	// logger.Debugf("schedule parent result: parent %v, candidates: %v", parent, candidates)
	if !hasParent || parent == nil {
		return nil, errors.Errorf("no parent peer available for peer %v", peer.PeerID)
	}
	return parent, nil
}
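
// GetPeerTask looks up a peer by its ID.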
func (s *SchedulerService) GetPeerTask(peerTaskID string) (peerTask *supervisor.Peer, ok bool) {
	return s.peerManager.Get(peerTaskID)
}
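
// RegisterPeerTask registers a peer for the given task, creating the peer's
// host record first if it has not been seen before.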
func (s *SchedulerService) RegisterPeerTask(req *schedulerRPC.PeerTaskRequest, task *supervisor.Task) (*supervisor.Peer, error) {
	// get or create host
	reqPeerHost := req.PeerHost
	var (
		peer     *supervisor.Peer
		ok       bool
		peerHost *supervisor.PeerHost
	)
	if peerHost, ok = s.hostManager.Get(reqPeerHost.Uuid); !ok {
		peerHost = supervisor.NewClientPeerHost(reqPeerHost.Uuid, reqPeerHost.Ip, reqPeerHost.HostName, reqPeerHost.RpcPort, reqPeerHost.DownPort,
			reqPeerHost.SecurityDomain, reqPeerHost.Location, reqPeerHost.Idc, reqPeerHost.NetTopology, s.config.ClientLoad)
		s.hostManager.Add(peerHost)
	}
	// get or create peer task
	if peer, ok = s.peerManager.Get(req.PeerId); !ok {
		peer = supervisor.NewPeer(req.PeerId, task, peerHost)
		s.peerManager.Add(peer)
	}
	return peer, nil
}
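
// GetOrCreateTask returns the task as-is when it is healthy or was triggered
// within the access window; otherwise it triggers CDN seeding asynchronously
// and reports the outcome back through worker events.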
func (s *SchedulerService) GetOrCreateTask(ctx context.Context, task *supervisor.Task) (*supervisor.Task, error) {
	span := trace.SpanFromContext(ctx)
	synclock.Lock(task.TaskID, true)
	task, ok := s.taskManager.GetOrAdd(task)
	if ok {
		if task.GetLastTriggerTime().Add(s.config.AccessWindow).After(time.Now()) || task.IsHealth() {
			synclock.UnLock(task.TaskID, true)
			span.SetAttributes(config.AttributeNeedSeedCDN.Bool(false))
			return task, nil
		}
	}
	synclock.UnLock(task.TaskID, true)
	// do trigger
	task.UpdateLastTriggerTime(time.Now())
	// register cdn peer task
	// notify peer tasks
	synclock.Lock(task.TaskID, false)
	defer synclock.UnLock(task.TaskID, false)
	if task.IsHealth() && task.GetLastTriggerTime().Add(s.config.AccessWindow).After(time.Now()) {
		return task, nil
	}
	if task.IsFrozen() {
		task.SetStatus(supervisor.TaskStatusRunning)
	}
	span.SetAttributes(config.AttributeNeedSeedCDN.Bool(true))
	go func() {
		if cdnPeer, err := s.cdnManager.StartSeedTask(ctx, task); err != nil {
			if errors.Cause(err) != cdn.ErrCDNInvokeFail {
				task.SetStatus(supervisor.TaskStatusFailed)
			}
			logger.Errorf("failed to seed task: %v", err)
			if ok := s.worker.send(taskSeedFailEvent{task}); !ok {
				logger.Error("failed to send taskSeedFailEvent, eventLoop is shutdown")
			}
		} else {
			if ok := s.worker.send(peerDownloadSuccessEvent{cdnPeer, nil}); !ok {
				logger.Error("failed to send peerDownloadSuccessEvent, eventLoop is shutdown")
			}
			logger.Debugf("successfully obtained seeds from cdn, task: %+v", task)
		}
	}()
	return task, nil
}
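
// HandlePieceResult refreshes the peer's access time and dispatches the piece
// result to the worker: the zero piece starts piece-result reporting, a
// successful piece raises a success event, and a failed piece with a
// non-success code raises a fail event.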
func (s *SchedulerService) HandlePieceResult(ctx context.Context, peer *supervisor.Peer, pieceResult *schedulerRPC.PieceResult) error {
	peer.Touch()
	switch {
	case pieceResult.PieceNum == common.ZeroOfPiece:
		s.worker.send(startReportPieceResultEvent{ctx, peer})
	case pieceResult.Success:
		s.worker.send(peerDownloadPieceSuccessEvent{
			ctx:  ctx,
			peer: peer,
			pr:   pieceResult,
		})
	case pieceResult.Code != dfcodes.Success:
		s.worker.send(peerDownloadPieceFailEvent{
			peer: peer,
			pr:   pieceResult,
		})
	}
	return nil
}
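
// HandlePeerResult refreshes the peer's access time and raises a download
// success or fail event for the finished peer.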
func (s *SchedulerService) HandlePeerResult(ctx context.Context, peer *supervisor.Peer, peerResult *schedulerRPC.PeerResult) error {
	peer.Touch()
	if peerResult.Success {
		if !s.worker.send(peerDownloadSuccessEvent{peer: peer, peerResult: peerResult}) {
			logger.Errorf("failed to send peer download success event")
		}
	} else if !s.worker.send(peerDownloadFailEvent{peer: peer, peerResult: peerResult}) {
		logger.Errorf("failed to send peer download fail event")
	}
	return nil
}
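
// HandleLeaveTask refreshes the peer's access time and raises a peer leave
// event for the departing peer.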
func (s *SchedulerService) HandleLeaveTask(ctx context.Context, peer *supervisor.Peer) error {
	peer.Touch()
	if !s.worker.send(peerLeaveEvent{
		ctx:  ctx,
		peer: peer,
	}) {
		logger.Errorf("failed to send peer leave event")
	}
	return nil
}