// dragonfly/client/daemon/peer/peertask_base.go
/*
* Copyright 2020 The Dragonfly Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package peer

import (
"context"
"fmt"
"io"
"runtime/debug"
"sync"
"time"

"github.com/pkg/errors"
"go.opentelemetry.io/otel/trace"
"go.uber.org/atomic"
"golang.org/x/time/rate"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"d7y.io/dragonfly/v2/client/config"
"d7y.io/dragonfly/v2/internal/dferrors"
logger "d7y.io/dragonfly/v2/internal/dflog"
"d7y.io/dragonfly/v2/pkg/retry"
"d7y.io/dragonfly/v2/pkg/rpc/base"
dfclient "d7y.io/dragonfly/v2/pkg/rpc/dfdaemon/client"
"d7y.io/dragonfly/v2/pkg/rpc/scheduler"
schedulerclient "d7y.io/dragonfly/v2/pkg/rpc/scheduler/client"
)
const (
reasonScheduleTimeout = "wait first peer packet from scheduler timeout"
reasonReScheduleTimeout = "wait more available peers from scheduler timeout"
reasonContextCanceled = "context canceled"
reasonPeerGoneFromScheduler = "scheduler says client should disconnect"
reasonBackSourceDisabled = "download from source disabled"
failedReasonNotSet = "unknown"
failedCodeNotSet = 0
)
var errPeerPacketChanged = errors.New("peer packet changed")
var _ Task = (*peerTask)(nil)
type peerTask struct {
*logger.SugaredLoggerOnWith
ctx context.Context
cancel context.CancelFunc
// needBackSource indicates downloading the resource from source instead of other peers
needBackSource bool
backSourceFunc func()
reportPieceResultFunc func(result *pieceTaskResult) error
setContentLengthFunc func(i int64) error
setTotalPiecesFunc func(i int32)
request *scheduler.PeerTaskRequest
// pieceManager will be used for downloading piece
pieceManager PieceManager
// host info about current host
host *scheduler.PeerHost
// callback holds some actions, like init, done, fail actions
callback TaskCallback
// schedule options
schedulerOption config.SchedulerOption
schedulerClient schedulerclient.SchedulerClient
// peer task meta info
peerID string
taskID string
totalPiece int32
md5 string
contentLength *atomic.Int64
completedLength *atomic.Int64
usedTraffic *atomic.Int64
//sizeScope base.SizeScope
singlePiece *scheduler.SinglePiece
// TODO peerPacketStream
peerPacketStream schedulerclient.PeerPacketStream
// peerPacket is the latest available peers from peerPacketCh
peerPacket atomic.Value // *scheduler.PeerPacket
// peerPacketReady will receive a ready signal for peerPacket ready
peerPacketReady chan bool
// pieceParallelCount stands for the piece parallel count from peerPacket
pieceParallelCount *atomic.Int32
// done channel will be closed when the peer task is finished
done chan struct{}
// success will be true after the peer task is done
success bool
// span stands for the open telemetry trace span
span trace.Span
// some actions must be done only once, such as closing the done channel
once sync.Once
// failedPieceCh will hold all pieces which failed to download,
// those pieces will be retried later
failedPieceCh chan int32
// failedReason will be set when the peer task fails
failedReason string
// failedCode will be set when the peer task fails
failedCode base.Code
// readyPieces stands for the download status of all pieces
readyPieces *Bitmap
// requestedPieces stands for all pieces requested from peers
requestedPieces *Bitmap
// lock is used for piece result management; acquire it before updating readyPieces
lock sync.RWMutex
// limiter will be used when the per peer task rate limit is enabled
limiter *rate.Limiter
}
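
// pieceTaskResult bundles a downloaded piece with the piece result that will
// be reported to the scheduler and any error from the download attempt.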
type pieceTaskResult struct {
piece *base.PieceInfo
pieceResult *scheduler.PieceResult
notRetry bool
err error
}
func (pt *peerTask) ReportPieceResult(result *pieceTaskResult) error {
return pt.reportPieceResultFunc(result)
}
func (pt *peerTask) SetCallback(callback TaskCallback) {
pt.callback = callback
}
func (pt *peerTask) GetPeerID() string {
return pt.peerID
}
func (pt *peerTask) GetTaskID() string {
return pt.taskID
}
func (pt *peerTask) GetContentLength() int64 {
return pt.contentLength.Load()
}
func (pt *peerTask) SetContentLength(i int64) error {
return pt.setContentLengthFunc(i)
}
func (pt *peerTask) AddTraffic(n int64) {
pt.usedTraffic.Add(n)
}
func (pt *peerTask) GetTraffic() int64 {
return pt.usedTraffic.Load()
}
func (pt *peerTask) GetTotalPieces() int32 {
return pt.totalPiece
}
func (pt *peerTask) SetTotalPieces(i int32) {
pt.setTotalPiecesFunc(i)
}
func (pt *peerTask) SetPieceMd5Sign(md5 string) {
pt.md5 = md5
}
func (pt *peerTask) GetPieceMd5Sign() string {
return pt.md5
}
func (pt *peerTask) Context() context.Context {
return pt.ctx
}
func (pt *peerTask) Log() *logger.SugaredLoggerOnWith {
return pt.SugaredLoggerOnWith
}
func (pt *peerTask) backSource() {
pt.backSourceFunc()
}
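
// pullPieces starts the piece download pipeline. When the scheduler already
// returned a single piece, it is downloaded directly; otherwise one goroutine
// receives peer packets from the scheduler while another pulls pieces from
// the discovered peers.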
func (pt *peerTask) pullPieces(cleanUnfinishedFunc func()) {
// when there is a single piece, try to download it first
if pt.singlePiece != nil {
go pt.pullSinglePiece(cleanUnfinishedFunc)
} else {
go pt.receivePeerPacket()
go pt.pullPiecesFromPeers(cleanUnfinishedFunc)
}
}
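
// receivePeerPacket runs in its own goroutine. It keeps reading PeerPacket
// messages from the scheduler stream, stores the latest available peers in
// pt.peerPacket and signals waiters via pt.peerPacketReady. It exits on
// stream EOF, fatal errors, task completion or context cancellation; when the
// scheduler answers with Code_SchedNeedBackSource, it closes
// pt.peerPacketReady so waiters switch to downloading from the source.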
func (pt *peerTask) receivePeerPacket() {
var (
peerPacket *scheduler.PeerPacket
err error
firstSpanDone bool
)
// only record the first schedule result;
// other schedule results will be recorded as events in the peer task span
_, firstPeerSpan := tracer.Start(pt.ctx, config.SpanFirstSchedule)
defer func() {
if !firstSpanDone {
firstPeerSpan.End()
}
}()
loop:
for {
select {
case <-pt.ctx.Done():
pt.Infof("context done due to %s", pt.ctx.Err())
break loop
case <-pt.done:
pt.Infof("peer task done, stop wait peer packet from scheduler")
break loop
default:
}
peerPacket, err = pt.peerPacketStream.Recv()
if err == io.EOF {
pt.Debugf("peerPacketStream closed")
break loop
}
if err != nil {
// on success the context will be canceled; check pt.success to distinguish that from a real failure
if pt.success {
return
}
pt.failedCode = base.Code_UnknownError
if de, ok := err.(*dferrors.DfError); ok {
if de.Code == base.Code_SchedNeedBackSource {
pt.needBackSource = true
close(pt.peerPacketReady)
return
}
pt.failedCode = de.Code
pt.failedReason = de.Message
pt.Errorf("receive peer packet failed: %s", pt.failedReason)
} else {
pt.Errorf("receive peer packet failed: %s", err)
}
pt.cancel()
if !firstSpanDone {
firstPeerSpan.RecordError(err)
}
break loop
}
logger.Debugf("receive peerPacket %v for peer %s", peerPacket, pt.peerID)
if peerPacket.Code != base.Code_Success {
pt.Errorf("receive peer packet with error: %d", peerPacket.Code)
if pt.isExitPeerPacketCode(peerPacket) {
pt.cancel()
pt.Errorf("%s", pt.failedReason)
if !firstSpanDone {
firstPeerSpan.RecordError(errors.New(pt.failedReason))
}
pt.span.AddEvent("receive exit peer packet",
trace.WithAttributes(config.AttributePeerPacketCode.Int(int(peerPacket.Code))))
pt.span.RecordError(errors.New(pt.failedReason))
break
} else {
pt.span.AddEvent("receive not success peer packet",
trace.WithAttributes(config.AttributePeerPacketCode.Int(int(peerPacket.Code))))
}
continue
}
// MainPeer is required below, both for logging and for preparing piece tasks
if peerPacket.MainPeer == nil {
pt.Warnf("scheduler client sent a peerPacket without main peer")
continue
}
pt.Infof("receive new peer packet, main peer: %s, parallel count: %d",
peerPacket.MainPeer.PeerId, peerPacket.ParallelCount)
pt.span.AddEvent("receive new peer packet",
trace.WithAttributes(config.AttributeMainPeer.String(peerPacket.MainPeer.PeerId)))
if !firstSpanDone {
firstSpanDone = true
firstPeerSpan.SetAttributes(config.AttributeMainPeer.String(peerPacket.MainPeer.PeerId))
firstPeerSpan.End()
}
pt.peerPacket.Store(peerPacket)
pt.pieceParallelCount.Store(peerPacket.ParallelCount)
select {
case pt.peerPacketReady <- true:
case <-pt.ctx.Done():
pt.Infof("context done due to %s", pt.ctx.Err())
break loop
case <-pt.done:
pt.Infof("peer task done, stop wait peer packet from scheduler")
break loop
default:
}
}
}
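
// isExitPeerPacketCode reports whether the peer packet carries a fatal code
// (request or resource errors, scheduler errors, or CDN errors). For fatal
// codes it also records pt.failedCode and pt.failedReason for the caller.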
func (pt *peerTask) isExitPeerPacketCode(pp *scheduler.PeerPacket) bool {
switch pp.Code {
case base.Code_ResourceLacked, base.Code_BadRequest, base.Code_PeerTaskNotFound, base.Code_UnknownError, base.Code_RequestTimeOut:
// 1xxx
pt.failedCode = pp.Code
pt.failedReason = fmt.Sprintf("receive exit peer packet with code %d", pp.Code)
return true
case base.Code_SchedError:
// 5xxx
pt.failedCode = pp.Code
pt.failedReason = fmt.Sprintf("receive exit peer packet with code %d", pp.Code)
return true
case base.Code_SchedPeerGone:
pt.failedReason = reasonPeerGoneFromScheduler
pt.failedCode = base.Code_SchedPeerGone
return true
case base.Code_CDNError, base.Code_CDNTaskRegistryFail, base.Code_CDNTaskDownloadFail:
// 6xxx
pt.failedCode = pp.Code
pt.failedReason = fmt.Sprintf("receive exit peer packet with code %d", pp.Code)
return true
}
return false
}
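
// pullSinglePiece downloads a task that consists of exactly one piece from
// the destination peer chosen by the scheduler, falling back to the normal
// multi-peer download path when that single download fails.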
func (pt *peerTask) pullSinglePiece(cleanUnfinishedFunc func()) {
pt.Infof("single piece, dest peer id: %s, piece num: %d, size: %d",
pt.singlePiece.DstPid, pt.singlePiece.PieceInfo.PieceNum, pt.singlePiece.PieceInfo.RangeSize)
ctx, span := tracer.Start(pt.ctx, fmt.Sprintf(config.SpanDownloadPiece, pt.singlePiece.PieceInfo.PieceNum))
span.SetAttributes(config.AttributePiece.Int(int(pt.singlePiece.PieceInfo.PieceNum)))
pt.contentLength.Store(int64(pt.singlePiece.PieceInfo.RangeSize))
pt.SetTotalPieces(1)
pt.SetPieceMd5Sign(pt.singlePiece.PieceInfo.PieceMd5)
if err := pt.callback.Init(pt); err != nil {
pt.failedReason = err.Error()
pt.failedCode = base.Code_ClientError
cleanUnfinishedFunc()
span.RecordError(err)
span.SetAttributes(config.AttributePieceSuccess.Bool(false))
span.End()
return
}
request := &DownloadPieceRequest{
TaskID: pt.GetTaskID(),
DstPid: pt.singlePiece.DstPid,
DstAddr: pt.singlePiece.DstAddr,
piece: pt.singlePiece.PieceInfo,
log: pt.Log(),
}
if pt.pieceManager.DownloadPiece(ctx, pt, request) {
pt.Infof("single piece download success")
span.SetAttributes(config.AttributePieceSuccess.Bool(true))
span.End()
} else {
// fallback to download from other peers
span.SetAttributes(config.AttributePieceSuccess.Bool(false))
span.End()
pt.Warnf("single piece download failed, switch to download from other peers")
go pt.receivePeerPacket()
pt.pullPiecesFromPeers(cleanUnfinishedFunc)
}
}
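
// pullPiecesFromPeers is the main download loop. Each iteration checks exit
// signals and failed pieces, asks the current peers for piece metadata,
// dispatches download requests to the workers, and then advances to the next
// piece that has not been requested yet; once all pieces are requested, it
// only waits for failed pieces to retry.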
func (pt *peerTask) pullPiecesFromPeers(cleanUnfinishedFunc func()) {
defer func() {
cleanUnfinishedFunc()
}()
if ok, backSource := pt.waitFirstPeerPacket(); !ok {
if backSource {
return
}
pt.Errorf("wait first peer packet error")
return
}
var (
num int32
ok bool
limit int32
initialized bool
pieceRequestCh chan *DownloadPieceRequest
// keep the same size as pt.failedPieceCh to avoid deadlock
pieceBufferSize = int32(config.DefaultPieceChanSize)
)
limit = pieceBufferSize
loop:
for {
// 1. check whether we caught an exit signal or got a failed piece;
// if we got nothing, process normal pieces
select {
case <-pt.done:
pt.Infof("peer task done, stop get pieces from peer")
break loop
case <-pt.ctx.Done():
pt.Debugf("context done due to %s", pt.ctx.Err())
if !pt.success {
if pt.failedCode == failedCodeNotSet {
pt.failedReason = reasonContextCanceled
pt.failedCode = base.Code_ClientContextCanceled
if err := pt.callback.Fail(pt, pt.failedCode, pt.ctx.Err().Error()); err != nil {
pt.Errorf("peer task callback failed %s", err)
}
} else {
if err := pt.callback.Fail(pt, pt.failedCode, pt.failedReason); err != nil {
pt.Errorf("peer task callback failed %s", err)
}
}
}
break loop
case failed := <-pt.failedPieceCh:
pt.Warnf("download piece %d failed, retry", failed)
num = failed
limit = 1
default:
}
// 2. try to get pieces
pt.Debugf("try to get pieces, number: %d, limit: %d", num, limit)
piecePacket, err := pt.preparePieceTasks(
&base.PieceTaskRequest{
TaskId: pt.taskID,
SrcPid: pt.peerID,
StartNum: num,
Limit: limit,
})
if err != nil {
pt.Warnf("get piece task error: %s, wait available peers from scheduler", err.Error())
pt.span.RecordError(err)
if num, ok = pt.waitAvailablePeerPacket(); !ok {
break loop
}
continue loop
}
if !initialized {
initialized = true
if pieceRequestCh, ok = pt.init(piecePacket, pieceBufferSize); !ok {
break loop
}
}
// update total piece
if piecePacket.TotalPiece > pt.totalPiece {
pt.totalPiece = piecePacket.TotalPiece
_ = pt.callback.Update(pt)
pt.Debugf("update total piece count: %d", pt.totalPiece)
}
// update md5 digest
if len(piecePacket.PieceMd5Sign) > 0 && len(pt.md5) == 0 {
pt.md5 = piecePacket.PieceMd5Sign
_ = pt.callback.Update(pt)
pt.Debugf("update digest: %s", pt.md5)
}
// 3. dispatch piece request to all workers
pt.dispatchPieceRequest(pieceRequestCh, piecePacket)
// 4. get next piece
num = pt.getNextPieceNum(num)
if num != -1 {
// got the next piece successfully
limit = pieceBufferSize
continue
}
pt.Infof("all pieces requests send, just wait failed pieces")
// just need one piece
limit = 1
// get failed piece
if num, ok = pt.waitFailedPiece(); !ok {
// when ok == false, we need to break the loop
break loop
}
}
}
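
// init runs once after the first piece packet arrives: it stores the content
// length, invokes the Init callback, and spawns ParallelCount download
// workers that all consume from a single buffered piece request channel.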
func (pt *peerTask) init(piecePacket *base.PiecePacket, pieceBufferSize int32) (chan *DownloadPieceRequest, bool) {
pt.contentLength.Store(piecePacket.ContentLength)
if pt.contentLength.Load() > 0 {
pt.span.SetAttributes(config.AttributeTaskContentLength.Int64(pt.contentLength.Load()))
}
if err := pt.callback.Init(pt); err != nil {
pt.span.RecordError(err)
pt.failedReason = err.Error()
pt.failedCode = base.Code_ClientError
return nil, false
}
pc := pt.peerPacket.Load().(*scheduler.PeerPacket).ParallelCount
pieceRequestCh := make(chan *DownloadPieceRequest, pieceBufferSize)
for i := int32(0); i < pc; i++ {
go pt.downloadPieceWorker(i, pt, pieceRequestCh)
}
return pieceRequestCh, true
}
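
// waitFirstPeerPacket blocks until the scheduler delivers the first available
// peer, the scheduler asks the client to back-source, the schedule timeout
// fires, or the context is done. It returns (true, false) when a peer is
// ready, (false, true) when the download switched to the back-source path,
// and (false, false) on failure.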
func (pt *peerTask) waitFirstPeerPacket() (done bool, backSource bool) {
// wait first available peer
select {
case <-pt.ctx.Done():
err := pt.ctx.Err()
pt.Errorf("context done due to %s", err)
if pt.failedReason == failedReasonNotSet && err != nil {
pt.failedReason = err.Error()
}
pt.span.AddEvent(fmt.Sprintf("pulling pieces end due to %s", err))
case _, ok := <-pt.peerPacketReady:
if ok {
// preparePieceTasksByPeer func already sends piece result with error
pt.Infof("new peer client ready, scheduler time cost: %dus, main peer: %s",
time.Since(pt.callback.GetStartTime()).Microseconds(), pt.peerPacket.Load().(*scheduler.PeerPacket).MainPeer)
return true, false
}
// when scheduler says base.Code_SchedNeedBackSource, receivePeerPacket will close pt.peerPacketReady
pt.Infof("start download from source due to base.Code_SchedNeedBackSource")
pt.span.AddEvent("back source due to scheduler says need back source")
pt.needBackSource = true
pt.backSource()
return false, true
case <-time.After(pt.schedulerOption.ScheduleTimeout.Duration):
if pt.schedulerOption.DisableAutoBackSource {
pt.failedReason = reasonScheduleTimeout
pt.failedCode = base.Code_ClientScheduleTimeout
err := fmt.Errorf("%s, auto back source disabled", pt.failedReason)
pt.span.RecordError(err)
pt.Errorf(err.Error())
} else {
pt.Warnf("start download from source due to %s", reasonScheduleTimeout)
pt.span.AddEvent("back source due to schedule timeout")
pt.needBackSource = true
pt.backSource()
return false, true
}
}
return false, false
}
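
// waitAvailablePeerPacket waits for the scheduler to send more available
// peers after a piece request failed. It returns the next piece number to
// request and true to continue the loop; every other case (task done,
// context canceled, back-source, timeout) returns -1 and false.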
func (pt *peerTask) waitAvailablePeerPacket() (int32, bool) {
// only <-pt.peerPacketReady continues the loop; all other cases break it
select {
// when the peer task has no content length or total piece count, this case matches
case <-pt.done:
pt.Infof("peer task done, stop wait available peer packet")
case <-pt.ctx.Done():
pt.Debugf("context done due to %s", pt.ctx.Err())
if !pt.success {
if pt.failedCode == failedCodeNotSet {
pt.failedReason = reasonContextCanceled
pt.failedCode = base.Code_ClientContextCanceled
}
}
case _, ok := <-pt.peerPacketReady:
if ok {
// preparePieceTasksByPeer func already sends piece result with error
pt.Infof("new peer client ready, main peer: %s", pt.peerPacket.Load().(*scheduler.PeerPacket).MainPeer)
// re-search from piece 0
return pt.getNextPieceNum(0), true
}
// when scheduler says base.Code_SchedNeedBackSource, receivePeerPacket will close pt.peerPacketReady
pt.Infof("start download from source due to base.Code_SchedNeedBackSource")
pt.span.AddEvent("back source due to scheduler says need back source ")
pt.needBackSource = true
// TODO optimize back source when already downloaded some pieces
pt.backSource()
case <-time.After(pt.schedulerOption.ScheduleTimeout.Duration):
if pt.schedulerOption.DisableAutoBackSource {
pt.failedReason = reasonReScheduleTimeout
pt.failedCode = base.Code_ClientScheduleTimeout
err := fmt.Errorf("%s, auto back source disabled", pt.failedReason)
pt.span.RecordError(err)
pt.Errorf(err.Error())
} else {
pt.Warnf("start download from source due to %s", reasonReScheduleTimeout)
pt.span.AddEvent("back source due to schedule timeout")
pt.needBackSource = true
pt.backSource()
}
}
return -1, false
}
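
// dispatchPieceRequest marks each received piece as requested and hands a
// DownloadPieceRequest for it to the worker pool, bailing out early when the
// task finishes or the context is canceled.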
func (pt *peerTask) dispatchPieceRequest(pieceRequestCh chan *DownloadPieceRequest, piecePacket *base.PiecePacket) {
for _, piece := range piecePacket.PieceInfos {
pt.Infof("get piece %d from %s/%s, md5: %s, start: %d, size: %d",
piece.PieceNum, piecePacket.DstAddr, piecePacket.DstPid, piece.PieceMd5, piece.RangeStart, piece.RangeSize)
if !pt.requestedPieces.IsSet(piece.PieceNum) {
pt.requestedPieces.Set(piece.PieceNum)
}
req := &DownloadPieceRequest{
TaskID: pt.GetTaskID(),
DstPid: piecePacket.DstPid,
DstAddr: piecePacket.DstAddr,
piece: piece,
log: pt.Log(),
}
select {
case pieceRequestCh <- req:
case <-pt.done:
pt.Warnf("peer task done, but still some piece request not process")
case <-pt.ctx.Done():
pt.Warnf("context done due to %s", pt.ctx.Err())
if !pt.success {
if pt.failedCode == failedCodeNotSet {
pt.failedReason = reasonContextCanceled
pt.failedCode = base.Code_ClientContextCanceled
}
}
}
}
}
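
// waitFailedPiece blocks until a previously failed piece needs a retry and
// returns its number, or returns -1 when the task is completed, done, or
// canceled.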
func (pt *peerTask) waitFailedPiece() (int32, bool) {
if pt.isCompleted() {
return -1, false
}
// use a select without a default branch to wait for a failed piece or exit
select {
case <-pt.done:
pt.Infof("peer task done, stop to wait failed piece")
return -1, false
case <-pt.ctx.Done():
pt.Debugf("context done due to %s, stop to wait failed piece", pt.ctx.Err())
return -1, false
case failed := <-pt.failedPieceCh:
pt.Warnf("download piece/%d failed, retry", failed)
return failed, true
}
}
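
// downloadPieceWorker is one of the ParallelCount worker goroutines started
// by init. It skips pieces that are already downloaded, waits on the optional
// per-task rate limiter, then downloads the piece via the piece manager,
// recording one trace span per piece.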
func (pt *peerTask) downloadPieceWorker(id int32, pti Task, requests chan *DownloadPieceRequest) {
for {
select {
case request := <-requests:
pt.lock.RLock()
if pt.readyPieces.IsSet(request.piece.PieceNum) {
pt.lock.RUnlock()
pt.Log().Debugf("piece %d is already downloaded, skip", request.piece.PieceNum)
continue
}
pt.lock.RUnlock()
ctx, span := tracer.Start(pt.ctx, fmt.Sprintf(config.SpanDownloadPiece, request.piece.PieceNum))
span.SetAttributes(config.AttributePiece.Int(int(request.piece.PieceNum)))
span.SetAttributes(config.AttributePieceWorker.Int(int(id)))
if pt.limiter != nil {
_, waitSpan := tracer.Start(ctx, config.SpanWaitPieceLimit)
if err := pt.limiter.WaitN(pt.ctx, int(request.piece.RangeSize)); err != nil {
pt.Errorf("request limiter error: %s", err)
waitSpan.RecordError(err)
waitSpan.End()
if err := pti.ReportPieceResult(&pieceTaskResult{
piece: request.piece,
pieceResult: &scheduler.PieceResult{
TaskId: pt.GetTaskID(),
SrcPid: pt.GetPeerID(),
DstPid: request.DstPid,
PieceInfo: request.piece,
Success: false,
Code: base.Code_ClientRequestLimitFail,
HostLoad: nil,
FinishedCount: 0, // update by peer task
},
err: err,
}); err != nil {
pt.Errorf("report piece result failed %s", err)
}
pt.failedReason = err.Error()
pt.failedCode = base.Code_ClientRequestLimitFail
pt.cancel()
span.SetAttributes(config.AttributePieceSuccess.Bool(false))
span.End()
return
}
waitSpan.End()
}
pt.Debugf("peer download worker #%d receive piece task, "+
"dest peer id: %s, piece num: %d, range start: %d, range size: %d",
id, request.DstPid, request.piece.PieceNum, request.piece.RangeStart, request.piece.RangeSize)
success := pt.pieceManager.DownloadPiece(ctx, pti, request)
span.SetAttributes(config.AttributePieceSuccess.Bool(success))
span.End()
case <-pt.done:
pt.Debugf("peer task done, peer download worker #%d exit", id)
return
case <-pt.ctx.Done():
pt.Debugf("peer task context done, peer download worker #%d exit", id)
return
}
}
}
func (pt *peerTask) isCompleted() bool {
return pt.completedLength.Load() == pt.contentLength.Load()
}
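
// preparePieceTasks asks the main peer for piece metadata first and falls
// back to the steal peers one by one; whenever the peer packet changes while
// a request is in flight, it restarts with the fresh packet.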
func (pt *peerTask) preparePieceTasks(request *base.PieceTaskRequest) (p *base.PiecePacket, err error) {
defer pt.recoverFromPanic()
prepare:
peerPacket := pt.peerPacket.Load().(*scheduler.PeerPacket)
pt.pieceParallelCount.Store(peerPacket.ParallelCount)
request.DstPid = peerPacket.MainPeer.PeerId
p, err = pt.preparePieceTasksByPeer(peerPacket, peerPacket.MainPeer, request)
if err == nil {
return
}
if err == errPeerPacketChanged {
goto prepare
}
for _, peer := range peerPacket.StealPeers {
request.DstPid = peer.PeerId
p, err = pt.preparePieceTasksByPeer(peerPacket, peer, request)
if err == nil {
return
}
if err == errPeerPacketChanged {
goto prepare
}
}
return
}
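
// preparePieceTasksByPeer queries a single destination peer for piece tasks.
// On failure it reports a piece result with the mapped error code back to
// the scheduler, and for Code_CDNTaskNotFound it retries the same peer as
// long as the peer packet is unchanged.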
func (pt *peerTask) preparePieceTasksByPeer(curPeerPacket *scheduler.PeerPacket, peer *scheduler.PeerPacket_DestPeer, request *base.PieceTaskRequest) (*base.PiecePacket, error) {
if peer == nil {
return nil, fmt.Errorf("empty peer")
}
var span trace.Span
_, span = tracer.Start(pt.ctx, config.SpanGetPieceTasks)
span.SetAttributes(config.AttributeTargetPeerID.String(peer.PeerId))
span.SetAttributes(config.AttributeGetPieceStartNum.Int(int(request.StartNum)))
span.SetAttributes(config.AttributeGetPieceLimit.Int(int(request.Limit)))
defer span.End()
// when the CDN returns base.Code_CDNTaskNotFound, report it to the scheduler and wait for the CDN to download it
retry:
pt.Debugf("try get piece task from peer %s, piece num: %d, limit: %d\"", peer.PeerId, request.StartNum, request.Limit)
p, err := pt.getPieceTasks(span, curPeerPacket, peer, request)
if err == nil {
pt.Infof("got piece task from peer %s ok, pieces length: %d", peer.PeerId, len(p.PieceInfos))
span.SetAttributes(config.AttributeGetPieceCount.Int(len(p.PieceInfos)))
return p, nil
}
span.RecordError(err)
if err == errPeerPacketChanged {
return nil, err
}
pt.Debugf("get piece task error: %#v", err)
// grpc error
if se, ok := err.(interface{ GRPCStatus() *status.Status }); ok {
pt.Debugf("get piece task with grpc error, code: %d", se.GRPCStatus().Code())
// context canceled, just exit
if se.GRPCStatus().Code() == codes.Canceled {
span.AddEvent("context canceled")
pt.Warnf("get piece task from peer %s canceled: %s", peer.PeerId, err)
return nil, err
}
}
code := base.Code_ClientPieceRequestFail
// not grpc error
if de, ok := err.(*dferrors.DfError); ok && uint32(de.Code) > uint32(codes.Unauthenticated) {
pt.Debugf("get piece task from peer %s with df error, code: %d", peer.PeerId, de.Code)
code = de.Code
}
pt.Errorf("get piece task from peer %s error: %s, code: %d", peer.PeerId, err, code)
perr := pt.peerPacketStream.Send(&scheduler.PieceResult{
TaskId: pt.taskID,
SrcPid: pt.peerID,
DstPid: peer.PeerId,
PieceInfo: &base.PieceInfo{},
Success: false,
Code: code,
HostLoad: nil,
FinishedCount: -1,
})
if perr != nil {
span.RecordError(perr)
pt.Errorf("send piece result error: %s, code to send: %d", err, code)
}
if code == base.Code_CDNTaskNotFound && curPeerPacket == pt.peerPacket.Load().(*scheduler.PeerPacket) {
span.AddEvent("retry for CDNTaskNotFound")
goto retry
}
return nil, err
}
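
// getPieceTasks wraps dfclient.GetPieceTasks in retry.Run (the literals 0.05,
// 0.2 and 40 are the retry backoff and attempt parameters). An empty piece
// list is retried after notifying the scheduler with
// Code_ClientWaitPieceReady, and a change of the main peer aborts the retry
// with errPeerPacketChanged.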
func (pt *peerTask) getPieceTasks(span trace.Span, curPeerPacket *scheduler.PeerPacket, peer *scheduler.PeerPacket_DestPeer, request *base.PieceTaskRequest) (*base.PiecePacket, error) {
var (
peerPacketChanged bool
count int
)
p, _, err := retry.Run(pt.ctx, func() (interface{}, bool, error) {
pp, getErr := dfclient.GetPieceTasks(pt.ctx, peer, request)
// when GetPieceTasks returns err, exit retry
if getErr != nil {
span.RecordError(getErr)
// fast way to exit retry
lastPeerPacket := pt.peerPacket.Load().(*scheduler.PeerPacket)
if curPeerPacket.MainPeer.PeerId != lastPeerPacket.MainPeer.PeerId {
pt.Warnf("get piece tasks with error: %s, but peer packet changed, switch to new peer packet, current destPeer %s, new destPeer %s", getErr,
curPeerPacket.MainPeer.PeerId, lastPeerPacket.MainPeer.PeerId)
peerPacketChanged = true
return nil, true, nil
}
return nil, true, getErr
}
// by santong: when the peer returns empty pieces, retry later
if len(pp.PieceInfos) == 0 {
count++
er := pt.peerPacketStream.Send(&scheduler.PieceResult{
TaskId: pt.taskID,
SrcPid: pt.peerID,
DstPid: peer.PeerId,
PieceInfo: &base.PieceInfo{},
Success: false,
Code: base.Code_ClientWaitPieceReady,
HostLoad: nil,
FinishedCount: pt.readyPieces.Settled(),
})
if er != nil {
span.RecordError(er)
pt.Errorf("send piece result with base.Code_ClientWaitPieceReady error: %s", er)
}
// fast way to exit retry
lastPeerPacket := pt.peerPacket.Load().(*scheduler.PeerPacket)
if curPeerPacket.MainPeer.PeerId != lastPeerPacket.MainPeer.PeerId {
pt.Warnf("get empty pieces and peer packet changed, switch to new peer packet, current destPeer %s, new destPeer %s",
curPeerPacket.MainPeer.PeerId, lastPeerPacket.MainPeer.PeerId)
peerPacketChanged = true
return nil, true, nil
}
span.AddEvent("retry due to empty pieces",
trace.WithAttributes(config.AttributeGetPieceRetry.Int(count)))
pt.Infof("peer %s returns success but with empty pieces, retry later", peer.PeerId)
return nil, false, dferrors.ErrEmptyValue
}
return pp, false, nil
}, 0.05, 0.2, 40, nil)
if peerPacketChanged {
return nil, errPeerPacketChanged
}
if err == nil {
return p.(*base.PiecePacket), nil
}
return nil, err
}
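
// getNextPieceNum scans forward from cur for the first piece that has not
// been requested yet. When the scan runs past the known total piece count it
// rescans from piece 0, and it returns -1 once every piece is requested or
// the task is completed.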
func (pt *peerTask) getNextPieceNum(cur int32) int32 {
if pt.isCompleted() {
return -1
}
i := cur
for ; pt.requestedPieces.IsSet(i); i++ {
}
if pt.totalPiece > 0 && i >= pt.totalPiece {
// double check: re-scan from the beginning for pieces not yet requested
for i = int32(0); pt.requestedPieces.IsSet(i); i++ {
}
if pt.totalPiece > 0 && i >= pt.totalPiece {
return -1
}
}
return i
}
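
// recoverFromPanic logs a recovered panic together with its stack trace so
// that a panicking goroutine does not crash the whole daemon.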
func (pt *peerTask) recoverFromPanic() {
if r := recover(); r != nil {
pt.Errorf("recovered from panic %q. Call stack:\n%v", r, string(debug.Stack()))
}
}