440 lines
14 KiB
Go
440 lines
14 KiB
Go
/*
|
||
* Copyright 2020 The Dragonfly Authors
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*/
|
||
|
||
package rpc
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"sync"
|
||
"time"
|
||
|
||
"d7y.io/dragonfly/v2/internal/dfcodes"
|
||
"d7y.io/dragonfly/v2/internal/dferrors"
|
||
logger "d7y.io/dragonfly/v2/internal/dflog"
|
||
"d7y.io/dragonfly/v2/pkg/basic/dfnet"
|
||
"github.com/pkg/errors"
|
||
"github.com/serialx/hashring"
|
||
"google.golang.org/grpc"
|
||
"google.golang.org/grpc/keepalive"
|
||
)
|
||
|
||
// Default tuning values; newDefaultConnection copies them into every new Connection.
const (
	// defaultGcConnTimeout specifies the timeout for clientConn gc.
	// If the actual execution time exceeds this threshold, a warning will be thrown.
	defaultGcConnTimeout = time.Second

	// defaultGcConnInterval is the initial value of Connection.gcConnInterval.
	defaultGcConnInterval = time.Minute

	// defaultConnExpireTime is the initial value of Connection.connExpireTime.
	defaultConnExpireTime = 120 * time.Second

	// defaultDialTimeout is the initial value of Connection.dialTimeout,
	// which createClient uses as the dial deadline.
	defaultDialTimeout = 10 * time.Second
)
|
||
|
||
// Closer is implemented by anything that can be shut down and may report
// an error while doing so. *Connection satisfies it through its Close method.
type Closer interface {
	// Close releases the underlying resources.
	Close() error
}
|
||
|
||
// ConnStatus is a string label for the state of a Connection.
//
// TODO: flesh out the set of states; no values are defined in this file yet.
type ConnStatus string
|
||
|
||
type Connection struct {
|
||
ctx context.Context
|
||
cancelFun context.CancelFunc
|
||
rwMutex sync.RWMutex
|
||
dialOpts []grpc.DialOption
|
||
key2NodeMap sync.Map // key -> node(many to one)
|
||
node2ClientMap sync.Map // node -> clientConn(one to one)
|
||
accessNodeMap sync.Map // clientConn access time
|
||
connExpireTime time.Duration
|
||
gcConnTimeout time.Duration
|
||
gcConnInterval time.Duration
|
||
dialTimeout time.Duration
|
||
name string
|
||
hashRing *hashring.HashRing // server hash ring
|
||
serverNodes []dfnet.NetAddr
|
||
status ConnStatus
|
||
}
|
||
|
||
func newDefaultConnection(ctx context.Context) *Connection {
|
||
childCtx, cancel := context.WithCancel(ctx)
|
||
return &Connection{
|
||
ctx: childCtx,
|
||
cancelFun: cancel,
|
||
dialOpts: defaultClientOpts,
|
||
connExpireTime: defaultConnExpireTime,
|
||
gcConnTimeout: defaultGcConnTimeout,
|
||
gcConnInterval: defaultGcConnInterval,
|
||
dialTimeout: defaultDialTimeout,
|
||
}
|
||
}
|
||
|
||
var defaultClientOpts = []grpc.DialOption{
|
||
grpc.FailOnNonTempDialError(true),
|
||
grpc.WithBlock(),
|
||
grpc.WithDisableServiceConfig(),
|
||
grpc.WithInitialConnWindowSize(8 * 1024 * 1024),
|
||
grpc.WithInsecure(),
|
||
grpc.WithKeepaliveParams(keepalive.ClientParameters{
|
||
Time: 2 * time.Minute,
|
||
Timeout: 10 * time.Second,
|
||
}),
|
||
grpc.WithStreamInterceptor(streamClientInterceptor),
|
||
grpc.WithUnaryInterceptor(unaryClientInterceptor),
|
||
}
|
||
|
||
type ConnOption interface {
|
||
apply(*Connection)
|
||
}
|
||
|
||
type funcConnOption struct {
|
||
f func(*Connection)
|
||
}
|
||
|
||
func (fdo *funcConnOption) apply(conn *Connection) {
|
||
fdo.f(conn)
|
||
}
|
||
|
||
func newFuncConnOption(f func(option *Connection)) *funcConnOption {
|
||
return &funcConnOption{
|
||
f: f,
|
||
}
|
||
}
|
||
|
||
func WithConnExpireTime(duration time.Duration) ConnOption {
|
||
return newFuncConnOption(func(conn *Connection) {
|
||
conn.connExpireTime = duration
|
||
})
|
||
}
|
||
|
||
func WithDialOption(opts []grpc.DialOption) ConnOption {
|
||
return newFuncConnOption(func(conn *Connection) {
|
||
conn.dialOpts = append(defaultClientOpts, opts...)
|
||
})
|
||
}
|
||
|
||
func WithGcConnTimeout(gcConnTimeout time.Duration) ConnOption {
|
||
return newFuncConnOption(func(conn *Connection) {
|
||
conn.gcConnTimeout = gcConnTimeout
|
||
})
|
||
}
|
||
|
||
func WithGcConnInterval(gcConnInterval time.Duration) ConnOption {
|
||
return newFuncConnOption(func(conn *Connection) {
|
||
conn.gcConnInterval = gcConnInterval
|
||
})
|
||
}
|
||
|
||
func WithDialTimeout(dialTimeout time.Duration) ConnOption {
|
||
return newFuncConnOption(func(conn *Connection) {
|
||
conn.dialTimeout = dialTimeout
|
||
})
|
||
}
|
||
|
||
func NewConnection(ctx context.Context, name string, addrs []dfnet.NetAddr, connOpts []ConnOption) *Connection {
|
||
conn := newDefaultConnection(ctx)
|
||
conn.name = name
|
||
addresses := make([]string, 0, len(addrs))
|
||
for _, addr := range addrs {
|
||
addresses = append(addresses, addr.GetEndpoint())
|
||
}
|
||
conn.hashRing = hashring.New(addresses)
|
||
conn.serverNodes = addrs
|
||
for _, opt := range connOpts {
|
||
opt.apply(conn)
|
||
}
|
||
go conn.startGC()
|
||
return conn
|
||
}
|
||
|
||
func (conn *Connection) CorrectKey2NodeRelation(tmpHashKey, realHashKey string) {
|
||
if tmpHashKey == realHashKey {
|
||
return
|
||
}
|
||
conn.rwMutex.Lock()
|
||
defer conn.rwMutex.Unlock()
|
||
key, _ := conn.key2NodeMap.Load(tmpHashKey)
|
||
serverNode := key.(string)
|
||
conn.key2NodeMap.Store(realHashKey, serverNode)
|
||
conn.key2NodeMap.Delete(tmpHashKey)
|
||
}
|
||
|
||
func (conn *Connection) UpdateAccessNodeMapByHashKey(key string) {
|
||
node, ok := conn.key2NodeMap.Load(key)
|
||
if ok {
|
||
conn.accessNodeMap.Store(node, time.Now())
|
||
logger.With("conn", conn.name).Debugf("successfully update server node %s access time for hashKey %s", node, key)
|
||
_, ok := conn.node2ClientMap.Load(node)
|
||
if !ok {
|
||
logger.With("conn", conn.name).Warnf("successfully update server node %s access time for hashKey %s,"+
|
||
"but cannot found client conn in node2ClientMap", node, key)
|
||
}
|
||
} else {
|
||
logger.With("conn", conn.name).Errorf("update access node map failed, hash key (%s) not found in key2NodeMap", key)
|
||
}
|
||
}
|
||
|
||
func (conn *Connection) UpdateAccessNodeMapByServerNode(serverNode string) {
|
||
conn.accessNodeMap.Store(serverNode, time.Now())
|
||
}
|
||
|
||
func (conn *Connection) AddServerNodes(addrs []dfnet.NetAddr) error {
|
||
conn.rwMutex.Lock()
|
||
defer conn.rwMutex.Unlock()
|
||
for _, addr := range addrs {
|
||
serverNode := addr.GetEndpoint()
|
||
conn.hashRing = conn.hashRing.AddNode(serverNode)
|
||
logger.With("conn", conn.name).Debugf("success add %s to server node list", addr)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// findCandidateClientConn find candidate node client conn other than exclusiveNodes
|
||
func (conn *Connection) findCandidateClientConn(key string, exclusiveNodes ...string) (*candidateClient, error) {
|
||
if node, ok := conn.key2NodeMap.Load(key); ok {
|
||
candidateNode := node.(string)
|
||
selected := true
|
||
for _, exclusiveNode := range exclusiveNodes {
|
||
if exclusiveNode == candidateNode {
|
||
selected = false
|
||
}
|
||
}
|
||
if selected {
|
||
if client, ok := conn.node2ClientMap.Load(node); ok {
|
||
return &candidateClient{
|
||
node: candidateNode,
|
||
Ref: client,
|
||
}, nil
|
||
}
|
||
}
|
||
}
|
||
|
||
ringNodes, ok := conn.hashRing.GetNodes(key, conn.hashRing.Size())
|
||
if !ok {
|
||
logger.Warnf("cannot obtain expected %d server nodes", conn.hashRing.Size())
|
||
}
|
||
if len(ringNodes) == 0 {
|
||
return nil, dferrors.ErrNoCandidateNode
|
||
}
|
||
candidateNodes := make([]string, 0)
|
||
for _, ringNode := range ringNodes {
|
||
candidate := true
|
||
for _, exclusiveNode := range exclusiveNodes {
|
||
if exclusiveNode == ringNode {
|
||
candidate = false
|
||
}
|
||
}
|
||
if candidate {
|
||
candidateNodes = append(candidateNodes, ringNode)
|
||
}
|
||
}
|
||
logger.With("conn", conn.name).Infof("candidate result for hash key %s: all server node list: %v, exclusiveNodes node list: %v, candidate node list: %v",
|
||
key, ringNodes, exclusiveNodes, candidateNodes)
|
||
for _, candidateNode := range candidateNodes {
|
||
// Check whether there is a corresponding mapping client in the node2ClientMap
|
||
// TODO 下面部分可以直接调用loadOrCreate方法,但是日志没有这么调用打印全
|
||
if client, ok := conn.node2ClientMap.Load(candidateNode); ok {
|
||
logger.With("conn", conn.name).Infof("hit cache candidateNode %s for hash key %s", candidateNode, key)
|
||
return &candidateClient{
|
||
node: candidateNode,
|
||
Ref: client,
|
||
}, nil
|
||
}
|
||
logger.With("conn", conn.name).Debugf("attempt to connect candidateNode %s for hash key %s", candidateNode, key)
|
||
clientConn, err := conn.createClient(candidateNode, append(defaultClientOpts, conn.dialOpts...)...)
|
||
if err == nil {
|
||
logger.With("conn", conn.name).Infof("success connect to candidateNode %s for hash key %s", candidateNode, key)
|
||
return &candidateClient{
|
||
node: candidateNode,
|
||
Ref: clientConn,
|
||
}, nil
|
||
}
|
||
|
||
logger.With("conn", conn.name).Infof("failed to connect candidateNode %s for hash key %s: %v", candidateNode, key, err)
|
||
}
|
||
return nil, dferrors.ErrNoCandidateNode
|
||
}
|
||
|
||
// candidateClient pairs a selected server node with the client conn for it,
// as returned by findCandidateClientConn.
type candidateClient struct {
	// node is the endpoint of the selected server node.
	node string
	// Ref holds the client conn; callers in this file assert it to *grpc.ClientConn.
	Ref interface{}
}
|
||
|
||
func (conn *Connection) createClient(target string, opts ...grpc.DialOption) (*grpc.ClientConn, error) {
|
||
// should not retry
|
||
ctx, cancel := context.WithTimeout(context.Background(), conn.dialTimeout)
|
||
defer cancel()
|
||
return grpc.DialContext(ctx, target, opts...)
|
||
}
|
||
|
||
// GetServerNode
|
||
func (conn *Connection) GetServerNode(hashKey string) (string, bool) {
|
||
conn.rwMutex.RLock()
|
||
defer conn.rwMutex.RUnlock()
|
||
node, ok := conn.key2NodeMap.Load(hashKey)
|
||
serverNode := node.(string)
|
||
if ok {
|
||
return serverNode, true
|
||
}
|
||
return "unknown", false
|
||
}
|
||
|
||
func (conn *Connection) GetClientConnByTarget(node string) (*grpc.ClientConn, error) {
|
||
logger.With("conn", conn.name).Debugf("start to get client conn by target %s", node)
|
||
conn.rwMutex.RLock()
|
||
defer conn.rwMutex.RUnlock()
|
||
clientConn, err := conn.loadOrCreateClientConnByNode(node)
|
||
if err != nil {
|
||
return nil, errors.Wrapf(err, "get client conn by conn %s", node)
|
||
}
|
||
logger.With("conn", conn.name).Debugf("successfully get %s client conn", node)
|
||
return clientConn, nil
|
||
}
|
||
|
||
func (conn *Connection) loadOrCreateClientConnByNode(node string) (clientConn *grpc.ClientConn, err error) {
|
||
defer func() {
|
||
if desc := recover(); desc != nil {
|
||
err = errors.Errorf("%v", desc)
|
||
}
|
||
}()
|
||
conn.accessNodeMap.Store(node, time.Now())
|
||
client, ok := conn.node2ClientMap.Load(node)
|
||
if ok {
|
||
logger.With("conn", conn.name).Debugf("hit cache clientConn associated with node %s", node)
|
||
return client.(*grpc.ClientConn), nil
|
||
}
|
||
|
||
logger.With("conn", conn.name).Debugf("failed to load clientConn associated with node %s, attempt to create it", node)
|
||
clientConn, err = conn.createClient(node, append(defaultClientOpts, conn.dialOpts...)...)
|
||
if err == nil {
|
||
logger.With("conn", conn.name).Infof("success connect to node %s", node)
|
||
// bind
|
||
conn.node2ClientMap.Store(node, clientConn)
|
||
return clientConn, nil
|
||
}
|
||
|
||
return nil, errors.Wrapf(err, "cannot found clientConn associated with node %s and create client conn failed", node)
|
||
}
|
||
|
||
// GetClientConn get conn or bind hashKey to candidate node, don't do the migrate action
|
||
// stick whether hash key need already associated with specify node
|
||
func (conn *Connection) GetClientConn(hashKey string, stick bool) (*grpc.ClientConn, error) {
|
||
logger.With("conn", conn.name).Debugf("start to get client conn hashKey %s, stick %t", hashKey, stick)
|
||
conn.rwMutex.RLock()
|
||
node, ok := conn.key2NodeMap.Load(hashKey)
|
||
if stick && !ok {
|
||
conn.rwMutex.RUnlock()
|
||
// if request is stateful, hash key must exist in key2NodeMap
|
||
return nil, fmt.Errorf("it is a stateful request, but cannot find hash key(%s) in key2NodeMap", hashKey)
|
||
}
|
||
if ok {
|
||
// if exist
|
||
serverNode := node.(string)
|
||
clientConn, err := conn.loadOrCreateClientConnByNode(serverNode)
|
||
conn.rwMutex.RUnlock()
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return clientConn, nil
|
||
}
|
||
logger.With("conn", conn.name).Infof("no server node associated with hash key %s was found, start find candidate", hashKey)
|
||
conn.rwMutex.RUnlock()
|
||
// if absence
|
||
conn.rwMutex.Lock()
|
||
defer conn.rwMutex.Unlock()
|
||
client, err := conn.findCandidateClientConn(hashKey)
|
||
if err != nil {
|
||
return nil, errors.Wrapf(err, "prob candidate client conn for hash key %s", hashKey)
|
||
}
|
||
conn.key2NodeMap.Store(hashKey, client.node)
|
||
conn.node2ClientMap.Store(client.node, client.Ref)
|
||
conn.accessNodeMap.Store(client.node, time.Now())
|
||
return client.Ref.(*grpc.ClientConn), nil
|
||
}
|
||
|
||
// TryMigrate migrate key to another hash node other than exclusiveNodes
|
||
// preNode node before the migration
|
||
func (conn *Connection) TryMigrate(key string, cause error, exclusiveNodes []string) (preNode string, err error) {
|
||
logger.With("conn", conn.name).Infof("start try migrate server node for key %s, cause err: %v", key, cause)
|
||
// TODO recover findCandidateClientConn error
|
||
if e, ok := cause.(*dferrors.DfError); ok {
|
||
if e.Code != dfcodes.ResourceLacked && e.Code != dfcodes.UnknownError {
|
||
return "", cause
|
||
}
|
||
}
|
||
currentNode := ""
|
||
conn.rwMutex.RLock()
|
||
if currentNode, ok := conn.key2NodeMap.Load(key); ok {
|
||
preNode = currentNode.(string)
|
||
exclusiveNodes = append(exclusiveNodes, currentNode.(string))
|
||
} else {
|
||
logger.With("conn", conn.name).Warnf("failed to find server node for hash key %s", key)
|
||
}
|
||
conn.rwMutex.RUnlock()
|
||
conn.rwMutex.Lock()
|
||
defer conn.rwMutex.Unlock()
|
||
client, err := conn.findCandidateClientConn(key, exclusiveNodes...)
|
||
if err != nil {
|
||
return "", errors.Wrapf(err, "find candidate client conn for hash key %s", key)
|
||
}
|
||
logger.With("conn", conn.name).Infof("successfully migrate hash key %s from server node %s to %s", key, currentNode, client.node)
|
||
conn.key2NodeMap.Store(key, client.node)
|
||
conn.node2ClientMap.Store(client.node, client.Ref)
|
||
conn.accessNodeMap.Store(client.node, time.Now())
|
||
return
|
||
}
|
||
|
||
func (conn *Connection) Close() error {
|
||
conn.rwMutex.Lock()
|
||
defer conn.rwMutex.Unlock()
|
||
for i := range conn.serverNodes {
|
||
serverNode := conn.serverNodes[i].GetEndpoint()
|
||
conn.hashRing.RemoveNode(serverNode)
|
||
value, ok := conn.node2ClientMap.Load(serverNode)
|
||
if ok {
|
||
clientCon := value.(*grpc.ClientConn)
|
||
err := clientCon.Close()
|
||
if err == nil {
|
||
conn.node2ClientMap.Delete(serverNode)
|
||
} else {
|
||
logger.GrpcLogger.With("conn", conn.name).Warnf("failed to close clientConn: %s: %v", serverNode, err)
|
||
}
|
||
}
|
||
// gc hash keys
|
||
conn.key2NodeMap.Range(func(key, value interface{}) bool {
|
||
if value == serverNode {
|
||
conn.key2NodeMap.Delete(key)
|
||
logger.GrpcLogger.With("conn", conn.name).Infof("success gc key: %s associated with server node %s", key, serverNode)
|
||
}
|
||
return true
|
||
})
|
||
conn.accessNodeMap.Delete(serverNode)
|
||
}
|
||
conn.cancelFun()
|
||
return nil
|
||
}
|
||
|
||
func (conn *Connection) UpdateState(addrs []dfnet.NetAddr) {
|
||
var addresses []string
|
||
for _, addr := range addrs {
|
||
addresses = append(addresses, addr.GetEndpoint())
|
||
}
|
||
conn.rwMutex.Lock()
|
||
defer conn.rwMutex.Unlock()
|
||
conn.serverNodes = addrs
|
||
conn.hashRing = hashring.New(addresses)
|
||
}
|