Scheduler prefers nodes without connection failures.
Signed-off-by: Dong Chen <dongluo.chen@docker.com>
parent 8cc26f56f2
commit cf664141b6

@@ -209,6 +209,18 @@ func (e *Engine) IsHealthy() bool {
 	return e.state == stateHealthy
 }
 
+// HealthIndicator returns the degree of healthiness between 0 and 100.
+// 0 means the node is not healthy (unhealthy or pending); 100 means the last connectivity check succeeded;
+// other values indicate recent failures that have not yet moved the engine out of the healthy state.
+func (e *Engine) HealthIndicator() int {
+	e.RLock()
+	defer e.RUnlock()
+	if e.state != stateHealthy || e.failureCount >= e.opts.FailureRetry {
+		return 0
+	}
+	return 100 - e.failureCount*100/e.opts.FailureRetry
+}
+
 // setState sets engine state
 func (e *Engine) setState(state engineState) {
 	e.Lock()

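For concreteness, the formula above degrades linearly with consecutive failures. A standalone sketch of the arithmetic (the helper below is illustrative, not part of the diff; the FailureRetry value of 20 matches the integration tests at the end of this commit):

package main

import "fmt"

// healthIndicator mirrors Engine.HealthIndicator above, with the lock and
// engine state bookkeeping stripped out (illustrative only).
func healthIndicator(healthy bool, failureCount, failureRetry int) int {
	if !healthy || failureCount >= failureRetry {
		return 0
	}
	return 100 - failureCount*100/failureRetry
}

func main() {
	fmt.Println(healthIndicator(true, 0, 20))  // 100: no recent failures
	fmt.Println(healthIndicator(true, 3, 20))  // 85: 100 - 3*100/20
	fmt.Println(healthIndicator(true, 20, 20)) // 0: failureCount reached FailureRetry
	fmt.Println(healthIndicator(false, 0, 20)) // 0: unhealthy or pending state
}
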
@@ -301,13 +313,13 @@ func (e *Engine) resetFailureCount() {
 func (e *Engine) CheckConnectionErr(err error) {
 	if err == nil {
 		e.setErrMsg("")
-		e.resetFailureCount()
 		// If current state is unhealthy, change it to healthy
 		if e.state == stateUnhealthy {
 			log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", e.failureCount)
 			e.emitEvent("engine_reconnect")
 			e.setState(stateHealthy)
 		}
+		e.resetFailureCount()
 		return
 	}

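The moved resetFailureCount() call matters for the log line: resetting only after logging lets the reconnect message report the real retry count. A toy model of that success path (not the swarm implementation; the engine type here is invented for illustration):

package main

import "fmt"

// engine is a stand-in for cluster.Engine's health bookkeeping.
type engine struct {
	unhealthy    bool
	failureCount int
}

func (e *engine) checkConnectionErr(err error) {
	if err == nil {
		if e.unhealthy {
			fmt.Printf("Engine came back to life after %d retries. Hooray!\n", e.failureCount)
			e.unhealthy = false
		}
		e.failureCount = 0 // reset only after logging, so the count above is accurate
		return
	}
	e.failureCount++
}

func main() {
	e := &engine{unhealthy: true, failureCount: 7}
	e.checkConnectionErr(nil) // prints: Engine came back to life after 7 retries. Hooray!
}
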
@@ -84,6 +84,18 @@ func TestEngineFailureCount(t *testing.T) {
 	assert.True(t, engine.failureCount == 0)
 }
 
+func TestHealthIndicator(t *testing.T) {
+	engine := NewEngine("test", 0, engOpts)
+	assert.True(t, engine.state == statePending)
+	assert.True(t, engine.HealthIndicator() == 0)
+	engine.setState(stateUnhealthy)
+	assert.True(t, engine.HealthIndicator() == 0)
+	engine.setState(stateHealthy)
+	assert.True(t, engine.HealthIndicator() == 100)
+	engine.incFailureCount()
+	assert.True(t, engine.HealthIndicator() == 100-100/engine.opts.FailureRetry)
+}
+
 func TestEngineConnectionFailure(t *testing.T) {
 	engine := NewEngine("test", 0, engOpts)
 	assert.False(t, engine.isConnected())

@@ -30,7 +30,7 @@ func TestApplyFilters(t *testing.T) {
 				Id:       "image-0-id",
 				RepoTags: []string{"image-0:tag1", "image-0:tag2"},
 			}}},
-			IsHealthy: true,
+			HealthIndicator: 100,
 		},
 		{
 			ID: "node-1-id",

@@ -54,7 +54,7 @@ func TestApplyFilters(t *testing.T) {
 				Id:       "image-1-id",
 				RepoTags: []string{"image-1:tag1", "image-0:tag3", "image-1:tag2"},
 			}}},
-			IsHealthy: false,
+			HealthIndicator: 0,
 		},
 	}
 	result []*node.Node

@@ -25,7 +25,7 @@ func (f *HealthFilter) Name() string {
 func (f *HealthFilter) Filter(_ *cluster.ContainerConfig, nodes []*node.Node, _ bool) ([]*node.Node, error) {
 	result := []*node.Node{}
 	for _, node := range nodes {
-		if node.IsHealthy {
+		if node.IsHealthy() {
 			result = append(result, node)
 		}
 	}

@@ -11,15 +11,15 @@ import (
 func testFixturesAllHealthyNode() []*node.Node {
 	return []*node.Node{
 		{
-			ID:        "node-0-id",
-			Name:      "node-0-name",
-			IsHealthy: true,
+			ID:              "node-0-id",
+			Name:            "node-0-name",
+			HealthIndicator: 100,
 		},
 		{
-			ID:        "node-1-id",
-			Name:      "node-1-name",
-			IsHealthy: true,
+			ID:              "node-1-id",
+			Name:            "node-1-name",
+			HealthIndicator: 100,
 		},
 	}
 }

@@ -27,15 +27,15 @@ func testFixturesAllHealthyNode() []*node.Node {
 func testFixturesPartHealthyNode() []*node.Node {
 	return []*node.Node{
 		{
-			ID:        "node-0-id",
-			Name:      "node-0-name",
-			IsHealthy: false,
+			ID:              "node-0-id",
+			Name:            "node-0-name",
+			HealthIndicator: 0,
 		},
 		{
-			ID:        "node-1-id",
-			Name:      "node-1-name",
-			IsHealthy: true,
+			ID:              "node-1-id",
+			Name:            "node-1-name",
+			HealthIndicator: 100,
 		},
 	}
 }

@@ -43,15 +43,15 @@ func testFixturesPartHealthyNode() []*node.Node {
 func testFixturesNoHealthyNode() []*node.Node {
 	return []*node.Node{
 		{
-			ID:        "node-0-id",
-			Name:      "node-0-name",
-			IsHealthy: false,
+			ID:              "node-0-id",
+			Name:            "node-0-name",
+			HealthIndicator: 0,
 		},
 		{
-			ID:        "node-1-id",
-			Name:      "node-1-name",
-			IsHealthy: false,
+			ID:              "node-1-id",
+			Name:            "node-1-name",
+			HealthIndicator: 0,
 		},
 	}
 }

@@ -21,27 +21,32 @@ type Node struct {
 	TotalMemory int64
 	TotalCpus   int64
 
-	IsHealthy bool
+	HealthIndicator int64
 }
 
 // NewNode creates a node from an engine.
 func NewNode(e *cluster.Engine) *Node {
 	return &Node{
-		ID:          e.ID,
-		IP:          e.IP,
-		Addr:        e.Addr,
-		Name:        e.Name,
-		Labels:      e.Labels,
-		Containers:  e.Containers(),
-		Images:      e.Images(),
-		UsedMemory:  e.UsedMemory(),
-		UsedCpus:    e.UsedCpus(),
-		TotalMemory: e.TotalMemory(),
-		TotalCpus:   e.TotalCpus(),
-		IsHealthy:   e.IsHealthy(),
+		ID:              e.ID,
+		IP:              e.IP,
+		Addr:            e.Addr,
+		Name:            e.Name,
+		Labels:          e.Labels,
+		Containers:      e.Containers(),
+		Images:          e.Images(),
+		UsedMemory:      e.UsedMemory(),
+		UsedCpus:        e.UsedCpus(),
+		TotalMemory:     e.TotalMemory(),
+		TotalCpus:       e.TotalCpus(),
+		HealthIndicator: int64(e.HealthIndicator()),
 	}
 }
 
+// IsHealthy returns whether the node is in a healthy state.
+func (n *Node) IsHealthy() bool {
+	return n.HealthIndicator > 0
+}
+
 // Container returns the container with IDOrName in the engine.
 func (n *Node) Container(IDOrName string) *cluster.Container {
 	return n.Containers.Get(IDOrName)

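IsHealthy is now derived from the indicator: any strictly positive value counts as healthy, so a degraded engine (say, indicator 85) still passes the health filter above while ranking worse than a fully healthy one in the strategies below. A minimal standalone sketch of that threshold semantics:

package main

import "fmt"

// Node mirrors just the health-related part of the scheduler's node.Node.
type Node struct {
	Name            string
	HealthIndicator int64
}

// IsHealthy matches the diff above: healthy means "indicator above zero".
func (n *Node) IsHealthy() bool {
	return n.HealthIndicator > 0
}

func main() {
	for _, n := range []*Node{
		{Name: "node-0", HealthIndicator: 100}, // fully healthy
		{Name: "node-1", HealthIndicator: 85},  // degraded but still schedulable
		{Name: "node-2", HealthIndicator: 0},   // filtered out
	} {
		fmt.Println(n.Name, n.IsHealthy())
	}
}
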
@@ -23,7 +23,10 @@ func (p *BinpackPlacementStrategy) Name() string {
 
 // RankAndSort sorts nodes based on the binpack strategy applied to the container config.
 func (p *BinpackPlacementStrategy) RankAndSort(config *cluster.ContainerConfig, nodes []*node.Node) ([]*node.Node, error) {
-	weightedNodes, err := weighNodes(config, nodes)
+	// for binpack, a healthy node should increase its weight to increase its chance of being selected
+	// set healthFactor to 10 to make health degree [0, 100] overpower cpu + memory (each in range [0, 100])
+	const healthFactor int64 = 10
+	weightedNodes, err := weighNodes(config, nodes, healthFactor)
 	if err != nil {
 		return nil, err
 	}

@@ -14,11 +14,12 @@ func createNode(ID string, memory int64, cpus int64) *node.Node {
 	oc := 0.05
 	memory = int64(float64(memory) + float64(memory)*oc)
 	return &node.Node{
-		ID:          ID,
-		IP:          ID,
-		Addr:        ID,
-		TotalMemory: memory * 1024 * 1024 * 1024,
-		TotalCpus:   cpus,
+		ID:              ID,
+		IP:              ID,
+		Addr:            ID,
+		TotalMemory:     memory * 1024 * 1024 * 1024,
+		TotalCpus:       cpus,
+		HealthIndicator: 100,
 	}
 }

@@ -23,7 +23,10 @@ func (p *SpreadPlacementStrategy) Name() string {
 
 // RankAndSort sorts nodes based on the spread strategy applied to the container config.
 func (p *SpreadPlacementStrategy) RankAndSort(config *cluster.ContainerConfig, nodes []*node.Node) ([]*node.Node, error) {
-	weightedNodes, err := weighNodes(config, nodes)
+	// for spread, a healthy node should decrease its weight to increase its chance of being selected
+	// set healthFactor to -10 to make health degree [0, 100] overpower cpu + memory (each in range [0, 100])
+	const healthFactor int64 = -10
+	weightedNodes, err := weighNodes(config, nodes, healthFactor)
 	if err != nil {
 		return nil, err
 	}

@@ -36,7 +36,7 @@ func (n weightedNodeList) Less(i, j int) bool {
 	return ip.Weight < jp.Weight
 }
 
-func weighNodes(config *cluster.ContainerConfig, nodes []*node.Node) (weightedNodeList, error) {
+func weighNodes(config *cluster.ContainerConfig, nodes []*node.Node, healthinessFactor int64) (weightedNodeList, error) {
 	weightedNodes := weightedNodeList{}
 
 	for _, node := range nodes {

@@ -61,7 +61,7 @@ func weighNodes(config *cluster.ContainerConfig, nodes []*node.Node) (weightedNodeList, error) {
 		}
 
 		if cpuScore <= 100 && memoryScore <= 100 {
-			weightedNodes = append(weightedNodes, &weightedNode{Node: node, Weight: cpuScore + memoryScore})
+			weightedNodes = append(weightedNodes, &weightedNode{Node: node, Weight: cpuScore + memoryScore + healthinessFactor*node.HealthIndicator})
 		}
 	}

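Since cpuScore and memoryScore each lie in [0, 100], a health factor of ±10 scales the [0, 100] indicator into ±[0, 1000], which is why health dominates the resource scores. A worked example with invented scores (binpack selects the highest weight, spread the lowest):

package main

import "fmt"

// weight mirrors the expression in weighNodes above:
// cpuScore + memoryScore + healthinessFactor*healthIndicator.
func weight(cpuScore, memoryScore, healthFactor, healthIndicator int64) int64 {
	return cpuScore + memoryScore + healthFactor*healthIndicator
}

func main() {
	// Invented scores: node A is busier but fully healthy; node B is idle
	// but has 5 recent failures (indicator 75 with FailureRetry = 20).
	const binpack, spread = int64(10), int64(-10)

	fmt.Println(weight(80, 60, binpack, 100)) // A: 140 + 1000 = 1140
	fmt.Println(weight(10, 10, binpack, 75))  // B:  20 + 750  = 770
	// Binpack picks the highest weight -> healthy node A wins despite its load.

	fmt.Println(weight(80, 60, spread, 100)) // A: 140 - 1000 = -860
	fmt.Println(weight(10, 10, spread, 75))  // B:  20 - 750  = -730
	// Spread picks the lowest weight -> healthy node A wins again.
}
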
@@ -0,0 +1,57 @@
+#!/usr/bin/env bats
+
+load ../helpers
+
+function teardown() {
+	swarm_manage_cleanup
+	stop_docker
+}
+
+@test "scheduler avoids failing node" {
+	# Start 2 engines and register them in the file.
+	start_docker 2
+	# Start swarm and check it can reach the nodes.
+	# Refresh interval is 20s; 20 retries before marking a node as unhealthy.
+	swarm_manage --engine-refresh-min-interval "20s" --engine-refresh-max-interval "20s" --engine-failure-retry 20 "${HOSTS[0]},${HOSTS[1]}"
+
+	eval "docker_swarm info | grep -q -i 'Nodes: 2'"
+
+	# Use memory on node-0
+	docker_swarm run -e constraint:node==node-0 -m 50m busybox sh
+
+	# Stop node-1
+	docker_host stop ${DOCKER_CONTAINERS[1]}
+
+	# Try to schedule a container. It should first select node-1 and fail.
+	run docker_swarm run -m 10m busybox sh
+	[ "$status" -ne 0 ]
+	[[ "${lines[0]}" == *"Cannot connect to the docker engine endpoint"* ]]
+
+	# Try to run it again. It should select node-0 and succeed.
+	run docker_swarm run -m 10m busybox sh
+	[ "$status" -eq 0 ]
+}
+
+@test "refresh loop detects failure" {
+	# Start 2 engines and register them in the file.
+	start_docker 2
+	# Start swarm and check it can reach the nodes.
+	# Refresh interval is 1s; 20 retries before marking a node as unhealthy.
+	swarm_manage --engine-refresh-min-interval "1s" --engine-refresh-max-interval "1s" --engine-failure-retry 20 "${HOSTS[0]},${HOSTS[1]}"
+
+	eval "docker_swarm info | grep -q -i 'Nodes: 2'"
+
+	# Use memory on node-0
+	docker_swarm run -e constraint:node==node-0 -m 50m busybox sh
+
+	# Stop node-1
+	docker_host stop ${DOCKER_CONTAINERS[1]}
+
+	# Sleep to let the refresh loop detect node-1's failure.
+	sleep 3
+
+	# Try to schedule a container. It should select node-0 and succeed.
+	run docker_swarm run -m 10m busybox sh
+	[ "$status" -eq 0 ]
+}