Scheduler prefers nodes without connection failures.

Signed-off-by: Dong Chen <dongluo.chen@docker.com>
Dong Chen 2016-01-11 11:22:13 -08:00
parent 8cc26f56f2
commit cf664141b6
11 changed files with 137 additions and 44 deletions

View File

@ -209,6 +209,18 @@ func (e *Engine) IsHealthy() bool {
return e.state == stateHealthy
}
// HealthIndicator returns the degree of healthiness of the engine, between 0 and 100.
// 0 means the node is not healthy (its state is unhealthy or pending); 100 means the last connectivity check was successful.
// Values in between indicate recent failures that have not yet moved the engine out of the healthy state.
func (e *Engine) HealthIndicator() int {
e.RLock()
defer e.RUnlock()
if e.state != stateHealthy || e.failureCount >= e.opts.FailureRetry {
return 0
}
return 100 - e.failureCount*100/e.opts.FailureRetry
}
// setState sets engine state
func (e *Engine) setState(state engineState) {
e.Lock()
@ -301,13 +313,13 @@ func (e *Engine) resetFailureCount() {
func (e *Engine) CheckConnectionErr(err error) {
if err == nil {
e.setErrMsg("")
e.resetFailureCount()
// If current state is unhealthy, change it to healthy
if e.state == stateUnhealthy {
log.WithFields(log.Fields{"name": e.Name, "id": e.ID}).Infof("Engine came back to life after %d retries. Hooray!", e.failureCount)
e.emitEvent("engine_reconnect")
e.setState(stateHealthy)
}
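// Note: the failure count is now reset only after the log line above, so the reconnect message still reports the pre-reset retry count.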
e.resetFailureCount()
return
}
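To make the indicator's behavior concrete, here is a small self-contained sketch of the same formula outside the Engine type; the FailureRetry value of 3 is only an assumed example, not necessarily the configured value:

package main

import "fmt"

// healthIndicator mirrors Engine.HealthIndicator for a healthy engine:
// 0 once the failure count reaches the retry limit, 100 with no recent
// failures, and a proportionally reduced value in between.
func healthIndicator(failureCount, failureRetry int) int {
	if failureCount >= failureRetry {
		return 0
	}
	return 100 - failureCount*100/failureRetry
}

func main() {
	const failureRetry = 3 // assumed value for illustration only
	for failures := 0; failures <= failureRetry; failures++ {
		fmt.Printf("failures=%d -> indicator=%d\n", failures, healthIndicator(failures, failureRetry))
	}
	// Prints: 100, 67, 34, 0
}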

View File

@ -84,6 +84,18 @@ func TestEngineFailureCount(t *testing.T) {
assert.True(t, engine.failureCount == 0)
}
func TestHealthIndicator(t *testing.T) {
engine := NewEngine("test", 0, engOpts)
assert.True(t, engine.state == statePending)
assert.True(t, engine.HealthIndicator() == 0)
engine.setState(stateUnhealthy)
assert.True(t, engine.HealthIndicator() == 0)
engine.setState(stateHealthy)
assert.True(t, engine.HealthIndicator() == 100)
engine.incFailureCount()
assert.True(t, engine.HealthIndicator() == 100-100/engine.opts.FailureRetry)
}
func TestEngineConnectionFailure(t *testing.T) {
engine := NewEngine("test", 0, engOpts)
assert.False(t, engine.isConnected())

View File

@ -30,7 +30,7 @@ func TestApplyFilters(t *testing.T) {
Id: "image-0-id",
RepoTags: []string{"image-0:tag1", "image-0:tag2"},
}}},
IsHealthy: true,
HealthIndicator: 100,
},
{
ID: "node-1-id",
@ -54,7 +54,7 @@ func TestApplyFilters(t *testing.T) {
Id: "image-1-id",
RepoTags: []string{"image-1:tag1", "image-0:tag3", "image-1:tag2"},
}}},
IsHealthy: false,
HealthIndicator: 0,
},
}
result []*node.Node

View File

@ -25,7 +25,7 @@ func (f *HealthFilter) Name() string {
func (f *HealthFilter) Filter(_ *cluster.ContainerConfig, nodes []*node.Node, _ bool) ([]*node.Node, error) {
result := []*node.Node{}
for _, node := range nodes {
if node.IsHealthy {
if node.IsHealthy() {
result = append(result, node)
}
}

View File

@ -11,15 +11,15 @@ import (
func testFixturesAllHealthyNode() []*node.Node {
return []*node.Node{
{
ID: "node-0-id",
Name: "node-0-name",
IsHealthy: true,
ID: "node-0-id",
Name: "node-0-name",
HealthIndicator: 100,
},
{
ID: "node-1-id",
Name: "node-1-name",
IsHealthy: true,
ID: "node-1-id",
Name: "node-1-name",
HealthIndicator: 100,
},
}
}
@ -27,15 +27,15 @@ func testFixturesAllHealthyNode() []*node.Node {
func testFixturesPartHealthyNode() []*node.Node {
return []*node.Node{
{
ID: "node-0-id",
Name: "node-0-name",
IsHealthy: false,
ID: "node-0-id",
Name: "node-0-name",
HealthIndicator: 0,
},
{
ID: "node-1-id",
Name: "node-1-name",
IsHealthy: true,
ID: "node-1-id",
Name: "node-1-name",
HealthIndicator: 100,
},
}
}
@ -43,15 +43,15 @@ func testFixturesPartHealthyNode() []*node.Node {
func testFixturesNoHealthyNode() []*node.Node {
return []*node.Node{
{
ID: "node-0-id",
Name: "node-0-name",
IsHealthy: false,
ID: "node-0-id",
Name: "node-0-name",
HealthIndicator: 0,
},
{
ID: "node-1-id",
Name: "node-1-name",
IsHealthy: false,
ID: "node-1-id",
Name: "node-1-name",
HealthIndicator: 0,
},
}
}

View File

@ -21,27 +21,32 @@ type Node struct {
TotalMemory int64
TotalCpus int64
IsHealthy bool
HealthIndicator int64
}
// NewNode creates a node from an engine.
func NewNode(e *cluster.Engine) *Node {
return &Node{
ID: e.ID,
IP: e.IP,
Addr: e.Addr,
Name: e.Name,
Labels: e.Labels,
Containers: e.Containers(),
Images: e.Images(),
UsedMemory: e.UsedMemory(),
UsedCpus: e.UsedCpus(),
TotalMemory: e.TotalMemory(),
TotalCpus: e.TotalCpus(),
IsHealthy: e.IsHealthy(),
ID: e.ID,
IP: e.IP,
Addr: e.Addr,
Name: e.Name,
Labels: e.Labels,
Containers: e.Containers(),
Images: e.Images(),
UsedMemory: e.UsedMemory(),
UsedCpus: e.UsedCpus(),
TotalMemory: e.TotalMemory(),
TotalCpus: e.TotalCpus(),
HealthIndicator: int64(e.HealthIndicator()),
}
}
// IsHealthy returns whether the node is in a healthy state
func (n *Node) IsHealthy() bool {
return n.HealthIndicator > 0
}
// Container returns the container with IDOrName in the engine.
func (n *Node) Container(IDOrName string) *cluster.Container {
return n.Containers.Get(IDOrName)
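A rough, self-contained sketch of how the new field and method fit together with the health filter; fakeNode and filterHealthy below are illustrative stand-ins, not the real node.Node or HealthFilter types:

package main

import "fmt"

// fakeNode stands in for the scheduler node shown above.
type fakeNode struct {
	Name            string
	HealthIndicator int64
}

// IsHealthy mirrors the new method: any positive indicator counts as healthy.
func (n fakeNode) IsHealthy() bool { return n.HealthIndicator > 0 }

// filterHealthy mimics what the health filter does once it calls the method
// instead of reading the old IsHealthy bool field.
func filterHealthy(nodes []fakeNode) []fakeNode {
	result := []fakeNode{}
	for _, n := range nodes {
		if n.IsHealthy() {
			result = append(result, n)
		}
	}
	return result
}

func main() {
	nodes := []fakeNode{
		{Name: "node-0", HealthIndicator: 100}, // no recent failures
		{Name: "node-1", HealthIndicator: 50},  // recent failures, still healthy
		{Name: "node-2", HealthIndicator: 0},   // unhealthy or pending
	}
	// node-2 is dropped; node-1 passes the filter but will be ranked lower
	// by the placement strategies.
	fmt.Println(filterHealthy(nodes))
}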

View File

@ -23,7 +23,10 @@ func (p *BinpackPlacementStrategy) Name() string {
// RankAndSort sorts nodes based on the binpack strategy applied to the container config.
func (p *BinpackPlacementStrategy) RankAndSort(config *cluster.ContainerConfig, nodes []*node.Node) ([]*node.Node, error) {
weightedNodes, err := weighNodes(config, nodes)
// for binpack, a healthy node should increase its weight to increase its chance of being selected
// set healthFactor to 10 to make health degree [0, 100] overpower cpu + memory (each in range [0, 100])
const healthFactor int64 = 10
weightedNodes, err := weighNodes(config, nodes, healthFactor)
if err != nil {
return nil, err
}

View File

@ -14,11 +14,12 @@ func createNode(ID string, memory int64, cpus int64) *node.Node {
oc := 0.05
memory = int64(float64(memory) + float64(memory)*oc)
return &node.Node{
ID: ID,
IP: ID,
Addr: ID,
TotalMemory: memory * 1024 * 1024 * 1024,
TotalCpus: cpus,
ID: ID,
IP: ID,
Addr: ID,
TotalMemory: memory * 1024 * 1024 * 1024,
TotalCpus: cpus,
HealthIndicator: 100,
}
}

View File

@ -23,7 +23,10 @@ func (p *SpreadPlacementStrategy) Name() string {
// RankAndSort sorts nodes based on the spread strategy applied to the container config.
func (p *SpreadPlacementStrategy) RankAndSort(config *cluster.ContainerConfig, nodes []*node.Node) ([]*node.Node, error) {
weightedNodes, err := weighNodes(config, nodes)
// for spread, a healthy node should decrease its weight to increase its chance of being selected
// set healthFactor to -10 to make health degree [0, 100] overpower cpu + memory (each in range [0, 100])
const healthFactor int64 = -10
weightedNodes, err := weighNodes(config, nodes, healthFactor)
if err != nil {
return nil, err
}

View File

@ -36,7 +36,7 @@ func (n weightedNodeList) Less(i, j int) bool {
return ip.Weight < jp.Weight
}
func weighNodes(config *cluster.ContainerConfig, nodes []*node.Node) (weightedNodeList, error) {
func weighNodes(config *cluster.ContainerConfig, nodes []*node.Node, healthinessFactor int64) (weightedNodeList, error) {
weightedNodes := weightedNodeList{}
for _, node := range nodes {
@ -61,7 +61,7 @@ func weighNodes(config *cluster.ContainerConfig, nodes []*node.Node) (weightedNo
}
if cpuScore <= 100 && memoryScore <= 100 {
weightedNodes = append(weightedNodes, &weightedNode{Node: node, Weight: cpuScore + memoryScore})
weightedNodes = append(weightedNodes, &weightedNode{Node: node, Weight: cpuScore + memoryScore + healthinessFactor*node.HealthIndicator})
}
}
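A quick worked example of how the health term dominates the resource scores. The CPU/memory scores and the indicator value of 90 are made-up numbers for illustration; only the -10 and +10 factors come from the strategies above:

package main

import "fmt"

// weight mirrors the expression above: cpuScore + memoryScore + healthFactor*indicator.
func weight(cpuScore, memoryScore, healthFactor, indicator int64) int64 {
	return cpuScore + memoryScore + healthFactor*indicator
}

func main() {
	// Two nodes with identical resource scores; node-0 is fully healthy (100),
	// node-1 has had recent connection failures (assume indicator 90).
	const cpuScore, memScore int64 = 40, 40

	// Spread prefers the node with the lowest weight, so healthFactor is -10:
	fmt.Println(weight(cpuScore, memScore, -10, 100)) // node-0: 80 - 1000 = -920
	fmt.Println(weight(cpuScore, memScore, -10, 90))  // node-1: 80 - 900  = -820
	// node-0 has the lower weight, so spread picks the healthy node.

	// Binpack prefers the node with the highest weight, so healthFactor is +10:
	fmt.Println(weight(cpuScore, memScore, 10, 100)) // node-0: 80 + 1000 = 1080
	fmt.Println(weight(cpuScore, memScore, 10, 90))  // node-1: 80 + 900  = 980
	// node-0 has the higher weight, so binpack also picks the healthy node.
}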

View File

@ -0,0 +1,57 @@
#!/usr/bin/env bats
load ../helpers
function teardown() {
swarm_manage_cleanup
stop_docker
}
@test "scheduler avoids failing node" {
# Start 2 engines.
start_docker 2
# Start swarm and check it can reach both nodes
# Refresh interval is 20s with 20 retries before a node is marked unhealthy, so the refresh loop will not notice the failure during this test
swarm_manage --engine-refresh-min-interval "20s" --engine-refresh-max-interval "20s" --engine-failure-retry 20 "${HOSTS[0]},${HOSTS[1]}"
eval "docker_swarm info | grep -q -i 'Nodes: 2'"
# Use memory on node-0
docker_swarm run -e constraint:node==node-0 -m 50m busybox sh
# Stop node-1
docker_host stop ${DOCKER_CONTAINERS[1]}
# Try to schedule a container. It should first select node-1 and fail
run docker_swarm run -m 10m busybox sh
[ "$status" -ne 0 ]
[[ "${lines[0]}" == *"Cannot connect to the docker engine endpoint"* ]]
# Try to run it again. It should now select node-0 and succeed
run docker_swarm run -m 10m busybox sh
[ "$status" -eq 0 ]
}
@test "refresh loop detects failure" {
# Start 2 engines.
start_docker 2
# Start swarm and check it can reach both nodes
# Refresh interval is 1s with 20 retries before a node is marked unhealthy, so the refresh loop detects the failure within a few seconds
swarm_manage --engine-refresh-min-interval "1s" --engine-refresh-max-interval "1s" --engine-failure-retry 20 "${HOSTS[0]},${HOSTS[1]}"
eval "docker_swarm info | grep -q -i 'Nodes: 2'"
# Use memory on node-0
docker_swarm run -e constraint:node==node-0 -m 50m busybox sh
# Stop node-1
docker_host stop ${DOCKER_CONTAINERS[1]}
# Sleep to let the refresh loop detect node-1's failure
sleep 3
# Try to schedule a container. It should select node-0 and succeed
run docker_swarm run -m 10m busybox sh
[ "$status" -eq 0 ]
}