mirror of https://github.com/docker/docs.git
Merge pull request #1578 from aluzzardi/rescheduling
[experimental] Simple container rescheduling on node failure
commit e1213384bc
@@ -438,8 +438,14 @@ func postContainersCreate(c *context, w http.ResponseWriter, r *http.Request) {
 		authConfig = &dockerclient.AuthConfig{}
 		json.Unmarshal(buf, authConfig)
 	}
 
-	container, err := c.cluster.CreateContainer(cluster.BuildContainerConfig(config), name, authConfig)
+	containerConfig := cluster.BuildContainerConfig(config)
+	if err := containerConfig.Validate(); err != nil {
+		httpError(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	container, err := c.cluster.CreateContainer(containerConfig, name, authConfig)
 	if err != nil {
 		if strings.HasPrefix(err.Error(), "Conflict") {
 			httpError(w, err.Error(), http.StatusConflict)

@@ -321,5 +321,6 @@ func manage(c *cli.Context) {
 		server.SetHandler(api.NewPrimary(cl, tlsConfig, &statusHandler{cl, nil, nil}, c.GlobalBool("debug"), c.Bool("cors")))
 	}
 
+	cluster.NewWatchdog(cl)
 	log.Fatal(server.ListenAndServe())
 }

@@ -83,6 +83,9 @@ type Cluster interface {
 	// Register an event handler for cluster-wide events.
 	RegisterEventHandler(h EventHandler) error
 
+	// Unregister an event handler.
+	UnregisterEventHandler(h EventHandler)
+
 	// FIXME: remove this method
 	// Return a random engine
 	RANDOMENGINE() (*Engine, error)

@@ -2,6 +2,8 @@ package cluster
 
 import (
 	"encoding/json"
+	"errors"
+	"fmt"
 	"strings"
 
 	"github.com/samalba/dockerclient"

@@ -63,9 +65,10 @@ func consolidateResourceFields(c *dockerclient.ContainerConfig) {
 // BuildContainerConfig creates a cluster.ContainerConfig from a dockerclient.ContainerConfig
 func BuildContainerConfig(c dockerclient.ContainerConfig) *ContainerConfig {
 	var (
-		affinities  []string
-		constraints []string
-		env         []string
+		affinities         []string
+		constraints        []string
+		reschedulePolicies []string
+		env                []string
 	)
 
 	// only for tests

@@ -83,12 +86,19 @@ func BuildContainerConfig(c dockerclient.ContainerConfig) *ContainerConfig {
 		json.Unmarshal([]byte(labels), &constraints)
 	}
 
-	// parse affinities/constraints from env (ex. docker run -e affinity:container==redis -e affinity:image==nginx -e constraint:region==us-east -e constraint:storage==ssd)
+	// parse reschedule policy from labels (ex. docker run --label 'com.docker.swarm.reschedule-policies=on-node-failure')
+	if labels, ok := c.Labels[SwarmLabelNamespace+".reschedule-policies"]; ok {
+		json.Unmarshal([]byte(labels), &reschedulePolicies)
+	}
+
+	// parse affinities/constraints/reschedule policies from env (ex. docker run -e affinity:container==redis -e affinity:image==nginx -e constraint:region==us-east -e constraint:storage==ssd -e reschedule:off)
 	for _, e := range c.Env {
 		if ok, key, value := parseEnv(e); ok && key == "affinity" {
 			affinities = append(affinities, value)
 		} else if ok && key == "constraint" {
 			constraints = append(constraints, value)
+		} else if ok && key == "reschedule" {
+			reschedulePolicies = append(reschedulePolicies, value)
 		} else {
 			env = append(env, e)
 		}

@@ -111,6 +121,13 @@ func BuildContainerConfig(c dockerclient.ContainerConfig) *ContainerConfig {
 		}
 	}
 
+	// store reschedule policies in labels
+	if len(reschedulePolicies) > 0 {
+		if labels, err := json.Marshal(reschedulePolicies); err == nil {
+			c.Labels[SwarmLabelNamespace+".reschedule-policies"] = string(labels)
+		}
+	}
+
 	consolidateResourceFields(&c)
 
 	return &ContainerConfig{c}

@@ -186,3 +203,33 @@ func (c *ContainerConfig) HaveNodeConstraint() bool {
 	}
 	return false
 }
+
+// HasReschedulePolicy returns true if the specified policy is part of the config
+func (c *ContainerConfig) HasReschedulePolicy(p string) bool {
+	for _, reschedulePolicy := range c.extractExprs("reschedule-policies") {
+		if reschedulePolicy == p {
+			return true
+		}
+	}
+	return false
+}
+
+// Validate returns an error if the config isn't valid
+func (c *ContainerConfig) Validate() error {
+	//TODO: add validation for affinities and constraints
+	reschedulePolicies := c.extractExprs("reschedule-policies")
+	if len(reschedulePolicies) > 1 {
+		return errors.New("too many reschedule policies")
+	} else if len(reschedulePolicies) == 1 {
+		valid := false
+		for _, validReschedulePolicy := range []string{"off", "on-node-failure"} {
+			if reschedulePolicies[0] == validReschedulePolicy {
+				valid = true
+			}
+		}
+		if !valid {
+			return fmt.Errorf("invalid reschedule policy: %s", reschedulePolicies[0])
+		}
+	}
+	return nil
+}

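For context, here is a minimal standalone sketch (not part of this commit) of how a reschedule policy flows through `BuildContainerConfig`, `Validate`, and `HasReschedulePolicy` as defined above. The import path and the `main` wrapper are assumptions made only for illustration:

```
package main

import (
	"fmt"

	"github.com/docker/swarm/cluster" // assumed import path for the package shown above
	"github.com/samalba/dockerclient"
)

func main() {
	// Equivalent of `docker run -e reschedule:on-node-failure redis`:
	// the reschedule policy is parsed out of the environment and stored
	// under the com.docker.swarm.reschedule-policies label.
	config := cluster.BuildContainerConfig(dockerclient.ContainerConfig{
		Image: "redis",
		Env:   []string{"reschedule:on-node-failure"},
	})

	// Validate rejects unknown policies and multiple conflicting policies.
	if err := config.Validate(); err != nil {
		fmt.Println("invalid config:", err)
		return
	}

	fmt.Println(config.HasReschedulePolicy("on-node-failure")) // true
}
```
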
@@ -21,6 +21,11 @@ func (c *Container) Refresh() (*Container, error) {
 	return c.Engine.refreshContainer(c.Id, true)
 }
 
+// Start a container
+func (c *Container) Start() error {
+	return c.Engine.client.StartContainer(c.Id, nil)
+}
+
 // Containers represents a list of containers
 type Containers []*Container
 
@@ -1,6 +1,12 @@
 package cluster
 
-import "github.com/samalba/dockerclient"
+import (
+	"errors"
+	"sync"
+
+	log "github.com/Sirupsen/logrus"
+	"github.com/samalba/dockerclient"
+)
 
 // Event is exported
 type Event struct {

@@ -12,3 +18,49 @@ type Event struct {
 type EventHandler interface {
 	Handle(*Event) error
 }
+
+// EventHandlers is a map of EventHandler
+type EventHandlers struct {
+	sync.RWMutex
+
+	eventHandlers map[EventHandler]struct{}
+}
+
+// NewEventHandlers returns an EventHandlers
+func NewEventHandlers() *EventHandlers {
+	return &EventHandlers{
+		eventHandlers: make(map[EventHandler]struct{}),
+	}
+}
+
+// Handle callbacks for the events
+func (eh *EventHandlers) Handle(e *Event) {
+	eh.RLock()
+	defer eh.RUnlock()
+
+	for h := range eh.eventHandlers {
+		if err := h.Handle(e); err != nil {
+			log.Error(err)
+		}
+	}
+}
+
+// RegisterEventHandler registers an event handler.
+func (eh *EventHandlers) RegisterEventHandler(h EventHandler) error {
+	eh.Lock()
+	defer eh.Unlock()
+
+	if _, ok := eh.eventHandlers[h]; ok {
+		return errors.New("event handler already set")
+	}
+	eh.eventHandlers[h] = struct{}{}
+	return nil
+}
+
+// UnregisterEventHandler unregisters a previously registered event handler.
+func (eh *EventHandlers) UnregisterEventHandler(h EventHandler) {
+	eh.Lock()
+	defer eh.Unlock()
+
+	delete(eh.eventHandlers, h)
+}

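As a usage illustration only (not part of this commit): any type with a `Handle(*Event) error` method satisfies `EventHandler` and can be registered with the new `EventHandlers` set. The `logHandler` type, the import path, and the `main` wrapper below are assumptions:

```
package main

import (
	"log"

	"github.com/docker/swarm/cluster" // assumed import path
)

// logHandler is a hypothetical handler used only for illustration.
type logHandler struct{}

// Handle logs every event it receives, satisfying cluster.EventHandler.
func (logHandler) Handle(e *cluster.Event) error {
	log.Printf("swarm event: status=%s from=%s", e.Status, e.From)
	return nil
}

func main() {
	handlers := cluster.NewEventHandlers()
	if err := handlers.RegisterEventHandler(logHandler{}); err != nil {
		log.Fatal(err)
	}

	// Handle fans the event out to every registered handler.
	handlers.Handle(&cluster.Event{})

	handlers.UnregisterEventHandler(logHandler{})
}
```
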
@@ -29,7 +29,7 @@ type Cluster struct {
 
 	driver           *mesosscheduler.MesosSchedulerDriver
 	dockerEnginePort string
-	eventHandler     cluster.EventHandler
+	eventHandlers    *cluster.EventHandlers
 	master           string
 	agents           map[string]*agent
 	scheduler        *scheduler.Scheduler

@@ -67,6 +67,7 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, master st
 	}
 	cluster := &Cluster{
 		dockerEnginePort: defaultDockerEnginePort,
+		eventHandlers:    cluster.NewEventHandlers(),
 		master:           master,
 		agents:           make(map[string]*agent),
 		scheduler:        scheduler,

@@ -156,22 +157,18 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, master st
 
 // Handle callbacks for the events
 func (c *Cluster) Handle(e *cluster.Event) error {
-	if c.eventHandler == nil {
-		return nil
-	}
-	if err := c.eventHandler.Handle(e); err != nil {
-		log.Error(err)
-	}
+	c.eventHandlers.Handle(e)
 	return nil
 }
 
 // RegisterEventHandler registers an event handler.
 func (c *Cluster) RegisterEventHandler(h cluster.EventHandler) error {
-	if c.eventHandler != nil {
-		return errors.New("event handler already set")
-	}
-	c.eventHandler = h
-	return nil
+	return c.eventHandlers.RegisterEventHandler(h)
+}
+
+// UnregisterEventHandler unregisters a previously registered event handler.
+func (c *Cluster) UnregisterEventHandler(h cluster.EventHandler) {
+	c.eventHandlers.UnregisterEventHandler(h)
 }
 
 // CreateContainer for container creation in Mesos task

@@ -50,7 +50,7 @@ func (p *pendingContainer) ToContainer() *cluster.Container {
 type Cluster struct {
 	sync.RWMutex
 
-	eventHandler   cluster.EventHandler
+	eventHandlers  *cluster.EventHandlers
 	engines        map[string]*cluster.Engine
 	pendingEngines map[string]*cluster.Engine
 	scheduler      *scheduler.Scheduler

@@ -67,6 +67,7 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, discovery
 	log.WithFields(log.Fields{"name": "swarm"}).Debug("Initializing cluster")
 
 	cluster := &Cluster{
+		eventHandlers:  cluster.NewEventHandlers(),
 		engines:        make(map[string]*cluster.Engine),
 		pendingEngines: make(map[string]*cluster.Engine),
 		scheduler:      scheduler,

@@ -90,22 +91,18 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, discovery
 
 // Handle callbacks for the events
 func (c *Cluster) Handle(e *cluster.Event) error {
-	if c.eventHandler == nil {
-		return nil
-	}
-	if err := c.eventHandler.Handle(e); err != nil {
-		log.Error(err)
-	}
+	c.eventHandlers.Handle(e)
 	return nil
 }
 
 // RegisterEventHandler registers an event handler.
 func (c *Cluster) RegisterEventHandler(h cluster.EventHandler) error {
-	if c.eventHandler != nil {
-		return errors.New("event handler already set")
-	}
-	c.eventHandler = h
-	return nil
+	return c.eventHandlers.RegisterEventHandler(h)
+}
+
+// UnregisterEventHandler unregisters a previously registered event handler.
+func (c *Cluster) UnregisterEventHandler(h cluster.EventHandler) {
+	c.eventHandlers.UnregisterEventHandler(h)
 }
 
 // Generate a globally (across the cluster) unique ID.

@@ -145,9 +142,12 @@ func (c *Cluster) createContainer(config *cluster.ContainerConfig, name string,
 		return nil, fmt.Errorf("Conflict: The name %s is already assigned. You have to delete (or rename) that container to be able to assign %s to a container again.", name, name)
 	}
 
-	// Associate a Swarm ID to the container we are creating.
-	swarmID := c.generateUniqueID()
-	config.SetSwarmID(swarmID)
+	swarmID := config.SwarmID()
+	if swarmID == "" {
+		// Associate a Swarm ID to the container we are creating.
+		swarmID = c.generateUniqueID()
+		config.SetSwarmID(swarmID)
+	}
 
 	if withImageAffinity {
 		config.AddAffinity("image==" + config.Image)

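A minimal sketch (not part of this commit) of the behaviour this change enables: a config that already carries a Swarm ID, as a rescheduled container's config does, keeps that ID instead of receiving a fresh one. The import path, the hypothetical ID value, and the `main` wrapper are assumptions:

```
package main

import (
	"fmt"

	"github.com/docker/swarm/cluster" // assumed import path
	"github.com/samalba/dockerclient"
)

func main() {
	config := cluster.BuildContainerConfig(dockerclient.ContainerConfig{Image: "redis"})

	// Mirror the logic above: only generate an ID if none is set yet.
	if config.SwarmID() == "" {
		config.SetSwarmID("0123456789abcdef") // hypothetical ID for illustration
	}

	// A second pass over the same config (e.g. when rescheduling) keeps the existing ID.
	fmt.Println(config.SwarmID()) // 0123456789abcdef
}
```
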
@@ -0,0 +1,98 @@
+package cluster
+
+import (
+	"sync"
+
+	log "github.com/Sirupsen/logrus"
+)
+
+// Watchdog listens to cluster events and handles container rescheduling
+type Watchdog struct {
+	sync.Mutex
+	cluster Cluster
+}
+
+// Handle cluster callbacks
+func (w *Watchdog) Handle(e *Event) error {
+	// Skip non-swarm events.
+	if e.From != "swarm" {
+		return nil
+	}
+
+	switch e.Status {
+	case "engine_reconnect":
+		go w.removeDuplicateContainers(e.Engine)
+	case "engine_disconnect":
+		go w.rescheduleContainers(e.Engine)
+	}
+	return nil
+}
+
+// Remove duplicate containers when a node comes back
+func (w *Watchdog) removeDuplicateContainers(e *Engine) {
+	log.Debugf("removing duplicate containers from Node %s", e.ID)
+
+	e.RefreshContainers(false)
+
+	w.Lock()
+	defer w.Unlock()
+
+	for _, container := range e.Containers() {
+
+		for _, containerInCluster := range w.cluster.Containers() {
+			if containerInCluster.Config.SwarmID() == container.Config.SwarmID() && containerInCluster.Engine.ID != container.Engine.ID {
+				log.Debugf("container %s was rescheduled on node %s, removing it\n", container.Id, containerInCluster.Engine.ID)
+				// container already exists in the cluster, destroy it
+				e.RemoveContainer(container, true, true)
+			}
+		}
+	}
+}
+
+// Reschedule containers as soon as a node fails
+func (w *Watchdog) rescheduleContainers(e *Engine) {
+	w.Lock()
+	defer w.Unlock()
+
+	log.Debugf("Node %s failed - rescheduling containers", e.ID)
+	for _, c := range e.Containers() {
+
+		// Skip containers which don't have an "on-node-failure" reschedule policy.
+		if !c.Config.HasReschedulePolicy("on-node-failure") {
+			log.Debugf("Skipping rescheduling of %s based on rescheduling policies", c.Id)
+			continue
+		}
+
+		// Remove the container from the dead engine. If we don't, then both
+		// the old and new one will show up in docker ps.
+		// We have to do this before calling `CreateContainer`, otherwise it
+		// will abort because the name is already taken.
+		c.Engine.removeContainer(c)
+
+		newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
+
+		if err != nil {
+			log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
+			// add the container back, so we can retry later
+			c.Engine.AddContainer(c)
+		} else {
+			log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
+			if c.Info.State.Running {
+				if err := newContainer.Start(); err != nil {
+					log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
+				}
+			}
+		}
+	}
+
+}
+
+// NewWatchdog creates a new watchdog
+func NewWatchdog(cluster Cluster) *Watchdog {
+	log.Debugf("Watchdog enabled")
+	w := &Watchdog{
+		cluster: cluster,
+	}
+	cluster.RegisterEventHandler(w)
+	return w
+}

@@ -10,7 +10,7 @@ parent="smn_workw_swarm"
 
 ## Advanced Scheduling
 
-To learn more about advanced scheduling, see the
-[strategies](strategy.md) and [filters](filter.md)
+To learn more about advanced scheduling, see the [rescheduling]
+(rescheduling.md), [strategies](strategy.md) and [filters](filter.md)
 documents.
 

@@ -0,0 +1,21 @@
+# Docker Swarm Experimental Features
+
+This page contains a list of features in the Docker Swarm which are
+experimental. Experimental features are **not** ready for production. They are
+provided for test and evaluation in your sandbox environments.
+
+The information below describes each feature and the GitHub pull requests and
+issues associated with it. If necessary, links are provided to additional
+documentation on an issue. As an active Docker user and community member,
+please feel free to provide any feedback on these features you wish.
+
+
+## Current experimental features
+
+* [Container Rescheduling on node failure](rescheduling.md)
+
+## How to comment on an experimental feature
+
+Each feature's documentation includes a list of proposal pull requests or PRs associated with the feature. If you want to comment on or suggest a change to a feature, please add it to the existing feature PR.
+
+Issues or problems with a feature? Inquire for help on the `#swarm` IRC channel or on the [Docker Google group](https://groups.google.com/forum/#!forum/docker-user).

@@ -0,0 +1,41 @@
+<!--[metadata]>
++++
+title = "Docker Swarm rescheduling"
+description = "Swarm rescheduling"
+keywords = ["docker, swarm, clustering, rescheduling"]
+[menu.main]
+parent="smn_workw_swarm"
+weight=5
++++
+<![end-metadata]-->
+
+# Rescheduling
+
+The Docker Swarm scheduler is able to detect node failure and
+restart its containers on another node.
+
+## Rescheduling policies
+
+The rescheduling policies are:
+
+* `on-node-failure`
+* `off` (default if not specified)
+
+
+When you start a container, use the env var `reschedule` or the
+label `com.docker.swarm.reschedule-policies` to specify the policy to
+apply to the container.
+
+```
+# do not reschedule (default)
+$ docker run -d -e reschedule:off redis
+# or
+$ docker run -d -l 'com.docker.swarm.reschedule-policies=["off"]' redis
+```
+
+```
+# reschedule on node failure
+$ docker run -d -e reschedule:on-node-failure redis
+# or
+$ docker run -d -l 'com.docker.swarm.reschedule-policies=["on-node-failure"]' redis
+```

@@ -0,0 +1,184 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function teardown() {
+	swarm_manage_cleanup
+	stop_docker
+}
+
+@test "rescheduling" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=off
+	run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["off"]' busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1
+	run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 3 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Get c1 swarm id
+	swarm_id=$(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1)
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# Wait for the container to be rescheduled
+	retry 5 1 eval docker_swarm inspect c1
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Check swarm id didn't change for c1
+	[[ "$swarm_id" == $(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1) ]]
+
+	run docker_swarm inspect "$swarm_id"
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 was not rescheduled since its policy was off; node-0 is down, so inspect fails.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c3 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+}
+
+@test "rescheduling with constraints" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=on-node-failure and a hard node constraint
+	run docker_swarm run -dit --name c2 -e constraint:node==node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1
+	run docker_swarm run -dit --name c3 -e constraint:node==node-1 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 3 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# Wait for the container to be rescheduled
+	retry 5 1 eval docker_swarm inspect c1
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 could not be rescheduled because of its hard node constraint; node-0 is down, so inspect fails.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c3 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+}
+
+@test "reschedule conflict" {
+	start_docker_with_busybox 2
+	swarm_manage
+
+	run docker_swarm run --name c1 -dit --label 'com.docker.swarm.reschedule-policies=["false"]' busybox sh
+	[ "$status" -ne 0 ]
+	[[ "${output}" == *'invalid reschedule policy: false'* ]]
+
+	run docker_swarm run --name c2 -dit -e reschedule:off --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' -e reschedule:off busybox sh
+	[ "$status" -ne 0 ]
+	[[ "${output}" == *'too many reschedule policies'* ]]
+}
+
+@test "rescheduling node comes back" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-1
+	run docker_swarm run -dit --name c2 -e constraint:node==~node-1 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 2 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# Wait for the container to be rescheduled
+	retry 5 1 eval docker_swarm inspect c1
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Restart node-0
+	docker_host start ${DOCKER_CONTAINERS[0]}
+
+	sleep 5
+	run docker_swarm ps
+	[ "${#lines[@]}" -eq 3 ]
+}