mirror of https://github.com/docker/docs.git
Merge pull request #2436 from dongluochen/rescheduleWithOverlay
Reschedule containers with global network
This commit is contained in:
commit
dcaaa5570a
|
@ -2,8 +2,11 @@ package cluster
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
log "github.com/Sirupsen/logrus"
|
log "github.com/Sirupsen/logrus"
|
||||||
|
"github.com/docker/engine-api/types/network"
|
||||||
|
"golang.org/x/net/context"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Watchdog listens to cluster events and handles container rescheduling
|
// Watchdog listens to cluster events and handles container rescheduling
|
||||||
|
@ -61,6 +64,7 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
|
||||||
defer w.Unlock()
|
defer w.Unlock()
|
||||||
|
|
||||||
log.Debugf("Node %s failed - rescheduling containers", e.ID)
|
log.Debugf("Node %s failed - rescheduling containers", e.ID)
|
||||||
|
|
||||||
for _, c := range e.Containers() {
|
for _, c := range e.Containers() {
|
||||||
|
|
||||||
// Skip containers which don't have an "on-node-failure" reschedule policy.
|
// Skip containers which don't have an "on-node-failure" reschedule policy.
|
||||||
|
@ -75,13 +79,68 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
|
||||||
// will abort because the name is already taken.
|
// will abort because the name is already taken.
|
||||||
c.Engine.removeContainer(c)
|
c.Engine.removeContainer(c)
|
||||||
|
|
||||||
newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
|
// keep track of all global networks this container is connected to
|
||||||
|
globalNetworks := make(map[string]*network.EndpointSettings)
|
||||||
|
// if the existing container has global network endpoints,
|
||||||
|
// they need to be removed with force option
|
||||||
|
// "docker network disconnect -f network containername" only takes containername
|
||||||
|
name := c.Info.Name
|
||||||
|
if len(name) == 0 || len(name) == 1 && name[0] == '/' {
|
||||||
|
log.Errorf("container %s has no name", c.ID)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// cut preceding '/'
|
||||||
|
if name[0] == '/' {
|
||||||
|
name = name[1:]
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.NetworkSettings != nil && len(c.NetworkSettings.Networks) > 0 {
|
||||||
|
// find an engine to do disconnect work
|
||||||
|
randomEngine, err := w.cluster.RandomEngine()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Failed to find an engine to do network cleanup for container %s: %v", c.ID, err)
|
||||||
|
// add the container back, so we can retry later
|
||||||
|
c.Engine.AddContainer(c)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
clusterNetworks := w.cluster.Networks().Uniq()
|
||||||
|
for networkName, endpoint := range c.NetworkSettings.Networks {
|
||||||
|
net := clusterNetworks.Get(endpoint.NetworkID)
|
||||||
|
if net != nil && net.Scope == "global" {
|
||||||
|
// record the network, it should be reconstructed on the new container
|
||||||
|
globalNetworks[networkName] = endpoint
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
err = randomEngine.apiClient.NetworkDisconnect(ctx, networkName, name, true)
|
||||||
|
if err != nil {
|
||||||
|
// do not abort here as this endpoint might have been removed before
|
||||||
|
log.Warnf("Failed to remove network endpoint from old container %s: %v", name, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Failed to reschedule container %s: %v", c.ID, err)
|
log.Errorf("Failed to reschedule container %s: %v", c.ID, err)
|
||||||
// add the container back, so we can retry later
|
// add the container back, so we can retry later
|
||||||
c.Engine.AddContainer(c)
|
c.Engine.AddContainer(c)
|
||||||
} else {
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Docker create command cannot create a container with multiple networks
|
||||||
|
// see https://github.com/docker/docker/issues/17750
|
||||||
|
// Add the global networks one by one
|
||||||
|
for networkName, endpoint := range globalNetworks {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
err = newContainer.Engine.apiClient.NetworkConnect(ctx, networkName, name, endpoint)
|
||||||
|
if err != nil {
|
||||||
|
log.Warnf("Failed to connect network %s to container %s: %v", networkName, name, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
log.Infof("Rescheduled container %s from %s to %s as %s", c.ID, c.Engine.Name, newContainer.Engine.Name, newContainer.ID)
|
log.Infof("Rescheduled container %s from %s to %s as %s", c.ID, c.Engine.Name, newContainer.Engine.Name, newContainer.ID)
|
||||||
if c.Info.State.Running {
|
if c.Info.State.Running {
|
||||||
log.Infof("Container %s was running, starting container %s", c.ID, newContainer.ID)
|
log.Infof("Container %s was running, starting container %s", c.ID, newContainer.ID)
|
||||||
|
@ -92,8 +151,6 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewWatchdog creates a new watchdog
|
// NewWatchdog creates a new watchdog
|
||||||
func NewWatchdog(cluster Cluster) *Watchdog {
|
func NewWatchdog(cluster Cluster) *Watchdog {
|
||||||
log.Debugf("Watchdog enabled")
|
log.Debugf("Watchdog enabled")
|
||||||
|
|
Loading…
Reference in New Issue