Add swarm container create retry option.

Signed-off-by: Dong Chen <dongluo.chen@docker.com>
This commit is contained in:
Dong Chen 2016-01-12 19:28:06 -08:00
parent d21748699d
commit 8cc9b6c284
3 changed files with 37 additions and 1 deletions

View File

@ -44,6 +44,7 @@ Arguments:
Options:
{{range .Flags}}{{.}}
{{end}}{{if (eq .Name "manage")}}{{printf "\t * swarm.overcommit=0.05\tovercommit to apply on resources"}}
{{printf "\t * swarm.createretry=0\tcontainer create retry count after initial failure"}}
{{printf "\t * mesos.address=\taddress to bind on [$SWARM_MESOS_ADDRESS]"}}
{{printf "\t * mesos.port=\tport to bind on [$SWARM_MESOS_PORT]"}}
{{printf "\t * mesos.offertimeout=30s\ttimeout for offers [$SWARM_MESOS_OFFER_TIMEOUT]"}}

View File

@ -59,6 +59,7 @@ type Cluster struct {
overcommitRatio float64
engineOpts *cluster.EngineOpts
createRetry int64
TLSConfig *tls.Config
}
@ -76,12 +77,20 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, discovery
pendingContainers: make(map[string]*pendingContainer),
overcommitRatio: 0.05,
engineOpts: engineOptions,
createRetry: 0,
}
if val, ok := options.Float("swarm.overcommit", ""); ok {
cluster.overcommitRatio = val
}
if val, ok := options.Int("swarm.createretry", ""); ok {
if val < 0 {
log.Fatalf("swarm.createretry=%d is invalid", val)
}
cluster.createRetry = val
}
discoveryCh, errCh := cluster.discovery.Watch(nil)
go cluster.monitorDiscovery(discoveryCh, errCh)
go cluster.monitorPendingEngines()
@ -119,16 +128,23 @@ func (c *Cluster) generateUniqueID() string {
// imageNotFoundRe matches the error text reported when the requested image is
// missing. It is compiled once at package initialization rather than on every
// failed create.
var imageNotFoundRe = regexp.MustCompile(`image \S* not found`)

// CreateContainer creates a container from config with the given name,
// retrying when the first attempt fails.
//
// If the first attempt fails with an "image not found" error, the config has
// no node constraint, and the image is known to the cluster, the creation is
// retried once with image affinity; that attempt counts as one retry. While
// the creation keeps failing, it is then retried without image affinity until
// c.createRetry retries have been used. The container and error of the last
// attempt are returned.
func (c *Cluster) CreateContainer(config *cluster.ContainerConfig, name string, authConfig *dockerclient.AuthConfig) (*cluster.Container, error) {
	container, err := c.createContainer(config, name, false, authConfig)
	if err == nil {
		return container, nil
	}

	var retries int64

	// Fails with image not found: try to reschedule with image affinity,
	// but only if the image actually exists somewhere in the cluster.
	if imageNotFoundRe.MatchString(err.Error()) && !config.HaveNodeConstraint() && c.Image(config.Image) != nil {
		container, err = c.createContainer(config, name, true, authConfig)
		retries++
	}

	// Generic retries, bounded by the swarm.createretry cluster option.
	for ; retries < c.createRetry && err != nil; retries++ {
		log.WithFields(log.Fields{"Name": "Swarm"}).Warnf("Failed to create container: %s, retrying", err)
		container, err = c.createContainer(config, name, false, authConfig)
	}

	return container, err
}

View File

@ -55,3 +55,22 @@ function teardown() {
[ "$status" -eq 0 ]
}
# Checks that a failed container create is retried when swarm is started
# with the swarm.createretry cluster option.
@test "scheduler retry" {
# Start 2 engines.
start_docker 2
# Start swarm with swarm.createretry=1 and check it can reach both nodes.
# Refresh interval is 20s, with 20 retries before marking an engine as unhealthy
# (presumably so the node stopped below is still treated as healthy by the scheduler).
swarm_manage --engine-refresh-min-interval "20s" --engine-refresh-max-interval "20s" --engine-failure-retry 20 -cluster-opt swarm.createretry=1 "${HOSTS[0]},${HOSTS[1]}"
eval "docker_swarm info | grep -q -i 'Nodes: 2'"
# Use memory on node-0 (presumably so the scheduler prefers node-1 for the next container).
docker_swarm run -e constraint:node==node-0 -m 50m busybox sh
# Stop node-1.
docker_host stop ${DOCKER_CONTAINERS[1]}
# Try to run a container. It is expected to try node-1 first and, upon failure,
# automatically retry on node-0, so the command must succeed.
run docker_swarm run -m 10m busybox sh
[ "$status" -eq 0 ]
}