Merge pull request #1636 from jimenez/checkpoint_failover

Enabling checkpoint failover in FrameworkInfo
This commit is contained in:
Victor Vieux 2016-01-14 10:18:09 -08:00
commit 08839f62fa
4 changed files with 21 additions and 7 deletions

View File

@ -46,11 +46,12 @@ Options:
{{end}}{{if (eq .Name "manage")}}{{printf "\t * swarm.overcommit=0.05\tovercommit to apply on resources"}} {{end}}{{if (eq .Name "manage")}}{{printf "\t * swarm.overcommit=0.05\tovercommit to apply on resources"}}
{{printf "\t * swarm.createretry=0\tcontainer create retry count after initial failure"}} {{printf "\t * swarm.createretry=0\tcontainer create retry count after initial failure"}}
{{printf "\t * mesos.address=\taddress to bind on [$SWARM_MESOS_ADDRESS]"}} {{printf "\t * mesos.address=\taddress to bind on [$SWARM_MESOS_ADDRESS]"}}
{{printf "\t * mesos.checkpointfailover=false\tcheckpointing allows a restarted slave to reconnect with old executors and recover status updates, at the cost of disk I/O [$SWARM_MESOS_CHECKPOINT_FAILOVER]"}}
{{printf "\t * mesos.port=\tport to bind on [$SWARM_MESOS_PORT]"}} {{printf "\t * mesos.port=\tport to bind on [$SWARM_MESOS_PORT]"}}
{{printf "\t * mesos.offertimeout=30s\ttimeout for offers [$SWARM_MESOS_OFFER_TIMEOUT]"}} {{printf "\t * mesos.offertimeout=30s\ttimeout for offers [$SWARM_MESOS_OFFER_TIMEOUT]"}}
{{printf "\t * mesos.offerrefusetimeout=5s\tseconds to consider unused resources refused [$SWARM_MESOS_OFFER_REFUSE_TIMEOUT]"}}
{{printf "\t * mesos.tasktimeout=5s\ttimeout for task creation [$SWARM_MESOS_TASK_TIMEOUT]"}} {{printf "\t * mesos.tasktimeout=5s\ttimeout for task creation [$SWARM_MESOS_TASK_TIMEOUT]"}}
{{printf "\t * mesos.user=\tframework user [$SWARM_MESOS_USER]"}} {{printf "\t * mesos.user=\tframework user [$SWARM_MESOS_USER]"}}{{end}}{{ end }}
{{printf "\t * mesos.offerrefusetimeout=5s\tseconds to consider unused resources refused [$SWARM_MESOS_OFFER_REFUSE_TIMEOUT]"}}{{end}}{{ end }}
` `
} }

View File

@ -119,6 +119,10 @@ func NewCluster(scheduler *scheduler.Scheduler, TLSConfig *tls.Config, master st
driverConfig.BindingAddress = bindingAddress driverConfig.BindingAddress = bindingAddress
} }
if checkpointFailover, ok := options.Bool("mesos.checkpointfailover", "SWARM_MESOS_CHECKPOINT_FAILOVER"); ok {
driverConfig.Framework.Checkpoint = &checkpointFailover
}
if offerTimeout, ok := options.String("mesos.offertimeout", "SWARM_MESOS_OFFER_TIMEOUT"); ok { if offerTimeout, ok := options.String("mesos.offertimeout", "SWARM_MESOS_OFFER_TIMEOUT"); ok {
d, err := time.ParseDuration(offerTimeout) d, err := time.ParseDuration(offerTimeout)
if err != nil { if err != nil {

View File

@ -58,3 +58,12 @@ func (do DriverOpts) IP(key, env string) (net.IP, bool) {
} }
return nil, false return nil, false
} }
// Bool returns the driver option identified by key/env parsed as a boolean.
//
// The raw value is obtained from do.String(key, env). The second result
// reports whether a usable value was found: it is false when the option is
// absent and also when the value is present but is not a valid boolean
// according to strconv.ParseBool (accepted forms are 1, t, T, TRUE, true,
// True, 0, f, F, FALSE, false, False).
func (do DriverOpts) Bool(key, env string) (bool, bool) {
	value, ok := do.String(key, env)
	if !ok {
		return false, false
	}
	b, err := strconv.ParseBool(value)
	if err != nil {
		// Report a malformed value as "not set" rather than as an explicit
		// false, so callers never apply a setting the user did not validly
		// provide.
		return false, false
	}
	return b, true
}

View File

@ -8,7 +8,7 @@ export SWARM_MESOS_USER=root
MESOS_IMAGE=dockerswarm/mesos:0.25.0 MESOS_IMAGE=dockerswarm/mesos:0.25.0
MESOS_MASTER_PORT=$(( ( RANDOM % 1000 ) + 10000 )) MESOS_MASTER_PORT=$(( ( RANDOM % 1000 ) + 10000 ))
# Start mesos master and slave. # Start mesos master and agent.
function start_mesos() { function start_mesos() {
local current=${#DOCKER_CONTAINERS[@]} local current=${#DOCKER_CONTAINERS[@]}
MESOS_MASTER=$( MESOS_MASTER=$(
@ -19,7 +19,7 @@ function start_mesos() {
retry 10 1 eval "docker_host ps | grep 'mesos-master'" retry 10 1 eval "docker_host ps | grep 'mesos-master'"
for ((i=0; i < current; i++)); do for ((i=0; i < current; i++)); do
local docker_port=$(echo ${HOSTS[$i]} | cut -d: -f2) local docker_port=$(echo ${HOSTS[$i]} | cut -d: -f2)
MESOS_SLAVES[$i]=$( MESOS_AGENTS[$i]=$(
docker_host run --privileged -d --name mesos-slave-$i --volumes-from node-$i -v /sys/fs/cgroup:/sys/fs/cgroup --net=host -u root \ docker_host run --privileged -d --name mesos-slave-$i --volumes-from node-$i -v /sys/fs/cgroup:/sys/fs/cgroup --net=host -u root \
$MESOS_IMAGE mesos-slave --master=127.0.0.1:$MESOS_MASTER_PORT --containerizers=docker --attributes="docker_port:$docker_port" --hostname=127.0.0.1 --port=$(($MESOS_MASTER_PORT + (1 + $i))) --docker=/usr/local/bin/docker $MESOS_IMAGE mesos-slave --master=127.0.0.1:$MESOS_MASTER_PORT --containerizers=docker --attributes="docker_port:$docker_port" --hostname=127.0.0.1 --port=$(($MESOS_MASTER_PORT + (1 + $i))) --docker=/usr/local/bin/docker
) )
@ -37,7 +37,7 @@ function start_mesos_zk() {
retry 10 1 eval "docker_host ps | grep 'mesos-master'" retry 10 1 eval "docker_host ps | grep 'mesos-master'"
for ((i=0; i < current; i++)); do for ((i=0; i < current; i++)); do
local docker_port=$(echo ${HOSTS[$i]} | cut -d: -f2) local docker_port=$(echo ${HOSTS[$i]} | cut -d: -f2)
MESOS_SLAVES[$i]=$( MESOS_AGENTS[$i]=$(
docker_host run --privileged -d --name mesos-slave-$i --volumes-from node-$i -v /sys/fs/cgroup:/sys/fs/cgroup --net=host -u root \ docker_host run --privileged -d --name mesos-slave-$i --volumes-from node-$i -v /sys/fs/cgroup:/sys/fs/cgroup --net=host -u root \
$MESOS_IMAGE mesos-slave --master=127.0.0.1:$MESOS_MASTER_PORT --containerizers=docker --attributes="docker_port:$docker_port" --hostname=127.0.0.1 --port=$(($MESOS_MASTER_PORT + (1 + $i))) --docker=/usr/local/bin/docker $MESOS_IMAGE mesos-slave --master=127.0.0.1:$MESOS_MASTER_PORT --containerizers=docker --attributes="docker_port:$docker_port" --hostname=127.0.0.1 --port=$(($MESOS_MASTER_PORT + (1 + $i))) --docker=/usr/local/bin/docker
) )
@ -45,11 +45,11 @@ function start_mesos_zk() {
done done
} }
# Stop mesos master and slave. # Stop mesos master and agent
function stop_mesos() { function stop_mesos() {
echo "Stopping $MESOS_MASTER" echo "Stopping $MESOS_MASTER"
docker_host rm -f -v $MESOS_MASTER > /dev/null; docker_host rm -f -v $MESOS_MASTER > /dev/null;
for id in ${MESOS_SLAVES[@]}; do for id in ${MESOS_AGENTS[@]}; do
echo "Stopping $id" echo "Stopping $id"
docker_host rm -f -v $id > /dev/null; docker_host rm -f -v $id > /dev/null;
done done