add an example for cpu-only case (#210)
* add example for cpu-only case
* add more details about the cpu-only example
* set python3.6 as default for cpu-only example
This commit is contained in:
parent 56cf34e777
commit c3d6e612c6

Dockerfile.cpu
@@ -0,0 +1,78 @@
FROM ubuntu:18.04

ENV TENSORFLOW_VERSION=1.14.0
ENV PYTORCH_VERSION=1.4.0
ENV TORCHVISION_VERSION=0.5.0
ENV MXNET_VERSION=1.6.0

# Python 2.7 or 3.6 is supported by Ubuntu Bionic out of the box
ARG python=3.6
ENV PYTHON_VERSION=${python}

# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        build-essential \
        cmake \
        g++-4.8 \
        git \
        curl \
        vim \
        wget \
        ca-certificates \
        libjpeg-dev \
        libpng-dev \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        librdmacm1 \
        libibverbs1 \
        ibverbs-providers

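# get-pip.py needs distutils, which Ubuntu Bionic packages separately for Python 3.6 (Python 2.7 bundles it)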
RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
        apt-get install -y python${PYTHON_VERSION}-distutils; \
    fi
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

# Install TensorFlow, Keras, PyTorch and MXNet
RUN pip install future typing
RUN pip install numpy \
        tensorflow==${TENSORFLOW_VERSION} \
        keras \
        h5py
RUN pip install torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION}
RUN pip install mxnet==${MXNET_VERSION}

# Install Open MPI
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
    tar zxf openmpi-4.0.0.tar.gz && \
    cd openmpi-4.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j $(nproc) all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# Install Horovod
RUN HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 \
    pip install --no-cache-dir horovod

# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && mkdir /examples

# Get example python scripts
COPY tensorflow_mnist.py /examples

WORKDIR "/examples"
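
After the image is built (see the README below), Horovod can report which frameworks and controllers it was compiled with; a quick sanity check, assuming the `horovod-cpu:latest` tag used in the README:

```bash
# Print Horovod's compiled-in framework and controller support
docker run --rm horovod-cpu:latest horovodrun --check-build
```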

README.md
@@ -0,0 +1,19 @@
# Horovod CPU-Only Case

This example shows how to run a CPU-only MPIJob.

## How to Build the Image

The example Dockerfile is based on the Horovod CPU-only [Dockerfile](https://raw.githubusercontent.com/horovod/horovod/master/Dockerfile.cpu). Build the image as follows:

```bash
docker build -t horovod-cpu:latest -f Dockerfile.cpu .
```
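
Python 3.6 is the default (`ARG python=3.6` in the Dockerfile). Ubuntu Bionic also ships Python 2.7, so a 2.7-based image can be built by overriding the build arg (the `py27` tag below is just an illustrative name):

```bash
docker build -t horovod-cpu:py27 --build-arg python=2.7 -f Dockerfile.cpu .
```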

## Create the MPIJob

The example MPIJob runs the Horovod CPU-only example [tensorflow_mnist.py](https://raw.githubusercontent.com/horovod/horovod/master/examples/tensorflow_mnist.py):

```bash
kubectl create -f ./tensorflow-mnist.yaml
```
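
Once the job is created, progress can be followed from the launcher pod. A minimal sketch; the `mpi_job_name`/`mpi_role_type` label selectors below are an assumption based on the labels the v1alpha2 mpi-operator attaches to its pods, and may differ between operator versions:

```bash
# Watch the MPIJob resource
kubectl get mpijob tensorflow-mnist

# The training output is written by the launcher pod
kubectl logs -f -l mpi_job_name=tensorflow-mnist,mpi_role_type=launcher
```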

tensorflow-mnist.yaml
@@ -0,0 +1,54 @@
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: tensorflow-mnist
spec:
  slotsPerWorker: 1
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
          - image: horovod-cpu:latest
            name: mpi-launcher
            command:
            - mpirun
            args:
            - -np
            - "2"
            - --allow-run-as-root
            - -bind-to
            - none
            - -map-by
            - slot
            - -x
            - LD_LIBRARY_PATH
            - -x
            - PATH
            - -mca
            - pml
            - ob1
            - -mca
            - btl
            - ^openib
            - python
            - /examples/tensorflow_mnist.py
            resources:
              limits:
                cpu: 1
                memory: 2Gi
    Worker:
      replicas: 2
      template:
        spec:
          containers:
          - command:
            - ""
            image: horovod-cpu:latest
            name: mpi-worker
            resources:
              limits:
                cpu: 2
                memory: 4Gi
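
Note that `-np 2` in the launcher args equals `slotsPerWorker: 1` times the two worker replicas; when scaling the worker count, keep these values in sync so every MPI process has a slot to land on.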

tensorflow_mnist.py
@@ -0,0 +1,172 @@
# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import os
import errno
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
import argparse

from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)

# Training settings
parser = argparse.ArgumentParser(description='Tensorflow MNIST Example')
parser.add_argument('--use-adasum', action='store_true', default=False,
                    help='use adasum algorithm to do reduction')
args = parser.parse_args()


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height, final dimension being the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # reshape tensor into a batch of vectors
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size],
            index += batch_size


def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL,
    # scale lr by local_size
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(0.001 * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})


if __name__ == "__main__":
    tf.app.run()
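
The same script can also be smoke-tested outside Kubernetes by launching two Horovod processes inside a single container; a sketch, assuming the image tag from the README:

```bash
# Run two Horovod ranks on the local host, both on the CPU
docker run --rm horovod-cpu:latest \
    horovodrun -np 2 -H localhost:2 python /examples/tensorflow_mnist.py
```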