From c3d6e612c6bf52a04114a22f26b8bc4fd5d53efb Mon Sep 17 00:00:00 2001 From: runzhliu Date: Sat, 4 Apr 2020 23:35:46 +0800 Subject: [PATCH] add an example for cpu-only case (#210) * add example for cpu-only case * add more details about the cpu-only example * set python3.6 as default for cpu-only example --- examples/hovorod-cpu/Dockerfile.cpu | 78 ++++++++++ examples/hovorod-cpu/README.md | 19 +++ examples/hovorod-cpu/tensorflow-mnist.yaml | 54 +++++++ examples/hovorod-cpu/tensorflow_mnist.py | 172 +++++++++++++++++++++ 4 files changed, 323 insertions(+) create mode 100644 examples/hovorod-cpu/Dockerfile.cpu create mode 100644 examples/hovorod-cpu/README.md create mode 100644 examples/hovorod-cpu/tensorflow-mnist.yaml create mode 100644 examples/hovorod-cpu/tensorflow_mnist.py diff --git a/examples/hovorod-cpu/Dockerfile.cpu b/examples/hovorod-cpu/Dockerfile.cpu new file mode 100644 index 0000000..5980b63 --- /dev/null +++ b/examples/hovorod-cpu/Dockerfile.cpu @@ -0,0 +1,78 @@ +FROM ubuntu:18.04 + +ENV TENSORFLOW_VERSION=1.14.0 +ENV PYTORCH_VERSION=1.4.0 +ENV TORCHVISION_VERSION=0.5.0 +ENV MXNET_VERSION=1.6.0 + +# Python 2.7 or 3.6 is supported by Ubuntu Bionic out of the box +ARG python=3.6 +ENV PYTHON_VERSION=${python} + +# Set default shell to /bin/bash +SHELL ["/bin/bash", "-cu"] + +RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + cmake \ + g++-4.8 \ + git \ + curl \ + vim \ + wget \ + ca-certificates \ + libjpeg-dev \ + libpng-dev \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-dev \ + librdmacm1 \ + libibverbs1 \ + ibverbs-providers + +RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \ + apt-get install -y python${PYTHON_VERSION}-distutils; \ + fi +RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +# Install TensorFlow, Keras, PyTorch and MXNet +RUN pip install future typing +RUN pip install numpy \ + tensorflow==${TENSORFLOW_VERSION} \ + keras \ + h5py +RUN pip install torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} +RUN pip install mxnet==${MXNET_VERSION} + +# Install Open MPI +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \ + tar zxf openmpi-4.0.0.tar.gz && \ + cd openmpi-4.0.0 && \ + ./configure --enable-orterun-prefix-by-default && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + rm -rf /tmp/openmpi + +# Install Horovod +RUN HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 \ + pip install --no-cache-dir horovod + +# Install OpenSSH for MPI to communicate between containers +RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ + mkdir -p /var/run/sshd + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && mkdir /examples + +# Get example python scripts +COPY tensorflow_mnist.py /examples + +WORKDIR "/examples" diff --git a/examples/hovorod-cpu/README.md b/examples/hovorod-cpu/README.md new file mode 100644 index 0000000..10716b2 --- /dev/null +++ b/examples/hovorod-cpu/README.md @@ -0,0 +1,19 @@ +# Horovod CPU-Only Case + +This example shows how to run a cpu-only mpijob. + +## How to Build Image + +This example dockerfile is based on Horovod cpu only [dockerfile](https://raw.githubusercontent.com/horovod/horovod/master/Dockerfile.cpu), please build the image as follows: + +```bash +docker build -t horovod:latest -f Dockerfile.cpu . +``` + +## Create Mpijob + +The example mpijob is to run the horovod cpu-only example [tensorflow_mnist.py](https://raw.githubusercontent.com/horovod/horovod/master/examples/tensorflow_mnist.py). + +```bash +kubectl create -f ./tensorflow-mnist.yaml +``` diff --git a/examples/hovorod-cpu/tensorflow-mnist.yaml b/examples/hovorod-cpu/tensorflow-mnist.yaml new file mode 100644 index 0000000..b69bc75 --- /dev/null +++ b/examples/hovorod-cpu/tensorflow-mnist.yaml @@ -0,0 +1,54 @@ +apiVersion: kubeflow.org/v1alpha2 +kind: MPIJob +metadata: + name: tensorflow-mnist +spec: + slotsPerWorker: 1 + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: horovod-cpu:latest + name: mpi-launcher + command: + - mpirun + args: + - -np + - "2" + - --allow-run-as-root + - -bind-to + - none + - -map-by + - slot + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib + - python + - /examples/tensorflow_mnist.py + resources: + limits: + cpu: 1 + memory: 2Gi + Worker: + replicas: 2 + template: + spec: + containers: + - command: + - "" + image: horovod-cpu:latest + name: mpi-worker + resources: + limits: + cpu: 2 + memory: 4Gi diff --git a/examples/hovorod-cpu/tensorflow_mnist.py b/examples/hovorod-cpu/tensorflow_mnist.py new file mode 100644 index 0000000..e4e329a --- /dev/null +++ b/examples/hovorod-cpu/tensorflow_mnist.py @@ -0,0 +1,172 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import errno +import tensorflow as tf +import horovod.tensorflow as hvd +import numpy as np +import argparse + +from tensorflow import keras + +layers = tf.layers + +tf.logging.set_verbosity(tf.logging.INFO) + +# Training settings +parser = argparse.ArgumentParser(description='Tensorflow MNIST Example') +parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') +args = parser.parse_args() + +def conv_model(feature, target, mode): + """2-layer convolution model.""" + # Convert the target to a one-hot tensor of shape (batch_size, 10) and + # with a on-value of 1 for each one-hot vector of length 10. + target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0) + + # Reshape feature to 4d tensor with 2nd and 3rd dimensions being + # image width and height final dimension being the number of color channels. + feature = tf.reshape(feature, [-1, 28, 28, 1]) + + # First conv layer will compute 32 features for each 5x5 patch + with tf.variable_scope('conv_layer1'): + h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5], + activation=tf.nn.relu, padding="SAME") + h_pool1 = tf.nn.max_pool( + h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + + # Second conv layer will compute 64 features for each 5x5 patch. + with tf.variable_scope('conv_layer2'): + h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5], + activation=tf.nn.relu, padding="SAME") + h_pool2 = tf.nn.max_pool( + h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + # reshape tensor into a batch of vectors + h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) + + # Densely connected layer with 1024 neurons. + h_fc1 = layers.dropout( + layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu), + rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN) + + # Compute logits (1 per class) and compute loss. + logits = layers.dense(h_fc1, 10, activation=None) + loss = tf.losses.softmax_cross_entropy(target, logits) + + return tf.argmax(logits, 1), loss + + +def train_input_generator(x_train, y_train, batch_size=64): + assert len(x_train) == len(y_train) + while True: + p = np.random.permutation(len(x_train)) + x_train, y_train = x_train[p], y_train[p] + index = 0 + while index <= len(x_train) - batch_size: + yield x_train[index:index + batch_size], \ + y_train[index:index + batch_size], + index += batch_size + + +def main(_): + # Horovod: initialize Horovod. + hvd.init() + + # Keras automatically creates a cache directory in ~/.keras/datasets for + # storing the downloaded MNIST data. This creates a race + # condition among the workers that share the same filesystem. If the + # directory already exists by the time this worker gets around to creating + # it, ignore the resulting exception and continue. + cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets') + if not os.path.exists(cache_dir): + try: + os.mkdir(cache_dir) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(cache_dir): + pass + else: + raise + + # Download and load MNIST dataset. + (x_train, y_train), (x_test, y_test) = \ + keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank()) + + # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it + # into (-1, 784) to feed into our network. Also, need to normalize the + # features between 0 and 1. + x_train = np.reshape(x_train, (-1, 784)) / 255.0 + x_test = np.reshape(x_test, (-1, 784)) / 255.0 + + # Build model... + with tf.name_scope('input'): + image = tf.placeholder(tf.float32, [None, 784], name='image') + label = tf.placeholder(tf.float32, [None], name='label') + predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) + + lr_scaler = hvd.size() + # By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL, + # scale lr by local_size + if args.use_adasum: + lr_scaler = hvd.local_size() if hvd.nccl_built() else 1 + + # Horovod: adjust learning rate based on lr_scaler. + opt = tf.train.AdamOptimizer(0.001 * lr_scaler) + + # Horovod: add Horovod Distributed Optimizer. + opt = hvd.DistributedOptimizer(opt, op=hvd.Adasum if args.use_adasum else hvd.Average) + + global_step = tf.train.get_or_create_global_step() + train_op = opt.minimize(loss, global_step=global_step) + + hooks = [ + # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states + # from rank 0 to all other processes. This is necessary to ensure consistent + # initialization of all workers when training is started with random weights + # or restored from a checkpoint. + hvd.BroadcastGlobalVariablesHook(0), + + # Horovod: adjust number of steps based on number of GPUs. + tf.train.StopAtStepHook(last_step=20000 // hvd.size()), + + tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss}, + every_n_iter=10), + ] + + # Horovod: pin GPU to be used to process local rank (one GPU per process) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) + + # Horovod: save checkpoints only on worker 0 to prevent other workers from + # corrupting them. + checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None + training_batch_generator = train_input_generator(x_train, + y_train, batch_size=100) + # The MonitoredTrainingSession takes care of session initialization, + # restoring from a checkpoint, saving to a checkpoint, and closing when done + # or an error occurs. + with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, + hooks=hooks, + config=config) as mon_sess: + while not mon_sess.should_stop(): + # Run a training step synchronously. + image_, label_ = next(training_batch_generator) + mon_sess.run(train_op, feed_dict={image: image_, label: label_}) + + +if __name__ == "__main__": + tf.app.run()