From c3d6e612c6bf52a04114a22f26b8bc4fd5d53efb Mon Sep 17 00:00:00 2001
From: runzhliu <runzhliu@tencent.com>
Date: Sat, 4 Apr 2020 23:35:46 +0800
Subject: [PATCH] add an example for cpu-only case (#210)

* add example for cpu-only case

* add more details about the cpu-only example

* set python3.6 as default for cpu-only example
---
 examples/hovorod-cpu/Dockerfile.cpu        |  78 ++++++++++
 examples/hovorod-cpu/README.md             |  19 +++
 examples/hovorod-cpu/tensorflow-mnist.yaml |  54 +++++++
 examples/hovorod-cpu/tensorflow_mnist.py   | 172 +++++++++++++++++++++
 4 files changed, 323 insertions(+)
 create mode 100644 examples/hovorod-cpu/Dockerfile.cpu
 create mode 100644 examples/hovorod-cpu/README.md
 create mode 100644 examples/hovorod-cpu/tensorflow-mnist.yaml
 create mode 100644 examples/hovorod-cpu/tensorflow_mnist.py

diff --git a/examples/hovorod-cpu/Dockerfile.cpu b/examples/hovorod-cpu/Dockerfile.cpu
new file mode 100644
index 0000000..5980b63
--- /dev/null
+++ b/examples/hovorod-cpu/Dockerfile.cpu
@@ -0,0 +1,78 @@
+FROM ubuntu:18.04
+
+# Framework versions consumed by the pip install steps further down
+ENV TENSORFLOW_VERSION=1.14.0 \
+    PYTORCH_VERSION=1.4.0 \
+    TORCHVISION_VERSION=0.5.0 \
+    MXNET_VERSION=1.6.0
+# Python 2.7 or 3.6 is supported by Ubuntu Bionic out of the box
+ARG python=3.6
+ENV PYTHON_VERSION=${python}
+
+# Use bash for RUN: -u rejects unset variables, pipefail fails a RUN when any pipe stage fails
+SHELL ["/bin/bash", "-o", "pipefail", "-cu"]
+
+# Toolchain, Python and RDMA libraries; distutils (3.6 only) is installed in the same layer as apt-get update
+RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        g++-4.8 \
+        git \
+        ibverbs-providers \
+        libibverbs1 \
+        libjpeg-dev \
+        libpng-dev \
+        librdmacm1 \
+        python${PYTHON_VERSION} \
+        python${PYTHON_VERSION}-dev \
+        vim \
+        wget && \
+    if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
+        apt-get install -y --no-install-recommends python${PYTHON_VERSION}-distutils; \
+    fi
+RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
+
+# Fetch the get-pip.py published for the selected interpreter (the unversioned script rejects 2.7/3.6)
+RUN curl -fsSLO https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py && \
+    python get-pip.py && rm get-pip.py
+
+# Install TensorFlow, Keras, PyTorch and MXNet; --no-cache-dir keeps pip's wheel cache out of the layers
+RUN pip install --no-cache-dir future typing
+RUN pip install --no-cache-dir numpy \
+        tensorflow==${TENSORFLOW_VERSION} \
+        keras \
+        h5py
+RUN pip install --no-cache-dir torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION}
+RUN pip install --no-cache-dir mxnet==${MXNET_VERSION}
+
+# Build Open MPI 4.0.0 from source; the scratch directory is removed in the same layer
+RUN mkdir /tmp/openmpi && \
+    wget -P /tmp/openmpi \
+        https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
+    tar -xzf /tmp/openmpi/openmpi-4.0.0.tar.gz -C /tmp/openmpi && \
+    cd /tmp/openmpi/openmpi-4.0.0 && \
+    ./configure --enable-orterun-prefix-by-default && \
+    make -j "$(nproc)" all && \
+    make install && \
+    ldconfig && \
+    rm -rf /tmp/openmpi
+
+# Install Horovod with the TensorFlow, PyTorch and MXNet build flags set
+RUN HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 \
+    pip install --no-cache-dir horovod
+
+# Install OpenSSH for MPI to communicate between containers (index refreshed and removed in this layer)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends openssh-client openssh-server && \
+    mkdir -p /var/run/sshd && rm -rf /var/lib/apt/lists/*
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+# Get example python scripts; the trailing slash makes COPY create the /examples directory
+COPY tensorflow_mnist.py /examples/
+WORKDIR "/examples"
diff --git a/examples/hovorod-cpu/README.md b/examples/hovorod-cpu/README.md
new file mode 100644
index 0000000..10716b2
--- /dev/null
+++ b/examples/hovorod-cpu/README.md
@@ -0,0 +1,19 @@
+# Horovod CPU-Only Case
+
+This example shows how to run a CPU-only MPIJob.
+
+## How to Build Image
+
+This example Dockerfile is based on Horovod's CPU-only [Dockerfile](https://raw.githubusercontent.com/horovod/horovod/master/Dockerfile.cpu). Build the image as follows:
+
+```bash
+docker build -t horovod-cpu:latest -f Dockerfile.cpu .
+```
+
+## Create the MPIJob
+
+The example MPIJob runs the Horovod CPU-only example [tensorflow_mnist.py](https://raw.githubusercontent.com/horovod/horovod/master/examples/tensorflow_mnist.py).
+
+```bash
+kubectl create -f ./tensorflow-mnist.yaml
+```
diff --git a/examples/hovorod-cpu/tensorflow-mnist.yaml b/examples/hovorod-cpu/tensorflow-mnist.yaml
new file mode 100644
index 0000000..b69bc75
--- /dev/null
+++ b/examples/hovorod-cpu/tensorflow-mnist.yaml
@@ -0,0 +1,54 @@
+apiVersion: kubeflow.org/v1alpha2
+kind: MPIJob  # one launcher pod runs mpirun; two worker pods use the same image
+metadata:
+  name: tensorflow-mnist
+spec:
+  slotsPerWorker: 1  # 1 slot x 2 Worker replicas lines up with the "-np 2" passed to mpirun below
+  cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          - image: horovod-cpu:latest  # must match the tag given to `docker build -t` for Dockerfile.cpu
+            name: mpi-launcher
+            command:
+            - mpirun
+            args:
+            - -np
+            - "2"
+            - --allow-run-as-root
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -x
+            - LD_LIBRARY_PATH
+            - -x
+            - PATH
+            - -mca
+            - pml
+            - ob1
+            - -mca
+            - btl
+            - ^openib  # NOTE(review): presumably excludes the openib transport; confirm against the Open MPI docs
+            - python
+            - /examples/tensorflow_mnist.py  # location the Dockerfile copies the script to
+            resources:
+              limits:
+                cpu: 1
+                memory: 2Gi
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          containers:
+          - command:
+            - ""  # NOTE(review): empty-string command; confirm how the MPI operator treats it for workers
+            image: horovod-cpu:latest
+            name: mpi-worker
+            resources:
+              limits:
+                cpu: 2
+                memory: 4Gi
diff --git a/examples/hovorod-cpu/tensorflow_mnist.py b/examples/hovorod-cpu/tensorflow_mnist.py
new file mode 100644
index 0000000..e4e329a
--- /dev/null
+++ b/examples/hovorod-cpu/tensorflow_mnist.py
@@ -0,0 +1,172 @@
+# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import errno
+import tensorflow as tf
+import horovod.tensorflow as hvd
+import numpy as np
+import argparse
+
+from tensorflow import keras
+
+# Short alias for the tf.layers module used by conv_model().
+layers = tf.layers
+tf.logging.set_verbosity(tf.logging.INFO)
+
+# Command-line options; parsed at import time into the module-level `args`.
+parser = argparse.ArgumentParser(description='Tensorflow MNIST Example')
+parser.add_argument('--use-adasum', default=False, action='store_true',
+                    help='use adasum algorithm to do reduction')
+args = parser.parse_args()
+
+def conv_model(feature, target, mode):
+    """Build the 2-conv-layer MNIST graph; return (predicted class, loss)."""
+    # Labels become one-hot rows of length 10 (on-value 1, off-value 0).
+    onehot = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)
+
+    # Images become a 4-D tensor: batch, 28, 28, and one color channel.
+    images = tf.reshape(feature, [-1, 28, 28, 1])
+
+    # Both pooling steps share the same 2x2 window and stride.
+    window = [1, 2, 2, 1]
+    is_training = mode == tf.estimator.ModeKeys.TRAIN
+
+    # First conv layer: 32 features per 5x5 patch, then max pooling.
+    with tf.variable_scope('conv_layer1'):
+        conv1 = layers.conv2d(images, 32, kernel_size=[5, 5],
+                              activation=tf.nn.relu, padding="SAME")
+        pooled1 = tf.nn.max_pool(conv1, ksize=window, strides=window,
+                                 padding='SAME')
+
+    # Second conv layer: 64 features per 5x5 patch, then max pooling.
+    with tf.variable_scope('conv_layer2'):
+        conv2 = layers.conv2d(pooled1, 64, kernel_size=[5, 5],
+                              activation=tf.nn.relu, padding="SAME")
+        pooled2 = tf.nn.max_pool(conv2, ksize=window, strides=window,
+                                 padding='SAME')
+        # Flatten each 7x7x64 feature map into one vector per example.
+        flat = tf.reshape(pooled2, [-1, 7 * 7 * 64])
+
+    # Dense layer with 1024 units; dropout is enabled only in TRAIN mode.
+    hidden = layers.dense(flat, 1024, activation=tf.nn.relu)
+    hidden = layers.dropout(hidden, rate=0.5, training=is_training)
+
+    # One logit per class, scored with softmax cross-entropy.
+    logits = layers.dense(hidden, 10, activation=None)
+    loss = tf.losses.softmax_cross_entropy(onehot, logits)
+    return tf.argmax(logits, 1), loss
+
+
+def train_input_generator(x_train, y_train, batch_size=64):
+    """Yield shuffled (x, y) batches forever; a trailing partial batch is skipped."""
+    assert len(x_train) == len(y_train)
+    while True:
+        order = np.random.permutation(len(x_train))
+        x_train, y_train = x_train[order], y_train[order]
+        start, stop = 0, batch_size
+        while stop <= len(x_train):
+            yield x_train[start:stop], y_train[start:stop]
+            start, stop = stop, stop + batch_size
+
+
+def main(_):
+    """Train the MNIST model with Horovod; only rank 0 writes checkpoints."""
+    # Horovod: initialize Horovod.
+    hvd.init()
+
+    # Keras automatically creates a cache directory in ~/.keras/datasets for
+    # storing the downloaded MNIST data. This creates a race
+    # condition among the workers that share the same filesystem. If the
+    # directory already exists by the time this worker gets around to creating
+    # it, ignore the resulting exception and continue.
+    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
+    if not os.path.exists(cache_dir):
+        try:
+            # makedirs also creates ~/.keras when it is missing; mkdir would fail.
+            os.makedirs(cache_dir)
+        except OSError as e:
+            if e.errno != errno.EEXIST or not os.path.isdir(cache_dir):
+                raise
+
+    # Download and load MNIST dataset.
+    (x_train, y_train), (x_test, y_test) = \
+        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())
+
+    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
+    # into (-1, 784) to feed into our network. Also, need to normalize the
+    # features between 0 and 1.
+    x_train = np.reshape(x_train, (-1, 784)) / 255.0
+    x_test = np.reshape(x_test, (-1, 784)) / 255.0
+
+    # Build model...
+    with tf.name_scope('input'):
+        image = tf.placeholder(tf.float32, [None, 784], name='image')
+        label = tf.placeholder(tf.float32, [None], name='label')
+    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)
+
+    lr_scaler = hvd.size()
+    # By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL,
+    # scale lr by local_size
+    if args.use_adasum:
+        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1
+
+    # Horovod: adjust learning rate based on lr_scaler.
+    opt = tf.train.AdamOptimizer(0.001 * lr_scaler)
+
+    # Horovod: add Horovod Distributed Optimizer.
+    opt = hvd.DistributedOptimizer(opt, op=hvd.Adasum if args.use_adasum else hvd.Average)
+
+    global_step = tf.train.get_or_create_global_step()
+    train_op = opt.minimize(loss, global_step=global_step)
+
+    hooks = [
+        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
+        # from rank 0 to all other processes. This is necessary to ensure consistent
+        # initialization of all workers when training is started with random weights
+        # or restored from a checkpoint.
+        hvd.BroadcastGlobalVariablesHook(0),
+
+        # Horovod: divide the 20000-step budget by the number of processes, hvd.size().
+        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
+
+        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
+                                   every_n_iter=10),
+    ]
+
+    # Horovod: pin one GPU per local rank. NOTE(review): presumably inert on CPU-only builds.
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+    # Horovod: save checkpoints only on worker 0 to prevent other workers from
+    # corrupting them.
+    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
+    training_batch_generator = train_input_generator(x_train,
+                                                     y_train, batch_size=100)
+    # The MonitoredTrainingSession takes care of session initialization,
+    # restoring from a checkpoint, saving to a checkpoint, and closing when done
+    # or an error occurs.
+    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
+                                           hooks=hooks,
+                                           config=config) as mon_sess:
+        while not mon_sess.should_stop():
+            # Run a training step synchronously.
+            image_, label_ = next(training_batch_generator)
+            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
+
+
+if __name__ == '__main__':
+    tf.app.run(main=main)  # pass main() to TensorFlow's app runner explicitly