Updated object detection training example (#228)

* Updated Dockerfile.traning to use latest tensorflow and tensorflow object detetion api. * Updated tf-training-job component and added a chief replica spec * Corrected some typos and updated some instructions
2018-08-20 19:32:12 -07:00 · 2018-08-20 19:32:12 -07:00 · e6b6730650
parent f9873e6ac4
commit e6b6730650
8 changed files with 117 additions and 52 deletions
--- a/object_detection/conf/faster_rcnn_resnet101_pets.config
+++ b/object_detection/conf/faster_rcnn_resnet101_pets.config
@ -88,10 +88,6 @@ train_config: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.0003
          schedule {
            step: 0
            learning_rate: .0003
          }
          schedule {
            step: 900000
            learning_rate: .00003
@ -109,6 +105,7 @@ train_config: {
  gradient_clipping_by_norm: 10.0
  fine_tune_checkpoint: "/pets_data/faster_rcnn_resnet101_coco_2018_01_28/model.ckpt"
  from_detection_checkpoint: true
  # load_all_detection_checkpoint_vars: true
  # Note: The below line limits the training process to 200K steps, which we
  # empirically found to be sufficient enough to train the pets dataset. This
  # effectively bypasses the learning rate schedule (the learning rate will
@ -122,21 +119,19 @@ train_config: {
 train_input_reader: {
  tf_record_input_reader {
-    input_path: "/pets_data/pet_train_with_masks.record"
+    input_path: "/pets_data/pet_faces_train.record-?????-of-00010"
  }
  label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
 }
 eval_config: {
-  num_examples: 2000
+  metrics_set: "coco_detection_metrics"
-  # Note: The below line limits the evaluation process to 10 evaluations.
+  num_examples: 1101
  # Remove the below line to evaluate indefinitely.
  max_evals: 10
 }
 eval_input_reader: {
  tf_record_input_reader {
-    input_path: "/pets_data/pet_val_with_masks.record"
+    input_path: "/pets_data/pet_faces_val.record-?????-of-00010"
  }
  label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
  shuffle: false
--- a/object_detection/docker/Dockerfile.training
+++ b/object_detection/docker/Dockerfile.training
@ -1,3 +1,17 @@
 # Copyright 2018 Intel Corporation.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 FROM ubuntu:16.04
 LABEL maintainer="Soila Kavulya <soila.p.kavulya@intel.com>"
@ -10,7 +24,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
        libpng12-dev \
        libzmq3-dev \
        pkg-config \
        protobuf-compiler \
        python \
        python-dev \
        python-pil \
@ -20,6 +33,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        software-properties-common \
        unzip \
        wget \
        && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@ -28,32 +42,31 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py
 RUN pip --no-cache-dir install \
        tensorflow
 RUN pip --no-cache-dir install \
        Cython \
-        glob2 \
+        contextlib2 \
        h5py \
        ipykernel \
        jupyter \
-        matplotlib \
+        matplotlib \
        numpy \
        pandas \
        scipy \
        sklearn \
        six \
        tensorflow \
        tensorflow-serving-api \
        && \
    python -m ipykernel.kernelspec
 # Setup Universal Object Detection
 ENV MODELS_HOME "/models"
-RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME && cd $MODELS_HOME && git checkout r1.5
+RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME
-#COPY models $MODELS_HOME
+
-RUN cd $MODELS_HOME/research \
+RUN cd $MODELS_HOME/research && \
-        && protoc object_detection/protos/*.proto --python_out=.
+    wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip && \
    unzip protobuf.zip && \
    ./bin/protoc object_detection/protos/*.proto --python_out=.
 RUN git clone https://github.com/cocodataset/cocoapi.git && \
    cd cocoapi/PythonAPI && \
    make && \
    cp -r pycocotools $MODELS_HOME/research
 ENV PYTHONPATH "$MODELS_HOME/research:$MODELS_HOME/research/slim:$PYTHONPATH"
-COPY scripts /scripts
+
 # TensorBoard
 EXPOSE 6006
@ -63,4 +76,4 @@ WORKDIR $MODELS_HOME
 ARG pipeline_config_path
 ARG train_dir
-CMD ["python", "$MODELS_HOME/research/object_detection/train.py", "--logtostderr", "--pipeline_config_path=$pipeline_config_path"  "--train_dir=$train_dir"]
+# Exec-form CMD performs no variable expansion, so run through a shell and
+# `exec` python so it replaces the shell as PID 1. ARG values are build-time
+# only: supply pipeline_config_path and train_dir as environment variables
+# at run time (e.g. `docker run -e pipeline_config_path=... -e train_dir=...`).
+CMD ["/bin/sh", "-c", "exec python \"$MODELS_HOME/research/object_detection/model_main.py\" --pipeline_config_path=\"$pipeline_config_path\" --model_dir=\"$train_dir\""]
--- a/object_detection/export_tf_graph.md
+++ b/object_detection/export_tf_graph.md
@ -6,10 +6,25 @@ Before exporting the graph we first need to identify a checkpoint candidate in t
 To see what's being saved in `${MOUNT_PATH}/train` while the training job is running you can use:
 ```  
-kubectl -n kubeflow exec -it pets-training-master-r1hv-0-i6k7c sh  
+kubectl -n kubeflow exec tf-training-job-chief-0 -- ls ${MOUNT_PATH}/train
 ```  
 This will list the contents of the train directory. The output should look something like this:
 ```
 checkpoint
 events.out.tfevents.1534465587.tf-training-job-chief-0
 events.out.tfevents.1534525812.tf-training-job-chief-0
 graph.pbtxt
 model.ckpt-0.data-00000-of-00001
 model.ckpt-0.index
 model.ckpt-0.meta
 model.ckpt-167.data-00000-of-00001
 model.ckpt-167.index
 model.ckpt-167.meta
 model.ckpt-334.data-00000-of-00001
 model.ckpt-334.index
 model.ckpt-334.meta
 pipeline.config
 ```
 This will open an interactive shell to your container and now you can execute `ls ${MOUNT_PATH}/train` and look for a
 checkpoint candidate.  
 Once you have identified the checkpoint next step is to configure the checkpoint in the `export-tf-graph-job` component and apply it.
--- a/object_detection/ks-app/components/export-tf-graph-job.jsonnet
+++ b/object_detection/ks-app/components/export-tf-graph-job.jsonnet
@ -26,7 +26,7 @@ apiVersion: "batch/v1",
          name: "export-graph",
          image: params.image,
          imagePullPolicy: "IfNotPresent",
-          command: ['python', 'models/research/object_detection/export_inference_graph.py'],
+          command: ['python', '/models/research/object_detection/export_inference_graph.py'],
          args: ['--input_type=' + params.inputType,
                 '--pipeline_config_path=' + params.pipelineConfigPath,
                 '--trained_checkpoint_prefix=' + params.trainedCheckpoint,
--- a/object_detection/ks-app/components/params.libsonnet
+++ b/object_detection/ks-app/components/params.libsonnet
@ -26,7 +26,7 @@
      pvc: 'pets-pvc',
    },
    "create-pet-record-job": {
-      dataDirPath: '/pets_data/images',
+      dataDirPath: '/pets_data',
      image: 'lcastell/pets_object_detection',
      mountPath: '/pets_data',
      name: 'create-pet-record-job',
--- a/object_detection/ks-app/components/tf-training-job.jsonnet
+++ b/object_detection/ks-app/components/tf-training-job.jsonnet
@ -21,14 +21,55 @@ local tfJobCpu = {
                workingDir: "/models",
                command: [
                  "python",
-                  "research/object_detection/train.py",
+                  "research/object_detection/model_main.py",
                ],
                args:[
-                  "--logstostderr",
+                  "--alsologtostderr",
                  "--pipeline_config_path=" + params.pipelineConfigPath,
-                  "--train_dir=" + params.trainDir,
+                  "--model_dir=" + params.trainDir,
                ],
                image: params.image,
                imagePullPolicy: "Always",
                name: "tensorflow",
                [if params.numGpu > 0 then "resources"] : {
                  limits:{
                    "nvidia.com/gpu": params.numGpu,
                  },
                },
                volumeMounts: [{
                  mountPath: params.mountPath,
                  name: "pets-data",
                },],
              },
            ],
            volumes: [{
                name: "pets-data",
                persistentVolumeClaim: {
                  claimName: params.pvc,
                },
            },],
            restartPolicy: "OnFailure",
          },
        },
      },
      Chief: {
        replicas: 1,
        template: {
          spec: {
            containers: [
              {
                workingDir: "/models",
                command: [
                  "python",
                  "research/object_detection/model_main.py",
                ],
                args:[
                  "--alsologtostderr",
                  "--pipeline_config_path=" + params.pipelineConfigPath,
                  "--model_dir=" + params.trainDir,
                ],
                image: params.image,
                imagePullPolicy: "Always",
                name: "tensorflow",
                [if params.numGpu > 0 then "resources"] : {
                  limits:{
@ -60,14 +101,15 @@ local tfJobCpu = {
                workingDir: "/models",
                command: [
                  "python",
-                  "research/object_detection/train.py",
+                  "research/object_detection/model_main.py",
                ],
                args:[
-                  "--logstostderr",
+                  "--alsologtostderr",
                  "--pipeline_config_path=" + params.pipelineConfigPath,
-                  "--train_dir=" + params.trainDir,
+                  "--model_dir=" + params.trainDir,
                ],
                image: params.image,
                imagePullPolicy: "Always",
                name: "tensorflow",
                [if params.numGpu > 0 then "resources"] : {
                  limits:{
--- a/object_detection/monitor_job.md
+++ b/object_detection/monitor_job.md
@ -8,14 +8,14 @@ kubectl -n kubeflow describe tfjobs tf-training-job
 ### View logs of individual pods
 ```
 kubectl -n kubeflow get pods
-kubectl -n kubeflow logs <name_of_master_pod>
+kubectl -n kubeflow logs <name_of_chief_pod>
 ```
 **NOTE:** When the job finishes, the pods will be automatically terminated. To see, run the `get pods` command with the `-a` flag:
 ```
 kubectl -n kubeflow get pods -a
 ```
-While the job is running, you should see something like this in your master pod logs:
+While the job is running, you should see something like this in your chief pod logs:
 ```
 INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
 INFO:tensorflow:Recording summary at step 819.
@ -28,7 +28,7 @@ INFO:tensorflow:global step 834: loss = 0.2307 (16.493 sec/step)
 INFO:tensorflow:Recording summary at step 839
 ```
-When the job finishes, you should see something like this in your completed/terminated master pod logs:
+When the job finishes, you should see something like this in your completed/terminated chief pod logs:
 ```
 INFO:tensorflow:Starting Session.
 INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
--- a/object_detection/setup.md
+++ b/object_detection/setup.md
@ -20,7 +20,7 @@ ambassador-7987df44b9-4pht8       2/2       Running   0          1m
 ambassador-7987df44b9-dh5h6               2/2       Running   0          1m
 ambassador-7987df44b9-qrgsm               2/2       Running   0          1m
 tf-hub-0                                  1/1       Running   0          1m
-tf-job-operator-78757955b-qkg7s   1/1       Running   0          1m
+tf-job-operator-v1alpha2-b76bfbdb-lgbjw   1/1       Running   0          1m
 ```
 ## Overview
@ -47,7 +47,7 @@ cd ks-app
 ## Preparing the training data
 **Note:** TensorFlow works with many file systems like HDFS and S3, you can use
-them to push the dataset and other configurations there and skip the Download and Decompress steps in this turorial.
+them to push the dataset and other configurations there and skip the Download and Decompress steps in this tutorial.
 First let's create a PVC to store the data. This step assumes that your K8s cluster has [Dynamic Volume Provisioning](https://kubernetes.io/docs/concepts/storage/dynamic-provisioning/) enabled.
@ -131,7 +131,7 @@ we need to create the TF pet records. For that, we wil configure and apply the `
 ```
 OBJ_DETECTION_IMAGE="lcastell/pets_object_detection"
-DATA_DIR_PATH="${MOUNT_PATH}/images"
+DATA_DIR_PATH="${MOUNT_PATH}"
 OUTPUT_DIR_PATH="${MOUNT_PATH}"
 ks param set create-pet-record-job image ${OBJ_DETECTION_IMAGE}