Updated object detection training example (#228)

* Updated Dockerfile.training to use latest tensorflow
  and tensorflow object detection api.
* Updated tf-training-job component and added a chief
  replica spec
* Corrected some typos and updated some instructions
This commit is contained in:
Daniel Castellanos 2018-08-20 19:32:12 -07:00 committed by k8s-ci-robot
parent f9873e6ac4
commit e6b6730650
8 changed files with 117 additions and 52 deletions

View File

@ -88,10 +88,6 @@ train_config: {
learning_rate: { learning_rate: {
manual_step_learning_rate { manual_step_learning_rate {
initial_learning_rate: 0.0003 initial_learning_rate: 0.0003
schedule {
step: 0
learning_rate: .0003
}
schedule { schedule {
step: 900000 step: 900000
learning_rate: .00003 learning_rate: .00003
@ -109,6 +105,7 @@ train_config: {
gradient_clipping_by_norm: 10.0 gradient_clipping_by_norm: 10.0
fine_tune_checkpoint: "/pets_data/faster_rcnn_resnet101_coco_2018_01_28/model.ckpt" fine_tune_checkpoint: "/pets_data/faster_rcnn_resnet101_coco_2018_01_28/model.ckpt"
from_detection_checkpoint: true from_detection_checkpoint: true
# load_all_detection_checkpoint_vars: true
# Note: The below line limits the training process to 200K steps, which we # Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient enough to train the pets dataset. This # empirically found to be sufficient enough to train the pets dataset. This
# effectively bypasses the learning rate schedule (the learning rate will # effectively bypasses the learning rate schedule (the learning rate will
@ -122,21 +119,19 @@ train_config: {
train_input_reader: { train_input_reader: {
tf_record_input_reader { tf_record_input_reader {
input_path: "/pets_data/pet_train_with_masks.record" input_path: "/pets_data/pet_faces_train.record-?????-of-00010"
} }
label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt" label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
} }
eval_config: { eval_config: {
num_examples: 2000 metrics_set: "coco_detection_metrics"
# Note: The below line limits the evaluation process to 10 evaluations. num_examples: 1101
# Remove the below line to evaluate indefinitely.
max_evals: 10
} }
eval_input_reader: { eval_input_reader: {
tf_record_input_reader { tf_record_input_reader {
input_path: "/pets_data/pet_val_with_masks.record" input_path: "/pets_data/pet_faces_val.record-?????-of-00010"
} }
label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt" label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
shuffle: false shuffle: false

View File

@ -1,3 +1,17 @@
# Copyright 2018 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM ubuntu:16.04 FROM ubuntu:16.04
LABEL maintainer="Soila Kavulya <soila.p.kavulya@intel.com>" LABEL maintainer="Soila Kavulya <soila.p.kavulya@intel.com>"
@ -10,7 +24,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libpng12-dev \ libpng12-dev \
libzmq3-dev \ libzmq3-dev \
pkg-config \ pkg-config \
protobuf-compiler \
python \ python \
python-dev \ python-dev \
python-pil \ python-pil \
@ -20,6 +33,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
git \ git \
software-properties-common \ software-properties-common \
unzip \ unzip \
wget \
&& \ && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
@ -28,32 +42,31 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \ python get-pip.py && \
rm get-pip.py rm get-pip.py
RUN pip --no-cache-dir install \
tensorflow
RUN pip --no-cache-dir install \ RUN pip --no-cache-dir install \
Cython \ Cython \
glob2 \ contextlib2 \
h5py \
ipykernel \
jupyter \ jupyter \
matplotlib \ matplotlib
numpy \
pandas \
scipy \
sklearn \
six \
tensorflow \
tensorflow-serving-api \
&& \
python -m ipykernel.kernelspec
# Setup Universal Object Detection # Setup Universal Object Detection
ENV MODELS_HOME "/models" ENV MODELS_HOME "/models"
RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME && cd $MODELS_HOME && git checkout r1.5 RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME
#COPY models $MODELS_HOME
RUN cd $MODELS_HOME/research \ RUN cd $MODELS_HOME/research && \
&& protoc object_detection/protos/*.proto --python_out=. wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip && \
unzip protobuf.zip && \
./bin/protoc object_detection/protos/*.proto --python_out=.
RUN git clone https://github.com/cocodataset/cocoapi.git && \
cd cocoapi/PythonAPI && \
make && \
cp -r pycocotools $MODELS_HOME/research
ENV PYTHONPATH "$MODELS_HOME/research:$MODELS_HOME/research/slim:$PYTHONPATH" ENV PYTHONPATH "$MODELS_HOME/research:$MODELS_HOME/research/slim:$PYTHONPATH"
COPY scripts /scripts
# TensorBoard # TensorBoard
EXPOSE 6006 EXPOSE 6006
@ -63,4 +76,4 @@ WORKDIR $MODELS_HOME
ARG pipeline_config_path ARG pipeline_config_path
ARG train_dir ARG train_dir
CMD ["python", "$MODELS_HOME/research/object_detection/train.py", "--logtostderr", "--pipeline_config_path=$pipeline_config_path" "--train_dir=$train_dir"] CMD ["python", "$MODELS_HOME/research/object_detection/model_main.py", "--pipeline_config_path=$pipeline_config_path" "--model_dir=$train_dir"]

View File

@ -6,10 +6,25 @@ Before exporting the graph we first need to identify a checkpoint candidate in t
To see what's being saved in `${MOUNT_PATH}/train` while the training job is running you can use: To see what's being saved in `${MOUNT_PATH}/train` while the training job is running you can use:
``` ```
kubectl -n kubeflow exec -it pets-training-master-r1hv-0-i6k7c sh kubectl -n kubeflow exec tf-training-job-chief-0 -- ls ${MOUNT_PATH}/train
```
This will list the contents of the train directory. The output should look something like this:
```
checkpoint
events.out.tfevents.1534465587.tf-training-job-chief-0
events.out.tfevents.1534525812.tf-training-job-chief-0
graph.pbtxt
model.ckpt-0.data-00000-of-00001
model.ckpt-0.index
model.ckpt-0.meta
model.ckpt-167.data-00000-of-00001
model.ckpt-167.index
model.ckpt-167.meta
model.ckpt-334.data-00000-of-00001
model.ckpt-334.index
model.ckpt-334.meta
pipeline.config
``` ```
This will open an interactive shell to your container and now you can execute `ls ${MOUNT_PATH}/train` and look for a
checkpoint candidate.
Once you have identified the checkpoint next step is to configure the checkpoint in the `export-tf-graph-job` component and apply it. Once you have identified the checkpoint next step is to configure the checkpoint in the `export-tf-graph-job` component and apply it.

View File

@ -26,7 +26,7 @@ apiVersion: "batch/v1",
name: "export-graph", name: "export-graph",
image: params.image, image: params.image,
imagePullPolicy: "IfNotPresent", imagePullPolicy: "IfNotPresent",
command: ['python', 'models/research/object_detection/export_inference_graph.py'], command: ['python', '/models/research/object_detection/export_inference_graph.py'],
args: ['--input_type=' + params.inputType, args: ['--input_type=' + params.inputType,
'--pipeline_config_path=' + params.pipelineConfigPath, '--pipeline_config_path=' + params.pipelineConfigPath,
'--trained_checkpoint_prefix=' + params.trainedCheckpoint, '--trained_checkpoint_prefix=' + params.trainedCheckpoint,

View File

@ -26,7 +26,7 @@
pvc: 'pets-pvc', pvc: 'pets-pvc',
}, },
"create-pet-record-job": { "create-pet-record-job": {
dataDirPath: '/pets_data/images', dataDirPath: '/pets_data',
image: 'lcastell/pets_object_detection', image: 'lcastell/pets_object_detection',
mountPath: '/pets_data', mountPath: '/pets_data',
name: 'create-pet-record-job', name: 'create-pet-record-job',

View File

@ -21,14 +21,55 @@ local tfJobCpu = {
workingDir: "/models", workingDir: "/models",
command: [ command: [
"python", "python",
"research/object_detection/train.py", "research/object_detection/model_main.py",
], ],
args:[ args:[
"--logstostderr", "--alsologtostderr",
"--pipeline_config_path=" + params.pipelineConfigPath, "--pipeline_config_path=" + params.pipelineConfigPath,
"--train_dir=" + params.trainDir, "--model_dir=" + params.trainDir,
], ],
image: params.image, image: params.image,
imagePullPolicy: "Always",
name: "tensorflow",
[if params.numGpu > 0 then "resources"] : {
limits:{
"nvidia.com/gpu": params.numGpu,
},
},
volumeMounts: [{
mountPath: params.mountPath,
name: "pets-data",
},],
},
],
volumes: [{
name: "pets-data",
persistentVolumeClaim: {
claimName: params.pvc,
},
},],
restartPolicy: "OnFailure",
},
},
},
Chief: {
replicas: 1,
template: {
spec: {
containers: [
{
workingDir: "/models",
command: [
"python",
"research/object_detection/model_main.py",
],
args:[
"--alsologtostderr",
"--pipeline_config_path=" + params.pipelineConfigPath,
"--model_dir=" + params.trainDir,
],
image: params.image,
imagePullPolicy: "Always",
name: "tensorflow", name: "tensorflow",
[if params.numGpu > 0 then "resources"] : { [if params.numGpu > 0 then "resources"] : {
limits:{ limits:{
@ -60,14 +101,15 @@ local tfJobCpu = {
workingDir: "/models", workingDir: "/models",
command: [ command: [
"python", "python",
"research/object_detection/train.py", "research/object_detection/model_main.py",
], ],
args:[ args:[
"--logstostderr", "--alsologtostderr",
"--pipeline_config_path=" + params.pipelineConfigPath, "--pipeline_config_path=" + params.pipelineConfigPath,
"--train_dir=" + params.trainDir, "--model_dir=" + params.trainDir,
], ],
image: params.image, image: params.image,
imagePullPolicy: "Always",
name: "tensorflow", name: "tensorflow",
[if params.numGpu > 0 then "resources"] : { [if params.numGpu > 0 then "resources"] : {
limits:{ limits:{

View File

@ -8,14 +8,14 @@ kubectl -n kubeflow describe tfjobs tf-training-job
### View logs of individual pods ### View logs of individual pods
``` ```
kubectl -n kubeflow get pods kubectl -n kubeflow get pods
kubectl -n kubeflow logs <name_of_master_pod> kubectl -n kubeflow logs <name_of_chief_pod>
``` ```
**NOTE:** When the job finishes, the pods will be automatically terminated. To see, run the `get pods` command with the `-a` flag: **NOTE:** When the job finishes, the pods will be automatically terminated. To see, run the `get pods` command with the `-a` flag:
``` ```
kubectl -n kubeflow get pods -a kubectl -n kubeflow get pods -a
``` ```
While the job is running, you should see something like this in your master pod logs: While the job is running, you should see something like this in your chief pod logs:
``` ```
INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
INFO:tensorflow:Recording summary at step 819. INFO:tensorflow:Recording summary at step 819.
@ -28,7 +28,7 @@ INFO:tensorflow:global step 834: loss = 0.2307 (16.493 sec/step)
INFO:tensorflow:Recording summary at step 839 INFO:tensorflow:Recording summary at step 839
``` ```
When the job finishes, you should see something like this in your completed/terminated master pod logs: When the job finishes, you should see something like this in your completed/terminated chief pod logs:
``` ```
INFO:tensorflow:Starting Session. INFO:tensorflow:Starting Session.
INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt

View File

@ -20,7 +20,7 @@ ambassador-7987df44b9-4pht8 2/2 Running 0 1m
ambassador-7987df44b9-dh5h6 2/2 Running 0 1m ambassador-7987df44b9-dh5h6 2/2 Running 0 1m
ambassador-7987df44b9-qrgsm 2/2 Running 0 1m ambassador-7987df44b9-qrgsm 2/2 Running 0 1m
tf-hub-0 1/1 Running 0 1m tf-hub-0 1/1 Running 0 1m
tf-job-operator-78757955b-qkg7s 1/1 Running 0 1m tf-job-operator-v1alpha2-b76bfbdb-lgbjw 1/1 Running 0 1m
``` ```
## Overview ## Overview
@ -47,7 +47,7 @@ cd ks-app
## Preparing the training data ## Preparing the training data
**Note:** TensorFlow works with many file systems like HDFS and S3, you can use **Note:** TensorFlow works with many file systems like HDFS and S3, you can use
them to push the dataset and other configurations there and skip the Download and Decompress steps in this turorial. them to push the dataset and other configurations there and skip the Download and Decompress steps in this tutorial.
First let's create a PVC to store the data. This step assumes that your K8s cluster has [Dynamic Volume Provisioning](https://kubernetes.io/docs/concepts/storage/dynamic-provisioning/) enabled. First let's create a PVC to store the data. This step assumes that your K8s cluster has [Dynamic Volume Provisioning](https://kubernetes.io/docs/concepts/storage/dynamic-provisioning/) enabled.
@ -131,7 +131,7 @@ we need to create the TF pet records. For that, we wil configure and apply the `
``` ```
OBJ_DETECTION_IMAGE="lcastell/pets_object_detection" OBJ_DETECTION_IMAGE="lcastell/pets_object_detection"
DATA_DIR_PATH="${MOUNT_PATH}/images" DATA_DIR_PATH="${MOUNT_PATH}"
OUTPUT_DIR_PATH="${MOUNT_PATH}" OUTPUT_DIR_PATH="${MOUNT_PATH}"
ks param set create-pet-record-job image ${OBJ_DETECTION_IMAGE} ks param set create-pet-record-job image ${OBJ_DETECTION_IMAGE}