mirror of https://github.com/kubeflow/examples.git
Updated object detection training example (#228)
* Updated Dockerfile.training to use the latest TensorFlow and TensorFlow Object Detection API.
* Updated the tf-training-job component and added a Chief replica spec.
* Corrected some typos and updated some instructions.
This commit is contained in:
parent f9873e6ac4
commit e6b6730650

@@ -88,10 +88,6 @@ train_config: {
     learning_rate: {
       manual_step_learning_rate {
         initial_learning_rate: 0.0003
-        schedule {
-          step: 0
-          learning_rate: .0003
-        }
         schedule {
           step: 900000
           learning_rate: .00003
@@ -109,6 +105,7 @@ train_config: {
   gradient_clipping_by_norm: 10.0
   fine_tune_checkpoint: "/pets_data/faster_rcnn_resnet101_coco_2018_01_28/model.ckpt"
   from_detection_checkpoint: true
+  # load_all_detection_checkpoint_vars: true
   # Note: The below line limits the training process to 200K steps, which we
   # empirically found to be sufficient enough to train the pets dataset. This
   # effectively bypasses the learning rate schedule (the learning rate will
@@ -122,21 +119,19 @@ train_config: {

 train_input_reader: {
   tf_record_input_reader {
-    input_path: "/pets_data/pet_train_with_masks.record"
+    input_path: "/pets_data/pet_faces_train.record-?????-of-00010"
   }
   label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
 }

 eval_config: {
-  num_examples: 2000
-  # Note: The below line limits the evaluation process to 10 evaluations.
-  # Remove the below line to evaluate indefinitely.
-  max_evals: 10
+  metrics_set: "coco_detection_metrics"
+  num_examples: 1101
 }

 eval_input_reader: {
   tf_record_input_reader {
-    input_path: "/pets_data/pet_val_with_masks.record"
+    input_path: "/pets_data/pet_faces_val.record-?????-of-00010"
   }
   label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
   shuffle: false
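
The new `input_path` values in the pipeline config above expect the sharded pet-faces records produced by the Object Detection API's dataset tooling. A minimal sketch of regenerating them inside the training container, assuming the `dataset_tools/create_pet_tf_record.py` script from the tensorflow/models checkout and the raw images/annotations already unpacked under `/pets_data`:
```
# Sketch only: regenerate the sharded TFRecords referenced by the config above.
# Assumes the tensorflow/models checkout at /models and the Oxford-IIIT pet
# images/annotations already extracted under /pets_data.
cd /models/research
python object_detection/dataset_tools/create_pet_tf_record.py \
    --label_map_path=object_detection/data/pet_label_map.pbtxt \
    --data_dir=/pets_data \
    --output_dir=/pets_data
```
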
@@ -1,3 +1,17 @@
+# Copyright 2018 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 FROM ubuntu:16.04

 LABEL maintainer="Soila Kavulya <soila.p.kavulya@intel.com>"
@@ -10,7 +24,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        protobuf-compiler \
         python \
         python-dev \
         python-pil \
@@ -20,6 +33,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git \
         software-properties-common \
         unzip \
+        wget \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -28,32 +42,31 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py

+RUN pip --no-cache-dir install \
+    tensorflow
+
 RUN pip --no-cache-dir install \
     Cython \
-    glob2 \
-    h5py \
-    ipykernel \
+    contextlib2 \
     jupyter \
-    matplotlib \
-    numpy \
-    pandas \
-    scipy \
-    sklearn \
-    six \
-    tensorflow \
-    tensorflow-serving-api \
-    && \
-    python -m ipykernel.kernelspec
+    matplotlib

 # Setup Universal Object Detection
 ENV MODELS_HOME "/models"
-RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME && cd $MODELS_HOME && git checkout r1.5
-#COPY models $MODELS_HOME
-RUN cd $MODELS_HOME/research \
-    && protoc object_detection/protos/*.proto --python_out=.
+RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME
+RUN cd $MODELS_HOME/research && \
+    wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip && \
+    unzip protobuf.zip && \
+    ./bin/protoc object_detection/protos/*.proto --python_out=.
+
+RUN git clone https://github.com/cocodataset/cocoapi.git && \
+    cd cocoapi/PythonAPI && \
+    make && \
+    cp -r pycocotools $MODELS_HOME/research
+
 ENV PYTHONPATH "$MODELS_HOME/research:$MODELS_HOME/research/slim:$PYTHONPATH"
-COPY scripts /scripts
 # TensorBoard
 EXPOSE 6006


@@ -63,4 +76,4 @@ WORKDIR $MODELS_HOME
 ARG pipeline_config_path
 ARG train_dir

-CMD ["python", "$MODELS_HOME/research/object_detection/train.py", "--logtostderr", "--pipeline_config_path=$pipeline_config_path" "--train_dir=$train_dir"]
+CMD ["python", "$MODELS_HOME/research/object_detection/model_main.py", "--pipeline_config_path=$pipeline_config_path" "--model_dir=$train_dir"]
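
To try the updated training image outside the cluster, a build-and-run sketch; the file name `Dockerfile.training`, the image tag, the config path, and the mounted host path are assumptions, while the build args mirror the `ARG` declarations above:
```
# Illustrative local build and run of the training image (names and paths are examples).
docker build -f Dockerfile.training -t pets_object_detection \
    --build-arg pipeline_config_path=/pets_data/faster_rcnn_resnet101_pets.config \
    --build-arg train_dir=/pets_data/train .
docker run --rm -v /path/to/pets_data:/pets_data -p 6006:6006 pets_object_detection
```
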
@@ -6,10 +6,25 @@ Before exporting the graph we first need to identify a checkpoint candidate in t

 To see what's being saved in `${MOUNT_PATH}/train` while the training job is running you can use:
 ```
-kubectl -n kubeflow exec -it pets-training-master-r1hv-0-i6k7c sh
+kubectl -n kubeflow exec tf-training-job-chief-0 -- ls ${MOUNT_PATH}/train
+```
+This will list the contents of the train directory. The output should something like this:
+```
+checkpoint
+events.out.tfevents.1534465587.tf-training-job-chief-0
+events.out.tfevents.1534525812.tf-training-job-chief-0
+graph.pbtxt
+model.ckpt-0.data-00000-of-00001
+model.ckpt-0.index
+model.ckpt-0.meta
+model.ckpt-167.data-00000-of-00001
+model.ckpt-167.index
+model.ckpt-167.meta
+model.ckpt-334.data-00000-of-00001
+model.ckpt-334.index
+model.ckpt-334.meta
+pipeline.config
 ```
-This will open an interactive shell to your container and now you can execute `ls ${MOUNT_PATH}/train` and look for a
-checkpoint candidate.

 Once you have identified the checkpoint next step is to configure the checkpoint in the `export-tf-graph-job` component and apply it.

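
A hypothetical follow-up once a checkpoint is chosen, assuming the `export-tf-graph-job` component exposes a `trainedCheckpoint` parameter (matching `params.trainedCheckpoint` in the component hunk below) and a ksonnet environment named `default`:
```
# Point the export job at the chosen checkpoint, then apply the component.
ks param set export-tf-graph-job trainedCheckpoint ${MOUNT_PATH}/train/model.ckpt-334
ks apply default -c export-tf-graph-job
```
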
@@ -26,7 +26,7 @@ apiVersion: "batch/v1",
             name: "export-graph",
             image: params.image,
             imagePullPolicy: "IfNotPresent",
-            command: ['python', 'models/research/object_detection/export_inference_graph.py'],
+            command: ['python', '/models/research/object_detection/export_inference_graph.py'],
             args: ['--input_type=' + params.inputType,
                    '--pipeline_config_path=' + params.pipelineConfigPath,
                    '--trained_checkpoint_prefix=' + params.trainedCheckpoint,
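
For orientation, the command this container assembles is roughly the following direct invocation; the concrete values are placeholders, and the `--output_directory` flag is assumed from the exporter's standard interface rather than shown in the hunk above:
```
# Roughly what the export-graph job runs (values are placeholders).
python /models/research/object_detection/export_inference_graph.py \
    --input_type=image_tensor \
    --pipeline_config_path=/pets_data/faster_rcnn_resnet101_pets.config \
    --trained_checkpoint_prefix=/pets_data/train/model.ckpt-334 \
    --output_directory=/pets_data/exported_graphs
```
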
@@ -26,7 +26,7 @@
     pvc: 'pets-pvc',
   },
   "create-pet-record-job": {
-    dataDirPath: '/pets_data/images',
+    dataDirPath: '/pets_data',
     image: 'lcastell/pets_object_detection',
     mountPath: '/pets_data',
     name: 'create-pet-record-job',
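
To confirm the updated default above in a working ksonnet app, the component's parameters can be listed; a sketch assuming the standard ksonnet CLI:
```
# Inspect the component's current parameters (dataDirPath should now be /pets_data).
ks param list create-pet-record-job
```
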
@@ -21,14 +21,55 @@ local tfJobCpu = {
             workingDir: "/models",
             command: [
               "python",
-              "research/object_detection/train.py",
+              "research/object_detection/model_main.py",
             ],
             args:[
-              "--logstostderr",
+              "--alsologtostderr",
               "--pipeline_config_path=" + params.pipelineConfigPath,
-              "--train_dir=" + params.trainDir,
+              "--model_dir=" + params.trainDir,
             ],
             image: params.image,
+            imagePullPolicy: "Always",
+            name: "tensorflow",
+            [if params.numGpu > 0 then "resources"] : {
+              limits:{
+                "nvidia.com/gpu": params.numGpu,
+              },
+            },
+            volumeMounts: [{
+              mountPath: params.mountPath,
+              name: "pets-data",
+            },],
+          },
+        ],
+        volumes: [{
+          name: "pets-data",
+          persistentVolumeClaim: {
+            claimName: params.pvc,
+          },
+        },],
+        restartPolicy: "OnFailure",
+      },
+    },
+  },
+  Chief: {
+    replicas: 1,
+    template: {
+      spec: {
+        containers: [
+          {
+            workingDir: "/models",
+            command: [
+              "python",
+              "research/object_detection/model_main.py",
+            ],
+            args:[
+              "--alsologtostderr",
+              "--pipeline_config_path=" + params.pipelineConfigPath,
+              "--model_dir=" + params.trainDir,
+            ],
+            image: params.image,
+            imagePullPolicy: "Always",
             name: "tensorflow",
             [if params.numGpu > 0 then "resources"] : {
               limits:{
@@ -60,14 +101,15 @@ local tfJobCpu = {
             workingDir: "/models",
             command: [
               "python",
-              "research/object_detection/train.py",
+              "research/object_detection/model_main.py",
             ],
             args:[
-              "--logstostderr",
+              "--alsologtostderr",
               "--pipeline_config_path=" + params.pipelineConfigPath,
-              "--train_dir=" + params.trainDir,
+              "--model_dir=" + params.trainDir,
             ],
             image: params.image,
+            imagePullPolicy: "Always",
             name: "tensorflow",
             [if params.numGpu > 0 then "resources"] : {
               limits:{
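
A deployment sketch for the updated component, assuming a `numGpu` parameter backs the `params.numGpu` guard above and a ksonnet environment named `default`:
```
# Optionally request GPUs for the training replicas, then create the TFJob.
ks param set tf-training-job numGpu 1
ks apply default -c tf-training-job
```
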
@@ -8,14 +8,14 @@ kubectl -n kubeflow describe tfjobs tf-training-job
 ### View logs of individual pods
 ```
 kubectl -n kubeflow get pods
-kubectl -n kubeflow logs <name_of_master_pod>
+kubectl -n kubeflow logs <name_of_chief_pod>
 ```
 **NOTE:** When the job finishes, the pods will be automatically terminated. To see, run the `get pods` command with the `-a` flag:
 ```
 kubectl -n kubeflow get pods -a
 ```

-While the job is running, you should see something like this in your master pod logs:
+While the job is running, you should see something like this in your chief pod logs:
 ```
 INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
 INFO:tensorflow:Recording summary at step 819.
@@ -28,7 +28,7 @@ INFO:tensorflow:global step 834: loss = 0.2307 (16.493 sec/step)
 INFO:tensorflow:Recording summary at step 839
 ```

-When the job finishes, you should see something like this in your completed/terminated master pod logs:
+When the job finishes, you should see something like this in your completed/terminated chief pod logs:
 ```
 INFO:tensorflow:Starting Session.
 INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
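
To follow the chief's output live rather than fetching a one-off snapshot (the pod name suffix is illustrative):
```
# Stream the chief pod's logs while training runs.
kubectl -n kubeflow logs -f tf-training-job-chief-0
```
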
@@ -20,7 +20,7 @@ ambassador-7987df44b9-4pht8 2/2 Running 0 1m
 ambassador-7987df44b9-dh5h6 2/2 Running 0 1m
 ambassador-7987df44b9-qrgsm 2/2 Running 0 1m
 tf-hub-0 1/1 Running 0 1m
-tf-job-operator-78757955b-qkg7s 1/1 Running 0 1m
+tf-job-operator-v1alpha2-b76bfbdb-lgbjw 1/1 Running 0 1m
 ```

 ## Overview
@@ -47,7 +47,7 @@ cd ks-app
 ## Preparing the training data

 **Note:** TensorFlow works with many file systems like HDFS and S3, you can use
-them to push the dataset and other configurations there and skip the Download and Decompress steps in this turorial.
+them to push the dataset and other configurations there and skip the Download and Decompress steps in this tutorial.

 First let's create a PVC to store the data. This step assumes that your K8s cluster has [Dynamic Volume Provisioning](https://kubernetes.io/docs/concepts/storage/dynamic-provisioning/) enabled.

@@ -131,7 +131,7 @@ we need to create the TF pet records. For that, we wil configure and apply the `

 ```
 OBJ_DETECTION_IMAGE="lcastell/pets_object_detection"
-DATA_DIR_PATH="${MOUNT_PATH}/images"
+DATA_DIR_PATH="${MOUNT_PATH}"
 OUTPUT_DIR_PATH="${MOUNT_PATH}"

 ks param set create-pet-record-job image ${OBJ_DETECTION_IMAGE}
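
The rest of the parameter wiring would look roughly like this; `outputDirPath` is an assumed parameter name mirroring the shell variable above, and `default` is the assumed ksonnet environment:
```
# Hypothetical continuation of the parameter setup shown above.
ks param set create-pet-record-job dataDirPath ${DATA_DIR_PATH}
ks param set create-pet-record-job outputDirPath ${OUTPUT_DIR_PATH}
ks apply default -c create-pet-record-job
```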