mirror of https://github.com/kubeflow/examples.git
Updated object detection training example (#228)
* Updated Dockerfile.training to use latest tensorflow and tensorflow object detection api. * Updated tf-training-job component and added a chief replica spec * Corrected some typos and updated some instructions
This commit is contained in:
parent
f9873e6ac4
commit
e6b6730650
|
|
@ -88,10 +88,6 @@ train_config: {
|
|||
learning_rate: {
|
||||
manual_step_learning_rate {
|
||||
initial_learning_rate: 0.0003
|
||||
schedule {
|
||||
step: 0
|
||||
learning_rate: .0003
|
||||
}
|
||||
schedule {
|
||||
step: 900000
|
||||
learning_rate: .00003
|
||||
|
|
@ -109,6 +105,7 @@ train_config: {
|
|||
gradient_clipping_by_norm: 10.0
|
||||
fine_tune_checkpoint: "/pets_data/faster_rcnn_resnet101_coco_2018_01_28/model.ckpt"
|
||||
from_detection_checkpoint: true
|
||||
# load_all_detection_checkpoint_vars: true
|
||||
# Note: The below line limits the training process to 200K steps, which we
|
||||
# empirically found to be sufficient enough to train the pets dataset. This
|
||||
# effectively bypasses the learning rate schedule (the learning rate will
|
||||
|
|
@ -122,21 +119,19 @@ train_config: {
|
|||
|
||||
train_input_reader: {
|
||||
tf_record_input_reader {
|
||||
input_path: "/pets_data/pet_train_with_masks.record"
|
||||
input_path: "/pets_data/pet_faces_train.record-?????-of-00010"
|
||||
}
|
||||
label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
|
||||
}
|
||||
|
||||
eval_config: {
|
||||
num_examples: 2000
|
||||
# Note: The below line limits the evaluation process to 10 evaluations.
|
||||
# Remove the below line to evaluate indefinitely.
|
||||
max_evals: 10
|
||||
metrics_set: "coco_detection_metrics"
|
||||
num_examples: 1101
|
||||
}
|
||||
|
||||
eval_input_reader: {
|
||||
tf_record_input_reader {
|
||||
input_path: "/pets_data/pet_val_with_masks.record"
|
||||
input_path: "/pets_data/pet_faces_val.record-?????-of-00010"
|
||||
}
|
||||
label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt"
|
||||
shuffle: false
|
||||
|
|
|
|||
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright 2018 Intel Corporation.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM ubuntu:16.04
|
||||
|
||||
LABEL maintainer="Soila Kavulya <soila.p.kavulya@intel.com>"
|
||||
|
|
@ -10,7 +24,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
libpng12-dev \
|
||||
libzmq3-dev \
|
||||
pkg-config \
|
||||
protobuf-compiler \
|
||||
python \
|
||||
python-dev \
|
||||
python-pil \
|
||||
|
|
@ -20,6 +33,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
git \
|
||||
software-properties-common \
|
||||
unzip \
|
||||
wget \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
|
@ -28,32 +42,31 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
|||
python get-pip.py && \
|
||||
rm get-pip.py
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
tensorflow
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
Cython \
|
||||
glob2 \
|
||||
h5py \
|
||||
ipykernel \
|
||||
contextlib2 \
|
||||
jupyter \
|
||||
matplotlib \
|
||||
numpy \
|
||||
pandas \
|
||||
scipy \
|
||||
sklearn \
|
||||
six \
|
||||
tensorflow \
|
||||
tensorflow-serving-api \
|
||||
&& \
|
||||
python -m ipykernel.kernelspec
|
||||
matplotlib
|
||||
|
||||
# Setup Universal Object Detection
|
||||
ENV MODELS_HOME "/models"
|
||||
RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME && cd $MODELS_HOME && git checkout r1.5
|
||||
#COPY models $MODELS_HOME
|
||||
RUN cd $MODELS_HOME/research \
|
||||
&& protoc object_detection/protos/*.proto --python_out=.
|
||||
RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME
|
||||
|
||||
RUN cd $MODELS_HOME/research && \
|
||||
wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip && \
|
||||
unzip protobuf.zip && \
|
||||
./bin/protoc object_detection/protos/*.proto --python_out=.
|
||||
|
||||
RUN git clone https://github.com/cocodataset/cocoapi.git && \
|
||||
cd cocoapi/PythonAPI && \
|
||||
make && \
|
||||
cp -r pycocotools $MODELS_HOME/research
|
||||
|
||||
ENV PYTHONPATH "$MODELS_HOME/research:$MODELS_HOME/research/slim:$PYTHONPATH"
|
||||
COPY scripts /scripts
|
||||
|
||||
# TensorBoard
|
||||
EXPOSE 6006
|
||||
|
||||
|
|
@ -63,4 +76,4 @@ WORKDIR $MODELS_HOME
|
|||
ARG pipeline_config_path
|
||||
ARG train_dir
|
||||
|
||||
CMD ["python", "$MODELS_HOME/research/object_detection/train.py", "--logtostderr", "--pipeline_config_path=$pipeline_config_path" "--train_dir=$train_dir"]
|
||||
CMD ["python", "$MODELS_HOME/research/object_detection/model_main.py", "--pipeline_config_path=$pipeline_config_path" "--model_dir=$train_dir"]
|
||||
|
|
|
|||
|
|
@ -6,10 +6,25 @@ Before exporting the graph we first need to identify a checkpoint candidate in t
|
|||
|
||||
To see what's being saved in `${MOUNT_PATH}/train` while the training job is running you can use:
|
||||
```
|
||||
kubectl -n kubeflow exec -it pets-training-master-r1hv-0-i6k7c sh
|
||||
kubectl -n kubeflow exec tf-training-job-chief-0 -- ls ${MOUNT_PATH}/train
|
||||
```
|
||||
This will open an interactive shell to your container and now you can execute `ls ${MOUNT_PATH}/train` and look for a
|
||||
checkpoint candidate.
|
||||
This will list the contents of the train directory. The output should look something like this:
|
||||
```
|
||||
checkpoint
|
||||
events.out.tfevents.1534465587.tf-training-job-chief-0
|
||||
events.out.tfevents.1534525812.tf-training-job-chief-0
|
||||
graph.pbtxt
|
||||
model.ckpt-0.data-00000-of-00001
|
||||
model.ckpt-0.index
|
||||
model.ckpt-0.meta
|
||||
model.ckpt-167.data-00000-of-00001
|
||||
model.ckpt-167.index
|
||||
model.ckpt-167.meta
|
||||
model.ckpt-334.data-00000-of-00001
|
||||
model.ckpt-334.index
|
||||
model.ckpt-334.meta
|
||||
pipeline.config
|
||||
```
|
||||
|
||||
Once you have identified the checkpoint next step is to configure the checkpoint in the `export-tf-graph-job` component and apply it.
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ apiVersion: "batch/v1",
|
|||
name: "export-graph",
|
||||
image: params.image,
|
||||
imagePullPolicy: "IfNotPresent",
|
||||
command: ['python', 'models/research/object_detection/export_inference_graph.py'],
|
||||
command: ['python', '/models/research/object_detection/export_inference_graph.py'],
|
||||
args: ['--input_type=' + params.inputType,
|
||||
'--pipeline_config_path=' + params.pipelineConfigPath,
|
||||
'--trained_checkpoint_prefix=' + params.trainedCheckpoint,
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@
|
|||
pvc: 'pets-pvc',
|
||||
},
|
||||
"create-pet-record-job": {
|
||||
dataDirPath: '/pets_data/images',
|
||||
dataDirPath: '/pets_data',
|
||||
image: 'lcastell/pets_object_detection',
|
||||
mountPath: '/pets_data',
|
||||
name: 'create-pet-record-job',
|
||||
|
|
|
|||
|
|
@ -21,14 +21,55 @@ local tfJobCpu = {
|
|||
workingDir: "/models",
|
||||
command: [
|
||||
"python",
|
||||
"research/object_detection/train.py",
|
||||
"research/object_detection/model_main.py",
|
||||
],
|
||||
args:[
|
||||
"--logstostderr",
|
||||
"--alsologtostderr",
|
||||
"--pipeline_config_path=" + params.pipelineConfigPath,
|
||||
"--train_dir=" + params.trainDir,
|
||||
"--model_dir=" + params.trainDir,
|
||||
],
|
||||
image: params.image,
|
||||
imagePullPolicy: "Always",
|
||||
name: "tensorflow",
|
||||
[if params.numGpu > 0 then "resources"] : {
|
||||
limits:{
|
||||
"nvidia.com/gpu": params.numGpu,
|
||||
},
|
||||
},
|
||||
volumeMounts: [{
|
||||
mountPath: params.mountPath,
|
||||
name: "pets-data",
|
||||
},],
|
||||
},
|
||||
],
|
||||
volumes: [{
|
||||
name: "pets-data",
|
||||
persistentVolumeClaim: {
|
||||
claimName: params.pvc,
|
||||
},
|
||||
},],
|
||||
restartPolicy: "OnFailure",
|
||||
},
|
||||
},
|
||||
},
|
||||
Chief: {
|
||||
replicas: 1,
|
||||
template: {
|
||||
spec: {
|
||||
containers: [
|
||||
{
|
||||
workingDir: "/models",
|
||||
command: [
|
||||
"python",
|
||||
"research/object_detection/model_main.py",
|
||||
],
|
||||
args:[
|
||||
"--alsologtostderr",
|
||||
"--pipeline_config_path=" + params.pipelineConfigPath,
|
||||
"--model_dir=" + params.trainDir,
|
||||
],
|
||||
image: params.image,
|
||||
imagePullPolicy: "Always",
|
||||
name: "tensorflow",
|
||||
[if params.numGpu > 0 then "resources"] : {
|
||||
limits:{
|
||||
|
|
@ -60,14 +101,15 @@ local tfJobCpu = {
|
|||
workingDir: "/models",
|
||||
command: [
|
||||
"python",
|
||||
"research/object_detection/train.py",
|
||||
"research/object_detection/model_main.py",
|
||||
],
|
||||
args:[
|
||||
"--logstostderr",
|
||||
"--alsologtostderr",
|
||||
"--pipeline_config_path=" + params.pipelineConfigPath,
|
||||
"--train_dir=" + params.trainDir,
|
||||
"--model_dir=" + params.trainDir,
|
||||
],
|
||||
image: params.image,
|
||||
imagePullPolicy: "Always",
|
||||
name: "tensorflow",
|
||||
[if params.numGpu > 0 then "resources"] : {
|
||||
limits:{
|
||||
|
|
|
|||
|
|
@ -8,14 +8,14 @@ kubectl -n kubeflow describe tfjobs tf-training-job
|
|||
### View logs of individual pods
|
||||
```
|
||||
kubectl -n kubeflow get pods
|
||||
kubectl -n kubeflow logs <name_of_master_pod>
|
||||
kubectl -n kubeflow logs <name_of_chief_pod>
|
||||
```
|
||||
**NOTE:** When the job finishes, the pods will be automatically terminated. To see, run the `get pods` command with the `-a` flag:
|
||||
```
|
||||
kubectl -n kubeflow get pods -a
|
||||
```
|
||||
|
||||
While the job is running, you should see something like this in your master pod logs:
|
||||
While the job is running, you should see something like this in your chief pod logs:
|
||||
```
|
||||
INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
|
||||
INFO:tensorflow:Recording summary at step 819.
|
||||
|
|
@ -28,7 +28,7 @@ INFO:tensorflow:global step 834: loss = 0.2307 (16.493 sec/step)
|
|||
INFO:tensorflow:Recording summary at step 839
|
||||
```
|
||||
|
||||
When the job finishes, you should see something like this in your completed/terminated master pod logs:
|
||||
When the job finishes, you should see something like this in your completed/terminated chief pod logs:
|
||||
```
|
||||
INFO:tensorflow:Starting Session.
|
||||
INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt
|
||||
|
|
|
|||
|
|
@ -15,12 +15,12 @@ After completing the steps in the kubeflow getting started guide you will have t
|
|||
- The following pods in your kubernetes cluster in the `kubeflow` namespace:
|
||||
```
|
||||
kubectl -n kubeflow get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ambassador-7987df44b9-4pht8 2/2 Running 0 1m
|
||||
ambassador-7987df44b9-dh5h6 2/2 Running 0 1m
|
||||
ambassador-7987df44b9-qrgsm 2/2 Running 0 1m
|
||||
tf-hub-0 1/1 Running 0 1m
|
||||
tf-job-operator-78757955b-qkg7s 1/1 Running 0 1m
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
ambassador-7987df44b9-4pht8 2/2 Running 0 1m
|
||||
ambassador-7987df44b9-dh5h6 2/2 Running 0 1m
|
||||
ambassador-7987df44b9-qrgsm 2/2 Running 0 1m
|
||||
tf-hub-0 1/1 Running 0 1m
|
||||
tf-job-operator-v1alpha2-b76bfbdb-lgbjw 1/1 Running 0 1m
|
||||
```
|
||||
|
||||
## Overview
|
||||
|
|
@ -47,7 +47,7 @@ cd ks-app
|
|||
## Preparing the training data
|
||||
|
||||
**Note:** TensorFlow works with many file systems like HDFS and S3, you can use
|
||||
them to push the dataset and other configurations there and skip the Download and Decompress steps in this turorial.
|
||||
them to push the dataset and other configurations there and skip the Download and Decompress steps in this tutorial.
|
||||
|
||||
First let's create a PVC to store the data. This step assumes that your K8s cluster has [Dynamic Volume Provisioning](https://kubernetes.io/docs/concepts/storage/dynamic-provisioning/) enabled.
|
||||
|
||||
|
|
@ -131,7 +131,7 @@ we need to create the TF pet records. For that, we will configure and apply the `
|
|||
|
||||
```
|
||||
OBJ_DETECTION_IMAGE="lcastell/pets_object_detection"
|
||||
DATA_DIR_PATH="${MOUNT_PATH}/images"
|
||||
DATA_DIR_PATH="${MOUNT_PATH}"
|
||||
OUTPUT_DIR_PATH="${MOUNT_PATH}"
|
||||
|
||||
ks param set create-pet-record-job image ${OBJ_DETECTION_IMAGE}
|
||||
|
|
|
|||
Loading…
Reference in New Issue