From b6a3c4c0ea1b11ea5598725d7df9b94c55ddded3 Mon Sep 17 00:00:00 2001 From: Daniel Castellanos Date: Tue, 3 Jul 2018 14:10:20 -0700 Subject: [PATCH] Added tutorial for object detection distributed training (#74) * Added tutorial for object detection distributed training Added steps on how to leverage kubeflow tooling to submit a distributed object detection training job in a small kubernetes cluster (minikube, 2-4 node cluster) * Added Jobs to prepare the training data and model * Updated instructions * fixed typos and added export tf graph job * Fixed paths in jobs and instructions * Enhanced instructions and re-arranged folder structure * Updated links to kubeflow user guide documentation --- CONTRIBUTING.md | 12 +- README.md | 11 +- object_detection/README.md | 11 ++ .../conf/faster_rcnn_resnet101_pets.config | 144 ++++++++++++++++++ object_detection/docker/Dockerfile | 66 ++++++++ object_detection/export_tf_graph.md | 62 ++++++++ object_detection/jobs/00create-pvc.yaml | 12 ++ object_detection/jobs/01get-dataset.yaml | 23 +++ object_detection/jobs/02get-annotations.yaml | 23 +++ object_detection/jobs/03get-model-job.yaml | 22 +++ .../jobs/04decompress-images.yaml | 23 +++ .../jobs/05decompress-annotations.yaml | 23 +++ object_detection/jobs/06decompress-model.yaml | 23 +++ .../jobs/07get-fasterrcnn-config.yaml | 22 +++ .../jobs/08create-pet-record.yaml | 24 +++ object_detection/jobs/export-tf-graph.yaml | 30 ++++ object_detection/jobs/pets-training.yaml | 77 ++++++++++ object_detection/monitor_job.md | 51 +++++++ object_detection/setup.md | 59 +++++++ object_detection/submit_job.md | 73 +++++++++ object_detection/tf-serving/tf-serving.yaml | 77 ++++++++++ 21 files changed, 861 insertions(+), 7 deletions(-) create mode 100644 object_detection/README.md create mode 100644 object_detection/conf/faster_rcnn_resnet101_pets.config create mode 100644 object_detection/docker/Dockerfile create mode 100644 object_detection/export_tf_graph.md create mode 100644 
object_detection/jobs/00create-pvc.yaml create mode 100644 object_detection/jobs/01get-dataset.yaml create mode 100644 object_detection/jobs/02get-annotations.yaml create mode 100644 object_detection/jobs/03get-model-job.yaml create mode 100644 object_detection/jobs/04decompress-images.yaml create mode 100644 object_detection/jobs/05decompress-annotations.yaml create mode 100644 object_detection/jobs/06decompress-model.yaml create mode 100644 object_detection/jobs/07get-fasterrcnn-config.yaml create mode 100644 object_detection/jobs/08create-pet-record.yaml create mode 100644 object_detection/jobs/export-tf-graph.yaml create mode 100644 object_detection/jobs/pets-training.yaml create mode 100644 object_detection/monitor_job.md create mode 100644 object_detection/setup.md create mode 100644 object_detection/submit_job.md create mode 100644 object_detection/tf-serving/tf-serving.yaml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7bbe43a7..37b48473 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -146,7 +146,7 @@ your github ID in the appropriate column, creating a corresponding GitHub issue, a PR. It is not an exhaustive list, only the result of brainstorming for inspiration. Feel free to add to this list and/or reprioritize. 
-Priority guidance: +Priority guidance: * **P0**: Very important, try to self-assign if there is a P0 available * **P1**: Important, try to self-assign if there is no P0 available @@ -160,8 +160,8 @@ Priority guidance: | [Zillow housing prediction](https://www.kaggle.com/c/zillow-prize-1/kernels) | Zillow's home value prediction on Kaggle | **P0** | High prize Kaggle competition w/ opportunity to show XGBoost | XGBoost | [puneith](https://github.com/puneith) | Google | [issue #16](https://github.com/kubeflow/examples/issues/16) | | [Mercari price suggestion challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge) | Automatically suggest product process to online sellers | **P0** | | | | | | | [Airbnb new user bookings](https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings) | Where will a new guest book their first travel experience | | | | | | | -| [TensorFlow object detection](https://github.com/tensorflow/models/tree/master/research/object_detection) | Object detection using TensorFlow API | | | | | | | -| [TFGAN](https://github.com/tensorflow/models/blob/master/research/gan/tutorial.ipynb) | Define, Train and Evaluate GAN | | GANs are of great interest currently | | | | | -| [Nested LSTM](https://github.com/hannw/nlstm) | TensorFlow implementation of nested LSTM cell | | LSTM are the canonical implementation of RNN to solve vanishing gradient problem and widely used for Time Series | | | | | -| [How to solve 90% of NLP problems: A step by step guide on Medium](https://blog.insightdatascience.com/how-to-solve-90-of-nlp-problems-a-step-by-step-guide-fda605278e4e) | Medium post on how to solve 90% of NLP problems from Emmanuel Ameisen | | Solves a really common problem in a generic way. 
Great example for people who want to do NLP and don't know how to do 80% of stuff like tokenzation, basic transforms, stop word removal etc and are boilerplate across every NLP task | | | | | -| [Tour of top-10 algorithms for ML newbies](https://towardsdatascience.com/a-tour-of-the-top-10-algorithms-for-machine-learning-newbies-dde4edffae11) | Top 10 algorithms for ML newbies | | Medium post with 8K claps and a guide for ML newbies to get started with ML | | | | | | +| [TensorFlow object detection](https://github.com/tensorflow/models/tree/master/research/object_detection) | Object detection using TensorFlow API | | | TensorFlow | [ldcastell](https://github.com/ldcastell) | Intel | [issue #73](https://github.com/kubeflow/examples/issues/73) | +| [TFGAN](https://github.com/tensorflow/models/blob/master/research/gan/tutorial.ipynb) | Define, Train and Evaluate GAN | GANs are of great interest currently | | | | | | +| [Nested LSTM](https://github.com/hannw/nlstm) | TensorFlow implementation of nested LSTM cell | LSTM are the canonical implementation of RNN to solve vanishing gradient problem and widely used for Time Series | | | | | | +| [How to solve 90% of NLP problems: A step by step guide on Medium](https://blog.insightdatascience.com/how-to-solve-90-of-nlp-problems-a-step-by-step-guide-fda605278e4e) | Medium post on how to solve 90% of NLP problems from Emmanuel Ameisen | Solves a really common problem in a generic way. 
Great example for people who want to do NLP and don't know how to do 80% of stuff like tokenzation, basic transforms, stop word removal etc and are boilerplate across every NLP task | | | | | | +| [Tour of top-10 algorithms for ML newbies](https://towardsdatascience.com/a-tour-of-the-top-10-algorithms-for-machine-learning-newbies-dde4edffae11) | Top 10 algorithms for ML newbies | Medium post with 8K claps and a guide for ML newbies to get started with ML | | | | | | diff --git a/README.md b/README.md index abcff94f..b846d05b 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,15 @@ This example covers the following concepts: 1. Monitoring with Argo UI and Tensorboard 1. Serving with Tensorflow +### [Distributed Object Detection](./object_detection) + +Author: [Daniel Castellanos](https://github.com/ldcastell) + +This example covers the following concepts: +1. Gathering and preparing the data for model training using K8s jobs +1. Using Kubeflow tf-job and tf-operator to launch a distributed object training job +1. Serving the model through Kubeflow's tf-serving + ## Component-focused 1. @@ -45,7 +54,7 @@ This example covers the following concepts: ## Third-party hosted -| Source | Example | Description | +| Source | Example | Description | | ------ | ------- | ----------- | | | | | | diff --git a/object_detection/README.md b/object_detection/README.md new file mode 100644 index 00000000..88783491 --- /dev/null +++ b/object_detection/README.md @@ -0,0 +1,11 @@ +# Distributed TensorFlow Object Detection Training on K8s with [Kubeflow](https://github.com/kubeflow/kubeflow) +This example demonstrates how to use `kubeflow` to train an object detection model on an existing K8s cluster using +the [TensorFlow object detection API](https://github.com/tensorflow/models/tree/master/research/object_detection) + +This example is based on the TensorFlow [Pets tutorial](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/running_pets.md). 
+ +## Steps: +1. [Setup a Kubeflow cluster](setup.md) +2. [Submit a distributed object detection training job](submit_job.md) +3. [Monitor your training job](monitor_job.md) +4. [Serve your model with TensorFlow serving](export_tf_graph.md) diff --git a/object_detection/conf/faster_rcnn_resnet101_pets.config b/object_detection/conf/faster_rcnn_resnet101_pets.config new file mode 100644 index 00000000..6235f5a1 --- /dev/null +++ b/object_detection/conf/faster_rcnn_resnet101_pets.config @@ -0,0 +1,144 @@ +# Faster R-CNN with Resnet-101 (v1) configured for the Oxford-IIIT Pet Dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + faster_rcnn { + num_classes: 37 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet101' + first_stage_features_stride: 16 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + first_stage_max_proposals: 300 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + use_dropout: false + dropout_keep_probability: 1.0 + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + 
mode: FAN_AVG + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + } +} + +train_config: { + batch_size: 1 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 0.0003 + schedule { + step: 0 + learning_rate: .0003 + } + schedule { + step: 900000 + learning_rate: .00003 + } + schedule { + step: 1200000 + learning_rate: .000003 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + fine_tune_checkpoint: "/pets_data/faster_rcnn_resnet101_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + # Note: The below line limits the training process to 200K steps, which we + # empirically found to be sufficient enough to train the pets dataset. This + # effectively bypasses the learning rate schedule (the learning rate will + # never decay). Remove the below line to train indefinitely. + num_steps: 200000 + data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "/pets_data/pet_train_with_masks.record" + } + label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt" +} + +eval_config: { + num_examples: 2000 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. 
+ max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "/pets_data/pet_val_with_masks.record" + } + label_map_path: "/models/research/object_detection/data/pet_label_map.pbtxt" + shuffle: false + num_readers: 1 +} \ No newline at end of file diff --git a/object_detection/docker/Dockerfile b/object_detection/docker/Dockerfile new file mode 100644 index 00000000..4f3264c8 --- /dev/null +++ b/object_detection/docker/Dockerfile @@ -0,0 +1,66 @@ +FROM ubuntu:16.04 + +LABEL maintainer="Soila Kavulya " + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + libfreetype6-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + protobuf-compiler \ + python \ + python-dev \ + python-pil \ + python-tk \ + python-lxml \ + rsync \ + git \ + software-properties-common \ + unzip \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN pip --no-cache-dir install \ + Cython \ + glob2 \ + h5py \ + ipykernel \ + jupyter \ + matplotlib \ + numpy \ + pandas \ + scipy \ + sklearn \ + six \ + tensorflow \ + tensorflow-serving-api \ + && \ + python -m ipykernel.kernelspec + +# Setup Universal Object Detection +ENV MODELS_HOME "/models" +RUN git clone https://github.com/tensorflow/models.git $MODELS_HOME && cd $MODELS_HOME && git checkout r1.5 +#COPY models $MODELS_HOME +RUN cd $MODELS_HOME/research \ + && protoc object_detection/protos/*.proto --python_out=. 
+ +ENV PYTHONPATH "$MODELS_HOME/research:$MODELS_HOME/research/slim:$PYTHONPATH" +COPY scripts /scripts +# TensorBoard +EXPOSE 6006 + +WORKDIR $MODELS_HOME + +# Run training job +ARG pipeline_config_path +ARG train_dir + +CMD ["python", "$MODELS_HOME/research/object_detection/train.py", "--logtostderr", "--pipeline_config_path=$pipeline_config_path", "--train_dir=$train_dir"] diff --git a/object_detection/export_tf_graph.md b/object_detection/export_tf_graph.md new file mode 100644 index 00000000..9bb62c18 --- /dev/null +++ b/object_detection/export_tf_graph.md @@ -0,0 +1,62 @@ + +## Export the TensorFlow Graph + +In the [jobs](./jobs) directory you will find a manifest file [export-tf-graph.yaml](./jobs/export-tf-graph.yaml). +Before executing the job we first need to identify a checkpoint candidate in the `pets-data-claim` pvc under +`/pets_data/train`. + +To see what's being saved in `/pets_data/train` while the training job is running you can use: +``` +kubectl -n kubeflow exec -it pets-training-master-r1hv-0-i6k7c sh +``` +This will open an interactive shell to your container and now you can execute `ls /pets_data/train` and look for a +checkpoint candidate. + +Once you have identified the checkpoint open the `export-tf-graph.yaml` file under the ./jobs directory +and edit the container command args: `--trained_checkpoint_prefix model.ckpt-<number>` +(line 20) to match the chosen checkpoint. + +Now you can execute the job with: +``` +kubectl -n kubeflow apply -f ./jobs/export-tf-graph.yaml +``` + +Once the job is completed a new directory called `exported_graphs` under `/pets_data` in the pets-data-claim PVC +will be created containing the model and the frozen graph. + +Before serving the model we need to perform a quick hack since the object detection export python api does not +generate a "version" folder for the saved model. This hack consists of creating a directory and moving some files to it. 
+One way of doing this is by accessing to an interactive shell in one of your running containers and moving the data yourself + +``` +kubectl -n kubeflow exec -it pets-training-master-r1hv-0-i6k7c sh +mkdir /pets_data/exported_graphs/saved_model/1 +cp /pets_data/exported_graphs/saved_model/* /pets_data/exported_graphs/saved_model/1 +``` + +## Serve the model using TF-Serving + +Apply the manifest file under [tf-serving](./tf-serving) directory: +``` +kubectl -n kubeflow apply -f ./tf-serving/tf-serving.yaml +``` + +After that you should see pets-model pod. Run: +``` +kubectl -n kubeflow get pods | grep pets-model +``` +That will output something like this: +``` +pets-model-v1-57674c8f76-4qrqp 1/1 Running 0 4h +``` +Take a look at the logs: +``` +kubectl -n kubeflow logs pets-model-v1-57674c8f76-4qrqp +``` +And you should see: +``` +2018-06-21 19:20:32.325406: I tensorflow_serving/core/loader_harness.cc:86] Successfully loaded servable version {name: pets-model version: 1} +E0621 19:20:34.134165172 7 ev_epoll1_linux.c:1051] grpc epoll fd: 3 +2018-06-21 19:20:34.135354: I tensorflow_serving/model_servers/main.cc:288] Running ModelServer at 0.0.0.0:9000 ... +``` +Now you can use a gRPC client to run inference using your trained model! 
diff --git a/object_detection/jobs/00create-pvc.yaml b/object_detection/jobs/00create-pvc.yaml new file mode 100644 index 00000000..6bda9bfb --- /dev/null +++ b/object_detection/jobs/00create-pvc.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pets-data-claim +spec: + accessModes: + - ReadWriteMany + volumeMode: Block + resources: + requests: + storage: 20Gi \ No newline at end of file diff --git a/object_detection/jobs/01get-dataset.yaml b/object_detection/jobs/01get-dataset.yaml new file mode 100644 index 00000000..35880956 --- /dev/null +++ b/object_detection/jobs/01get-dataset.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: get-pets-dataset +spec: + template: + spec: + containers: + - name: get-data + image: inutano/wget + imagePullPolicy: IfNotPresent + command: ["wget", "http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz", "-P", "/pets_data"] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/02get-annotations.yaml b/object_detection/jobs/02get-annotations.yaml new file mode 100644 index 00000000..42bfbadf --- /dev/null +++ b/object_detection/jobs/02get-annotations.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: get-pets-annotations +spec: + template: + spec: + containers: + - name: get-annotations + image: inutano/wget + imagePullPolicy: IfNotPresent + command: ["wget", "http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz", "-P", "/pets_data" ] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/03get-model-job.yaml 
b/object_detection/jobs/03get-model-job.yaml new file mode 100644 index 00000000..912b0140 --- /dev/null +++ b/object_detection/jobs/03get-model-job.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: get-fasterrcnn-model +spec: + template: + spec: + containers: + - name: get-model + image: inutano/wget + imagePullPolicy: IfNotPresent + command: ["wget", "http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_2018_01_28.tar.gz", "-P", "/pets_data"] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/04decompress-images.yaml b/object_detection/jobs/04decompress-images.yaml new file mode 100644 index 00000000..cee5bf73 --- /dev/null +++ b/object_detection/jobs/04decompress-images.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: decompress-images +spec: + template: + spec: + containers: + - name: decompress-images + image: ubuntu:16.04 + imagePullPolicy: IfNotPresent + command: ["tar", "-xzvf", "/pets_data/images.tar.gz", "-C", "/pets_data"] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/05decompress-annotations.yaml b/object_detection/jobs/05decompress-annotations.yaml new file mode 100644 index 00000000..525e44a5 --- /dev/null +++ b/object_detection/jobs/05decompress-annotations.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: decompress-annotations +spec: + template: + spec: + containers: + - name: decompress-annotations + image: ubuntu:16.04 + imagePullPolicy: IfNotPresent + command: ["tar", "-xzvf", "/pets_data/annotations.tar.gz", "-C", "/pets_data"] + 
+ volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/06decompress-model.yaml b/object_detection/jobs/06decompress-model.yaml new file mode 100644 index 00000000..0a5a5d27 --- /dev/null +++ b/object_detection/jobs/06decompress-model.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: decompress-fasterrcnn-model +spec: + template: + spec: + containers: + - name: decompress-model + image: ubuntu:16.04 + imagePullPolicy: IfNotPresent + command: ["tar", "-xzvf", "/pets_data/faster_rcnn_resnet101_coco_2018_01_28.tar.gz", "-C", "/pets_data"] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/07get-fasterrcnn-config.yaml b/object_detection/jobs/07get-fasterrcnn-config.yaml new file mode 100644 index 00000000..2ae4a970 --- /dev/null +++ b/object_detection/jobs/07get-fasterrcnn-config.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: get-fasterrcnn-config +spec: + template: + spec: + containers: + - name: get-model + image: inutano/wget + imagePullPolicy: IfNotPresent + command: ["wget", "--no-check-certificate", "https://raw.githubusercontent.com/ldcastell/examples/distributedTrainingExample/object_detection/conf/faster_rcnn_resnet101_pets.config", "-P", "/pets_data"] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + backoffLimit: 4 \ No newline at end of file diff --git 
a/object_detection/jobs/08create-pet-record.yaml b/object_detection/jobs/08create-pet-record.yaml new file mode 100644 index 00000000..e4a1f1b0 --- /dev/null +++ b/object_detection/jobs/08create-pet-record.yaml @@ -0,0 +1,24 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: create-tf-record +spec: + template: + spec: + containers: + - name: create-tf-record + image: lcastell/pets_object_detection + imagePullPolicy: IfNotPresent + workingDir: "/" + command: ["python", "models/research/object_detection/dataset_tools/create_pet_tf_record.py", "--label_map_path=models/research/object_detection/data/pet_label_map.pbtxt", "--data_dir=/pets_data", "--output_dir=/pets_data"] + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/export-tf-graph.yaml b/object_detection/jobs/export-tf-graph.yaml new file mode 100644 index 00000000..5ba20515 --- /dev/null +++ b/object_detection/jobs/export-tf-graph.yaml @@ -0,0 +1,30 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: export-tf-graph +spec: + template: + spec: + containers: + - name: export-tf-graph + image: lcastell/pets_object_detection + imagePullPolicy: IfNotPresent + workingDir: "/" + command: + - "python" + - "models/research/object_detection/export_inference_graph.py" + args: + - "--input_type=image_tensor" + - "--pipeline_config_path=/pets_data/faster_rcnn_resnet101_pets.config" + - "--trained_checkpoint_prefix=/pets_data/train/model.ckpt-" + - "--output_directory=/pets_data/exported_graphs" + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: Never + backoffLimit: 4 \ No newline at end of file diff --git a/object_detection/jobs/pets-training.yaml b/object_detection/jobs/pets-training.yaml new 
file mode 100644 index 00000000..71f0179b --- /dev/null +++ b/object_detection/jobs/pets-training.yaml @@ -0,0 +1,77 @@ +--- +apiVersion: kubeflow.org/v1alpha1 +kind: TFJob +metadata: + name: pets-training +spec: + replicaSpecs: + - replicas: 1 + template: + spec: + containers: + - image: lcastell/pets_object_detection + imagePullPolicy: IfNotPresent + name: tensorflow + workingDir: /models + command: + - "python" + - "research/object_detection/train.py" + args: + - "--logtostderr" + - "--pipeline_config_path=/pets_data/faster_rcnn_resnet101_pets.config" + - "--train_dir=/pets_data/train" + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: OnFailure + tfReplicaType: MASTER + - replicas: 1 + template: + spec: + containers: + - image: lcastell/pets_object_detection + imagePullPolicy: IfNotPresent + name: tensorflow + command: + - "python" + - "research/object_detection/train.py" + args: + - "--logtostderr" + - "--pipeline_config_path=/pets_data/faster_rcnn_resnet101_pets.config" + - "--train_dir=/pets_data/train" + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: OnFailure + tfReplicaType: WORKER + - replicas: 1 + template: + spec: + containers: + - image: lcastell/pets_object_detection + imagePullPolicy: IfNotPresent + name: tensorflow + command: + - "python" + - "research/object_detection/train.py" + args: + - "--logtostderr" + - "--pipeline_config_path=/pets_data/faster_rcnn_resnet101_pets.config" + - "--train_dir=/pets_data/train" + volumeMounts: + - mountPath: "/pets_data" + name: pets-data + volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim + restartPolicy: OnFailure + tfReplicaType: PS \ No newline at end of file diff --git a/object_detection/monitor_job.md b/object_detection/monitor_job.md new file mode 
100644 index 00000000..38dfe51b --- /dev/null +++ b/object_detection/monitor_job.md @@ -0,0 +1,51 @@ +## Monitor your job + +### View status +``` +kubectl -n kubeflow describe tfjobs pets-training +``` + +### View logs of individual pods +``` +kubectl -n kubeflow get pods +kubectl -n kubeflow logs <pod-name> +``` +**NOTE:** When the job finishes, the pods will be automatically terminated. To see them, run the `get pods` command with the `-a` flag: +``` +kubectl -n kubeflow get pods -a +``` + +While the job is running, you should see something like this in your master pod logs: +``` +INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt +INFO:tensorflow:Recording summary at step 819. +INFO:tensorflow:global step 819: loss = 0.8603 (19.898 sec/step) +INFO:tensorflow:global step 822: loss = 1.9421 (18.507 sec/step) +INFO:tensorflow:global step 825: loss = 0.7147 (17.088 sec/step) +INFO:tensorflow:global step 828: loss = 1.7722 (18.033 sec/step) +INFO:tensorflow:global step 831: loss = 1.3933 (17.739 sec/step) +INFO:tensorflow:global step 834: loss = 0.2307 (16.493 sec/step) +INFO:tensorflow:Recording summary at step 839 +``` + +When the job finishes, you should see something like this in your completed/terminated master pod logs: +``` +INFO:tensorflow:Starting Session. +INFO:tensorflow:Saving checkpoint to path /pets_data/train/model.ckpt +INFO:tensorflow:Starting Queues. +INFO:tensorflow:global_step/sec: 0 +INFO:tensorflow:Recording summary at step 200006. +INFO:tensorflow:global step 200006: loss = 0.0091 (9.854 sec/step) +INFO:tensorflow:Stopping Training. +INFO:tensorflow:Finished training! Saving model to disk. +``` + +Now you have a trained model! Find it at `/pets_data/train` inside the PVC `pets-data-claim`. 
+ +### Delete job +``` +kubectl -n kubeflow delete -f ./jobs/pets-training.yaml +``` + +## Next +[Export the TensorFlow Graph and Serve the model with TF Serving](./export_tf_graph.md) \ No newline at end of file diff --git a/object_detection/setup.md b/object_detection/setup.md new file mode 100644 index 00000000..d1d30499 --- /dev/null +++ b/object_detection/setup.md @@ -0,0 +1,59 @@ +## Setup Kubeflow +### Requirements + + - Kubernetes cluster + - Access to a working `kubectl` (Kubernetes CLI) + - Ksonnet CLI: [ks](https://ksonnet.io/) + +### Setup +Refer to the [user guide](https://www.kubeflow.org/docs/about/user_guide) for instructions on how to set up kubeflow on your kubernetes cluster. Specifically, look at the section on [deploying kubeflow](https://www.kubeflow.org/docs/about/user_guide#deploy-kubeflow). +For this example, we will be using ks `nocloud` environment. If you plan to use `cloud` ks environment, please make sure you follow the proper instructions in the kubeflow user guide. + +After completing the steps in the kubeflow user guide you will have the following: +- A ksonnet app directory called `my-kubeflow` +- A new namespace in your K8s cluster called `kubeflow` +- The following pods in your kubernetes cluster in the `kubeflow` namespace: +``` +kubectl -n kubeflow get pods +NAME READY STATUS RESTARTS AGE +ambassador-7987df44b9-4pht8 2/2 Running 0 1m +ambassador-7987df44b9-dh5h6 2/2 Running 0 1m +ambassador-7987df44b9-qrgsm 2/2 Running 0 1m +tf-hub-0 1/1 Running 0 1m +tf-job-operator-78757955b-qkg7s 1/1 Running 0 1m +``` +## Preparing the training data +We have prepared a set of K8s batch jobs to create a persistent volume and copy the data to it. +The `yaml` manifest files for these jobs can be found at [jobs](./jobs) directory. These `yaml` files are numbered and must be executed in order. 
+ +``` +# First create the PVC where the training data will be stored +kubectl -n kubeflow apply -f ./jobs/00create-pvc.yaml +``` +The 00create-pvc.yaml creates a PVC with `ReadWriteMany` access mode if your Kubernetes cluster +does not support this feature you can modify the manifest to create the PVC in `ReadWriteOnce` +and before you execute the tf-job to train the model add a `nodeSelector:` configuration to execute the pods +in the same node. You can find more about assigning pods to specific nodes [here](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) + +``` +# Get the dataset, annotations and faster-rcnn-model tars +kubectl -n kubeflow apply -f ./jobs/01get-dataset.yaml +kubectl -n kubeflow apply -f ./jobs/02get-annotations.yaml +kubectl -n kubeflow apply -f ./jobs/03get-model-job.yaml +``` + +``` +# Decompress tar files +kubectl -n kubeflow apply -f ./jobs/04decompress-images.yaml +kubectl -n kubeflow apply -f ./jobs/05decompress-annotations.yaml +kubectl -n kubeflow apply -f ./jobs/06decompress-model.yaml +``` + +``` +# Configuring the training pipeline +kubectl -n kubeflow apply -f ./jobs/07get-fasterrcnn-config.yaml +kubectl -n kubeflow apply -f ./jobs/08create-pet-record.yaml +``` + +## Next +[Submit the TF Job](submit_job.md) diff --git a/object_detection/submit_job.md b/object_detection/submit_job.md new file mode 100644 index 00000000..242f4989 --- /dev/null +++ b/object_detection/submit_job.md @@ -0,0 +1,73 @@ +# Launch a distributed object detection training job +## Requirements + + - Docker + - Docker Registry + - Object Detection Training Docker Image + +Build the TensorFlow object detection training image, or use the pre-built image `lcastell/pets_object_detection` in Docker hub. + +## To build the image: +First copy the Dockerfile file from `./docker` directory into your $HOME path +``` +# from your $HOME directory +docker build --pull -t $USER/pets_object_detection -f ./Dockerfile . 
+``` + +### Push the image to your docker registry +``` +# from your $HOME directory +docker tag $USER/pets_object_detection <your-registry>/pets_object_detection +docker push <your-registry>/pets_object_detection +``` + +## Create training TF-Job deployment and launching it +**NOTE:** You can skip this step and copy the [pets-training.yaml](./jobs/pets-training.yaml) from the `jobs` directory and modify it to your needs. +Or simply run: + +``` +kubectl -n kubeflow apply -f ./jobs/pets-training.yaml +``` + +### Follow these steps to generate the tf-job manifest file: + +Generate the ksonnet component using the tf-job prototype +``` +# from the my-kubeflow directory +ks generate tf-job pets-training --name=pets-training \ +--namespace=kubeflow \ +--image=<your-registry>/pets_object_detection \ +--num_masters=1 \ +--num_workers=1 \ +--num_ps=1 +``` +Dump the generated component into a K8s deployment manifest file. +``` +ks show nocloud -c pets-training > pets-training.yaml +``` +Add the volume mounts information at the end of the manifest file. 
We will be mounting `/pets_data` path to all the containers so they can pull the data for the training job +``` +vim pets-training.yaml +``` +Add the following to the template.spec: +``` +volumes: + - name: pets-data + persistentVolumeClaim: + claimName: pets-data-claim +``` +Add the following to the container properties: +``` +volumeMounts: +- mountPath: "/pets_data" + name: pets-data +``` +At the end you should have something similar to [this](./jobs/pets-training.yaml) + +Now you can submit the TF-Job to K8s: +``` +kubectl -n kubeflow apply -f pets-training.yaml +``` + +## Next +[Monitor your job](monitor_job.md) \ No newline at end of file diff --git a/object_detection/tf-serving/tf-serving.yaml b/object_detection/tf-serving/tf-serving.yaml new file mode 100644 index 00000000..10baacbf --- /dev/null +++ b/object_detection/tf-serving/tf-serving.yaml @@ -0,0 +1,77 @@ +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + getambassador.io/config: |- + --- + apiVersion: ambassador/v0 + kind: Mapping + name: tfserving-mapping-pets-model-get + prefix: /pets_data/exported_graphs/saved_model/ + rewrite: / + method: GET + service: pets-model.default:8000 + --- + apiVersion: ambassador/v0 + kind: Mapping + name: tfserving-mapping-pets-model-post + prefix: /pets_data/exported_graphs/saved_model/ + rewrite: /pets_data/exported_graphs/saved_model/:predict + method: POST + service: pets-model.default:8000 + labels: + app: pets-model + name: pets-model +spec: + ports: + - name: grpc-tf-serving + port: 9000 + targetPort: 9000 + - name: http-tf-serving-proxy + port: 8000 + targetPort: 8000 + selector: + app: pets-model + type: ClusterIP +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: pets-model + name: pets-model-v1 +spec: + template: + metadata: + labels: + app: pets-model + version: v1 + spec: + containers: + - args: + - /usr/bin/tensorflow_model_server + - --port=9000 + - --model_name=pets-model + - 
--model_base_path=/pets_data/exported_graphs/saved_model + image: gcr.io/kubeflow-images-public/tf-model-server-cpu:v20180327-995786ec + imagePullPolicy: IfNotPresent + name: pets-model + ports: + - containerPort: 9000 + resources: + limits: + cpu: "4" + memory: 4Gi + requests: + cpu: "1" + memory: 1Gi + securityContext: + runAsUser: 1000 + volumeMounts: + - mountPath: /pets_data + name: nfs + volumes: + - name: nfs + persistentVolumeClaim: + claimName: pets-data-claim \ No newline at end of file