mirror of https://github.com/kubeflow/examples.git
Isolate t2t execution into docker (#131)
* Isolate t2t execution into a docker * Add image build script, update run interface * Fix grammar typo
This commit is contained in:
parent
d3c781772c
commit
3bff3339f7
|
|
@ -8,6 +8,7 @@ Github Dataset hosted on BigQuery.
|
||||||
* Python 2.7 (with `pip`)
|
* Python 2.7 (with `pip`)
|
||||||
* Python 3.6+ (with `pip3`)
|
* Python 3.6+ (with `pip3`)
|
||||||
* Python `virtualenv`
|
* Python `virtualenv`
|
||||||
|
* Docker
|
||||||
|
|
||||||
**NOTE**: `Apache Beam` lacks `Python3` support and hence the multiple versions needed.
|
**NOTE**: `Apache Beam` lacks `Python3` support and hence the multiple versions needed.
|
||||||
|
|
||||||
|
|
@ -68,47 +69,54 @@ $ python preprocess/scripts/process_github_archive.py -i files/select_github_arc
|
||||||
--max-num-workers 16
|
--max-num-workers 16
|
||||||
```
|
```
|
||||||
|
|
||||||
## 2. Function Summarizer
|
## 2. Model Training
|
||||||
|
|
||||||
|
A `Dockerfile` based on Tensorflow is provided along which has all the dependencies for this part of the pipeline.
|
||||||
|
By default, it is based off Tensorflow CPU 1.8.0 for `Python3` but can be overridden in the Docker image build using
|
||||||
|
the following command
|
||||||
|
|
||||||
|
```
|
||||||
|
$ export BUILD_IMAGE_TAG=my-new-tag # (optional) to change built image tag
|
||||||
|
$ export BASE_IMAGE_TAG=1.8.0-gpu-py3 # (optional) for GPU base image
|
||||||
|
$ ./language_task/build_image.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.1 Function Summarizer
|
||||||
|
|
||||||
This part generates a model to summarize functions into docstrings using the data generated in previous
|
This part generates a model to summarize functions into docstrings using the data generated in previous
|
||||||
step. It uses `tensor2tensor`.
|
step. It uses `tensor2tensor`.
|
||||||
|
|
||||||
* Install dependencies
|
|
||||||
```
|
|
||||||
(venv3) $ pip install -r summarizer/requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
* Generate `TFRecords` for training
|
* Generate `TFRecords` for training
|
||||||
```
|
```
|
||||||
(venv3) $ t2t-datagen --t2t_usr_dir=language_task/t2t_problems --problem=github_function_summarizer \
|
$ export MOUNT_DATA_DIR=/path/to/data/folder # (optional) mount a local data directory
|
||||||
--data_dir=~/data --tmp_dir=/tmp
|
$ export DOCKER_ENTRYPOINT=t2t-datagen # (required)
|
||||||
|
$ ./language_task/run.sh --problem=github_function_summarizer
|
||||||
```
|
```
|
||||||
|
|
||||||
* Train transduction model using `Transformer Networks` and a base hyper-parameters set
|
* Train transduction model using `Transformer Networks` and a base hyper-parameters set
|
||||||
```
|
```
|
||||||
(venv3) $ t2t-trainer --t2t_usr_dir=language_task/t2t_problems --problem=github_function_summarizer \
|
$ export MOUNT_DATA_DIR=/path/to/data/folder # (optional) mount a local data directory
|
||||||
--data_dir=~/data --model=transformer --hparams_set=transformer_base --output_dir=~/train
|
$ export DOCKER_ENTRYPOINT=t2t-trainer # (required)
|
||||||
|
$ ./language_task/run.sh --problem=github_function_summarizer --model=transformer --hparams_set=transformer_base
|
||||||
```
|
```
|
||||||
|
|
||||||
## 3. Docstrings Language Model
|
### 2.2 Docstrings Language Model
|
||||||
|
|
||||||
This part trains a language model based on the docstrings in the dataset and uses `tensor2tensor`
|
This part trains a language model based on the docstrings in the dataset and uses `tensor2tensor`
|
||||||
|
|
||||||
* Install dependencies
|
|
||||||
```
|
|
||||||
(venv3) $ pip install -r summarizer/requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
* Generate `TFRecords` for training
|
* Generate `TFRecords` for training
|
||||||
```
|
```
|
||||||
(venv3) $ t2t-datagen --t2t_usr_dir=language_task/t2t_problems --problem=github_docstring_language_model \
|
$ export MOUNT_DATA_DIR=/path/to/data/folder # (optional) mount a local data directory
|
||||||
--data_dir=~/data --tmp_dir=/tmp
|
$ export DOCKER_ENTRYPOINT=t2t-datagen # (required)
|
||||||
|
$ ./language_task/run.sh --problem=github_docstring_language_model
|
||||||
```
|
```
|
||||||
|
|
||||||
* Train language model using `Transformer Networks` and a custom hyper-parameters set
|
* Train language model using `Transformer Networks` and a custom hyper-parameters set
|
||||||
```
|
```
|
||||||
(venv3) $ t2t-trainer --t2t_usr_dir=language_task/t2t_problems --problem=github_docstring_language_model \
|
$ export MOUNT_DATA_DIR=/path/to/data/folder # (optional) mount a local data directory
|
||||||
--data_dir=~/data --model=transformer --hparams_set=transformer_gh_lm --output_dir=~/train
|
$ export MOUNT_OUTPUT_DIR=/path/to/output/folder # (optional) mount a local output directory
|
||||||
|
$ export DOCKER_ENTRYPOINT=t2t-trainer # (required)
|
||||||
|
$ ./language_task/run.sh --problem=github_docstring_language_model --model=transformer --hparams_set=transformer_gh_lm
|
||||||
```
|
```
|
||||||
|
|
||||||
# Acknowledgements
|
# Acknowledgements
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
# NOTE: The context for this build must be the `language_task` directory

# Tensorflow base image tag. Override at build time for a GPU image, e.g.
#   docker build --build-arg BASE_IMAGE_TAG=1.8.0-gpu-py3 …
ARG BASE_IMAGE_TAG=1.8.0-py3

FROM tensorflow/tensorflow:$BASE_IMAGE_TAG

# COPY is preferred over ADD for plain local files (ADD's extra tar/URL
# handling is not needed here).
COPY requirements.txt /

RUN pip3 --no-cache-dir install -r /requirements.txt

# Dataset input and training output are mounted at runtime (see run.sh),
# so they never live in an image layer.
VOLUME ["/data", "/output"]

# t2t user problems consumed via --t2t_usr_dir=/t2t_problems.
COPY t2t_problems/* /t2t_problems/
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
#!/usr/bin/env bash

# Builds the semantic-code-search Docker image for the t2t pipeline.
#
# Environment variables:
#   BASE_IMAGE_TAG  - Tensorflow base image tag (default: 1.8.0-py3;
#                     use 1.8.0-gpu-py3 for a GPU-based image)
#   BUILD_IMAGE_TAG - tag for the built image (default: semantic-code-search:devel)

set -e

BASE_IMAGE_TAG=${BASE_IMAGE_TAG:-1.8.0-py3} # 1.8.0-gpu-py3 for GPU-based image
BUILD_IMAGE_TAG=${BUILD_IMAGE_TAG:-semantic-code-search:devel}

# Directory of this script used as docker context
_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

pushd "$_SCRIPT_DIR"

# Quote the expansions so tags containing unusual characters cannot
# word-split or glob (ShellCheck SC2086).
docker build -t "${BUILD_IMAGE_TAG}" --build-arg BASE_IMAGE_TAG="${BASE_IMAGE_TAG}" .

popd
|
||||||
|
|
@ -1,3 +1,2 @@
|
||||||
tensorflow~=1.8.0
|
|
||||||
tensor2tensor~=1.6.0
|
tensor2tensor~=1.6.0
|
||||||
oauth2client~=4.1.0
|
oauth2client~=4.1.0
|
||||||
|
|
@ -0,0 +1,36 @@
|
||||||
|
#!/usr/bin/env bash

# Runs a t2t entrypoint (t2t-datagen / t2t-trainer) inside the
# semantic-code-search Docker image, forwarding any extra CLI flags.
#
# Environment variables:
#   IMAGE_TAG         - image to run (default: semantic-code-search:devel)
#   DOCKER_ENTRYPOINT - required; 't2t-datagen' or 't2t-trainer'
#   MOUNT_DATA_DIR    - optional host directory mounted at ${DATA_DIR}
#   MOUNT_OUTPUT_DIR  - optional host directory mounted at ${OUTPUT_DIR}
#   DATA_DIR          - in-container data directory (default: /data)
#   OUTPUT_DIR        - in-container output directory (default: /output)

set -e

# Script Variables
IMAGE_TAG=${IMAGE_TAG:-semantic-code-search:devel}
DOCKER_ENTRYPOINT=${DOCKER_ENTRYPOINT:-}

MOUNT_DATA_DIR=${MOUNT_DATA_DIR:-}
MOUNT_OUTPUT_DIR=${MOUNT_OUTPUT_DIR:-}

DATA_DIR=${DATA_DIR:-/data}
OUTPUT_DIR=${OUTPUT_DIR:-/output}

# Validate required input before assembling any docker arguments;
# diagnostics go to stderr.
if [[ -z "${DOCKER_ENTRYPOINT}" ]]; then
  echo "ERROR: Missing DOCKER_ENTRYPOINT environment variable! Use 't2t-datagen' or 't2t-trainer'" >&2
  exit 1
fi

# Build the docker invocation as arrays so paths and flags containing
# spaces survive intact. The previous string-concatenation + eval approach
# word-split every variable and allowed shell injection through them.
_docker_run_opts=(-it --rm "--entrypoint=${DOCKER_ENTRYPOINT}")

# Mount local directories (if specified)
if [[ -n "${MOUNT_DATA_DIR}" ]]; then
  _docker_run_opts+=(-v "${MOUNT_DATA_DIR}:${DATA_DIR}:rw")
fi

if [[ -n "${MOUNT_OUTPUT_DIR}" ]]; then
  _docker_run_opts+=(-v "${MOUNT_OUTPUT_DIR}:${OUTPUT_DIR}:rw")
fi

# Caller-supplied flags first, then the fixed t2t wiring.
_docker_cmd=("$@" --t2t_usr_dir=/t2t_problems --tmp_dir=/tmp
  "--data_dir=${DATA_DIR}" "--output_dir=${OUTPUT_DIR}")

# Log the exact (shell-quoted) command, then run it directly — no eval.
printf '%q ' docker run "${_docker_run_opts[@]}" "${IMAGE_TAG}" "${_docker_cmd[@]}"
printf '\n'
docker run "${_docker_run_opts[@]}" "${IMAGE_TAG}" "${_docker_cmd[@]}"
|
||||||
Loading…
Reference in New Issue