added named entity recognition example (#590)
* added named entity recognition example https://github.com/kubeflow/website/issues/853
* added previous and next steps
* changed all absolute links to relative links
* changed headline for better understanding
* moved dataset description section to top
* fixed style
* added missing Jupyter notebook
* changed headline
* added link to documentation
* fixed meaning of images and components
* adapted documentation to https://www.kubeflow.org/docs/about/style-guide/#address-the-audience-directly
* added link to AI Platform models
* made it clear these are optional extensions
* changed summary and goals
* added Kubeflow version
* fixed s/an/a/ and checked the rest of the documentation
* added #!/bin/sh
* added environment variables for build scripts and adapted documentation
* changed PROJECT to PROJECT_ID
* added link to the Kaggle dataset and removed the copy script that is no longer required (the dataset is publicly available in gs://); adapted the Jupyter notebook input data path
* added a hint to make clear that no further steps are required
* fixed s/Run/RUN/
* grammar fixes
* optimized text
* added prev link to index
* removed model description due to lack of information
* added significance and congrats =)
* added example
* guided the user's attention to specific screens/metrics/graphs
* explanation of pieces
* updated main readme
* updated parts
* fixed typo
* adapted dataset path
* made scripts executable (chmod +x)
* Update step-1-setup.md: swapped sections and added environment variables to the gsutil command
* added information regarding public access
* fixed lint issues
* switched to 2 spaces rather than 4 (matching Kubeflow examples / TensorFlow standards)
* reverted changes
* removed unused import
* removed object inherit
* added kwargs to ignored-argument-name (best practice for Google custom prediction routines)
* set pylintrc back to default and removed unused argument
11
README.md
|
@ -11,6 +11,17 @@ This repository is home to the following types of examples and demos:
|
|||
|
||||
## End-to-end
|
||||
|
||||
### [Named Entity Recognition](./named_entity_recognition)
|
||||
Author: [Sascha Heyer](https://github.com/saschaheyer)
|
||||
|
||||
This example covers the following concepts:
|
||||
1. Build reusable pipeline components
|
||||
2. Run Kubeflow Pipelines with Jupyter notebooks
|
||||
1. Train a Named Entity Recognition model on a Kubernetes cluster
|
||||
1. Deploy a Keras model to AI Platform
|
||||
1. Use Kubeflow metrics
|
||||
1. Use Kubeflow visualizations
|
||||
|
||||
### [GitHub issue summarization](./github_issue_summarization)
|
||||
Author: [Hamel Husain](https://github.com/hamelsmu)
|
||||
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
# custom
|
||||
custom_prediction_routine.egg-info
|
||||
custom_prediction_routine*
|
|
@ -0,0 +1,33 @@
|
|||
# Named Entity Recognition with Kubeflow and Keras
|
||||
|
||||
In this walkthrough, you will learn how to use Kubeflow to build reusable components, train your model on a Kubernetes cluster, and deploy it to AI Platform.
|
||||
|
||||
## Goals
|
||||
|
||||
* Demonstrate how to build reusable pipeline components
|
||||
* Demonstrate how to use Keras-only models
|
||||
* Demonstrate how to train a Named Entity Recognition model on a Kubernetes cluster
|
||||
* Demonstrate how to deploy a Keras model to AI Platform
|
||||
* Demonstrate how to use a custom prediction routine
|
||||
* Demonstrate how to use Kubeflow metrics
|
||||
* Demonstrate how to use Kubeflow visualizations
|
||||
|
||||
## What is Named Entity Recognition
|
||||
Named Entity Recognition is a word classification problem that extracts pieces of data, called entities, from text.
|
||||
|
||||

|
||||
|
||||
### Steps
|
||||
|
||||
1. [Setup Kubeflow and clone repository](documentation/step-1-setup.md)
|
||||
1. [Build the pipeline components](documentation/step-2-build-components.md)
|
||||
1. [Upload the dataset](documentation/step-3-upload-dataset.md)
|
||||
1. [Custom prediction routine](documentation/step-4-custom-prediction-routine.md)
|
||||
1. [Run the pipeline](documentation/step-5-run-pipeline.md)
|
||||
1. [Monitor the training](documentation/step-6-monitor-training.md)
|
||||
1. [Predict](documentation/step-7-predictions.md)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/sh
|
||||
|
||||
echo "\nBuild and push preprocess component"
|
||||
./preprocess/build_image.sh
|
||||
|
||||
echo "\nBuild and push train component"
|
||||
./train/build_image.sh
|
||||
|
||||
echo "\nBuild and push deploy component"
|
||||
./deploy/build_image.sh
|
|
@ -0,0 +1,13 @@
|
|||
#!/bin/sh
|
||||
|
||||
BUCKET="your-bucket-name"
|
||||
|
||||
echo "\nCopy component specifications to Google Cloud Storage"
|
||||
gsutil cp preprocess/component.yaml gs://${BUCKET}/components/preprocess/component.yaml
|
||||
gsutil acl ch -u AllUsers:R gs://${BUCKET}/components/preprocess/component.yaml
|
||||
|
||||
gsutil cp train/component.yaml gs://${BUCKET}/components/train/component.yaml
|
||||
gsutil acl ch -u AllUsers:R gs://${BUCKET}/components/train/component.yaml
|
||||
|
||||
gsutil cp deploy/component.yaml gs://${BUCKET}/components/deploy/component.yaml
|
||||
gsutil acl ch -u AllUsers:R gs://${BUCKET}/components/deploy/component.yaml
|
|
@ -0,0 +1,4 @@
|
|||
FROM google/cloud-sdk:latest
|
||||
ADD ./src /pipelines/component/src
|
||||
RUN chmod 755 /pipelines/component/src/deploy.sh
|
||||
ENTRYPOINT ["/pipelines/component/src/deploy.sh"]
|
|
@ -0,0 +1,11 @@
|
|||
#!/bin/sh
|
||||
|
||||
image_name=gcr.io/$PROJECT_ID/kubeflow/ner/deploy
|
||||
image_tag=latest
|
||||
|
||||
full_image_name=${image_name}:${image_tag}
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
docker build -t "${full_image_name}" .
|
||||
docker push "$full_image_name"
|
|
@ -0,0 +1,44 @@
|
|||
name: deploy
|
||||
description: Deploy the model with a custom prediction routine
|
||||
inputs:
|
||||
- name: Model path
|
||||
type: GCSPath
|
||||
description: 'Path of GCS directory containing exported Tensorflow model.'
|
||||
- name: Model name
|
||||
type: String
|
||||
description: 'The name of the model; it is created if it does not already exist'
|
||||
- name: Model region
|
||||
type: String
|
||||
description: 'The region where the model is going to be deployed'
|
||||
- name: Model version
|
||||
type: String
|
||||
description: 'The version of the model'
|
||||
- name: Model runtime version
|
||||
type: String
|
||||
description: 'The runtime version of the model'
|
||||
- name: Model prediction class
|
||||
type: String
|
||||
description: 'The prediction class of the model'
|
||||
- name: Model python version
|
||||
type: String
|
||||
description: 'The python version of the model'
|
||||
- name: Model package uris
|
||||
type: String
|
||||
description: 'The package URIs of the model'
|
||||
outputs:
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/<PROJECT-ID>/kubeflow/ner/deploy:latest
|
||||
command: [
|
||||
sh, /pipelines/component/src/deploy.sh
|
||||
]
|
||||
args: [
|
||||
--model-path, {inputValue: Model path},
|
||||
--model-name, {inputValue: Model name},
|
||||
--model-region, {inputValue: Model region},
|
||||
--model-version, {inputValue: Model version},
|
||||
--model-runtime-version, {inputValue: Model runtime version},
|
||||
--model-prediction-class, {inputValue: Model prediction class},
|
||||
--model-python-version, {inputValue: Model python version},
|
||||
--model-package-uris, {inputValue: Model package uris},
|
||||
]
|
|
@ -0,0 +1,88 @@
|
|||
# loop through all parameters
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
"--model-path")
|
||||
shift
|
||||
MODEL_PATH="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-name")
|
||||
shift
|
||||
MODEL_NAME="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-region")
|
||||
shift
|
||||
MODEL_REGION="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-version")
|
||||
shift
|
||||
MODEL_VERSION="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-runtime-version")
|
||||
shift
|
||||
RUNTIME_VERSION="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-prediction-class")
|
||||
shift
|
||||
MODEL_PREDICTION_CLASS="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-python-version")
|
||||
shift
|
||||
MODEL_PYTHON_VERSION="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
"--model-package-uris")
|
||||
shift
|
||||
MODEL_PACKAGE_URIS="$1"
|
||||
echo
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
esac
|
||||
done
|
||||
|
||||
# echo inputs
|
||||
echo MODEL_PATH = "${MODEL_PATH}"
|
||||
|
||||
echo MODEL_NAME = "${MODEL_NAME}"
|
||||
echo MODEL_REGION = "${MODEL_REGION}"
|
||||
echo MODEL_VERSION = "${MODEL_VERSION}"
|
||||
echo RUNTIME_VERSION = "${RUNTIME_VERSION}"
|
||||
echo MODEL_PREDICTION_CLASS = "${MODEL_PREDICTION_CLASS}"
|
||||
echo MODEL_PYTHON_VERSION = "${MODEL_PYTHON_VERSION}"
|
||||
echo MODEL_PACKAGE_URIS = "${MODEL_PACKAGE_URIS}"
|
||||
|
||||
|
||||
# create model
|
||||
modelname=$(gcloud ai-platform models list | grep -w "$MODEL_NAME")
|
||||
echo "$modelname"
|
||||
if [ -z "$modelname" ]; then
|
||||
echo "Creating model $MODEL_NAME in region $REGION"
|
||||
|
||||
gcloud ai-platform models create ${MODEL_NAME} \
|
||||
--regions ${MODEL_REGION}
|
||||
else
|
||||
echo "Model $MODEL_NAME already exists"
|
||||
fi
|
||||
|
||||
# create version with custom prediction routine (beta)
|
||||
echo "Creating version $MODEL_VERSION from $MODEL_PATH"
|
||||
gcloud beta ai-platform versions create ${MODEL_VERSION} \
|
||||
--model ${MODEL_NAME} \
|
||||
--origin ${MODEL_PATH} \
|
||||
--python-version ${MODEL_PYTHON_VERSION} \
|
||||
--runtime-version ${RUNTIME_VERSION} \
|
||||
--package-uris ${MODEL_PACKAGE_URIS} \
|
||||
--prediction-class ${MODEL_PREDICTION_CLASS}
|
|
@ -0,0 +1,4 @@
|
|||
ARG BASE_IMAGE_TAG=1.12.0-py3
|
||||
FROM tensorflow/tensorflow:$BASE_IMAGE_TAG
|
||||
RUN python3 -m pip install keras
|
||||
COPY ./src /pipelines/component/src
|
|
@ -0,0 +1,12 @@
|
|||
#!/bin/sh
|
||||
|
||||
image_name=gcr.io/$PROJECT_ID/kubeflow/ner/preprocess
|
||||
image_tag=latest
|
||||
|
||||
full_image_name=${image_name}:${image_tag}
|
||||
base_image_tag=1.12.0-py3
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
docker build --build-arg BASE_IMAGE_TAG=${base_image_tag} -t "${full_image_name}" .
|
||||
docker push "$full_image_name"
|
|
@ -0,0 +1,34 @@
|
|||
name: preprocess
|
||||
description: Performs the IOB preprocessing.
|
||||
inputs:
|
||||
- {name: Input 1 URI, type: GCSPath}
|
||||
- {name: Output x URI template, type: GCSPath}
|
||||
- {name: Output y URI template, type: GCSPath}
|
||||
- {name: Output preprocessing state URI template, type: GCSPath}
|
||||
outputs:
|
||||
- name: Output x URI
|
||||
type: GCSPath
|
||||
- name: Output y URI
|
||||
type: String
|
||||
- name: Output tags
|
||||
type: String
|
||||
- name: Output words
|
||||
type: String
|
||||
- name: Output preprocessing state URI
|
||||
type: String
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/<PROJECT-ID>/kubeflow/ner/preprocess:latest
|
||||
command: [
|
||||
python3, /pipelines/component/src/component.py,
|
||||
--input1-path, {inputValue: Input 1 URI},
|
||||
--output-y-path, {inputValue: Output y URI template},
|
||||
--output-x-path, {inputValue: Output x URI template},
|
||||
--output-preprocessing-state-path, {inputValue: Output preprocessing state URI template},
|
||||
|
||||
--output-y-path-file, {outputPath: Output y URI},
|
||||
--output-x-path-file, {outputPath: Output x URI},
|
||||
--output-preprocessing-state-path-file, {outputPath: Output preprocessing state URI},
|
||||
--output-tags, {outputPath: Output tags},
|
||||
--output-words, {outputPath: Output words},
|
||||
]
|
|
@ -0,0 +1,132 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
||||
import pandas as pd
|
||||
from tensorflow import gfile
|
||||
from keras.utils import to_categorical
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
from text_preprocessor import TextPreprocessor
|
||||
|
||||
PREPROCESS_FILE = 'processor_state.pkl'
|
||||
|
||||
|
||||
def read_data(input1_path):
|
||||
with gfile.Open(input1_path, 'r') as input1_file:
|
||||
print('processing')
|
||||
print('input file', input1_file)
|
||||
csv_data = pd.read_csv(input1_file, error_bad_lines=False)
|
||||
return csv_data
|
||||
|
||||
|
||||
# Defining and parsing the command-line arguments
|
||||
parser = argparse.ArgumentParser(description='My program description')
|
||||
parser.add_argument('--input1-path', type=str,
|
||||
help='Path of the local file or GCS blob containing the Input 1 data.')
|
||||
|
||||
parser.add_argument('--output-tags', type=str, help='')
|
||||
parser.add_argument('--output-words', type=str, help='')
|
||||
|
||||
parser.add_argument('--output-x-path', type=str, help='')
|
||||
parser.add_argument('--output-x-path-file', type=str, help='')
|
||||
|
||||
parser.add_argument('--output-y-path', type=str, help='')
|
||||
parser.add_argument('--output-y-path-file', type=str, help='')
|
||||
|
||||
parser.add_argument('--output-preprocessing-state-path', type=str, help='')
|
||||
parser.add_argument(
|
||||
'--output-preprocessing-state-path-file', type=str, help='')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# read data
|
||||
data = read_data(args.input1_path)
|
||||
|
||||
# remove not required columns
|
||||
data = data.drop(['Unnamed: 0', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos',
|
||||
'next-next-shape', 'next-next-word', 'next-pos', 'next-shape',
|
||||
'next-word', 'prev-iob', 'prev-lemma', 'prev-pos',
|
||||
'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape',
|
||||
'prev-prev-word', 'prev-shape', 'prev-word', "pos", "shape"], axis=1)
|
||||
|
||||
print(data.head())
|
||||
|
||||
# build sentences
|
||||
|
||||
|
||||
def agg_func(s):
|
||||
return [(w, t) for w, t in zip(s["word"].values.tolist(),
|
||||
s["tag"].values.tolist())]
|
||||
|
||||
|
||||
grouped = data.groupby("sentence_idx").apply(agg_func)
|
||||
sentences = [s for s in grouped]
|
||||
sentences_list = [" ".join([s[0] for s in sent]) for sent in sentences]
|
||||
|
||||
# calculate maxlen
|
||||
maxlen = max([len(s) for s in sentences])
|
||||
print('Maximum sequence length:', maxlen)
|
||||
|
||||
# calculate words
|
||||
words = list(set(data["word"].values))
|
||||
n_words = len(words)
|
||||
print('Number of words:', n_words)
|
||||
|
||||
# calculate tags
|
||||
tags = list(set(data["tag"].values))
|
||||
n_tags = len(tags)
|
||||
print('Number of tags:', n_tags)
|
||||
print('Type of tags:', tags)
|
||||
|
||||
# create output folder for x and y
|
||||
gfile.MakeDirs(os.path.dirname(args.output_x_path))
|
||||
gfile.MakeDirs(os.path.dirname(args.output_y_path))
|
||||
|
||||
# preprocess text
|
||||
processor = TextPreprocessor(140)
|
||||
processor.fit(sentences_list)
|
||||
processor.labels = list(set(data["tag"].values))
|
||||
|
||||
X = processor.transform(sentences_list)
|
||||
|
||||
# preprocess tags
|
||||
tag2idx = {t: i for i, t in enumerate(tags)}
|
||||
y = [[tag2idx[w[1]] for w in s] for s in sentences]
|
||||
y = pad_sequences(maxlen=140, sequences=y, padding="post", value=tag2idx["O"])
|
||||
y = [to_categorical(i, num_classes=n_tags) for i in y]
|
||||
|
||||
# export features and labels for training
|
||||
with gfile.GFile(args.output_x_path, 'w') as output_X:
|
||||
pickle.dump(X, output_X)
|
||||
|
||||
with gfile.GFile(args.output_y_path, 'w') as output_y:
|
||||
pickle.dump(y, output_y)
|
||||
|
||||
# export preprocessing state, required for the custom prediction routine used
|
||||
# during inference
|
||||
preprocess_output = args.output_preprocessing_state_path + '/' + PREPROCESS_FILE
|
||||
with gfile.GFile(preprocess_output, 'w') as output_preprocessing_state:
|
||||
pickle.dump(processor, output_preprocessing_state)
|
||||
|
||||
# with open('./processor_state.pkl', 'wb') as f:
|
||||
# pickle.dump(processor, f)
|
||||
|
||||
# writing x and y path to a file for downstream tasks
|
||||
Path(args.output_x_path_file).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(args.output_x_path_file).write_text(args.output_x_path)
|
||||
|
||||
Path(args.output_y_path_file).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(args.output_y_path_file).write_text(args.output_y_path)
|
||||
|
||||
Path(args.output_preprocessing_state_path_file).parent.mkdir(
|
||||
parents=True, exist_ok=True)
|
||||
Path(args.output_preprocessing_state_path_file).write_text(
|
||||
args.output_preprocessing_state_path + '/' + PREPROCESS_FILE)
|
||||
|
||||
# TODO @Sascha use int rather than str
|
||||
Path(args.output_tags).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(args.output_tags).write_text(str(n_tags))
|
||||
|
||||
Path(args.output_words).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(args.output_words).write_text(str(n_words))
|
|
@ -0,0 +1,27 @@
|
|||
from keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing import text
|
||||
|
||||
|
||||
class TextPreprocessor():
|
||||
|
||||
def __init__(self, max_sequence_length):
|
||||
self._max_sequence_length = max_sequence_length
|
||||
self._labels = None
|
||||
self.number_words = None
|
||||
self._tokenizer = None
|
||||
|
||||
def fit(self, instances):
|
||||
tokenizer = text.Tokenizer(lower=False, filters=[], oov_token=None)
|
||||
tokenizer.fit_on_texts(instances)
|
||||
self._tokenizer = tokenizer
|
||||
self.number_words = len(tokenizer.word_index)
|
||||
print(self.number_words)
|
||||
|
||||
def transform(self, instances):
|
||||
sequences = self._tokenizer.texts_to_sequences(instances)
|
||||
padded_sequences = pad_sequences(
|
||||
maxlen=self._max_sequence_length,
|
||||
sequences=sequences,
|
||||
padding="post",
|
||||
value=self.number_words - 1)
|
||||
return padded_sequences
|
|
@ -0,0 +1,4 @@
|
|||
ARG BASE_IMAGE_TAG=1.12.0-py3
|
||||
FROM tensorflow/tensorflow:$BASE_IMAGE_TAG
|
||||
RUN python3 -m pip install keras
|
||||
COPY ./src /pipelines/component/src
|
|
@ -0,0 +1,13 @@
|
|||
||||
#!/bin/sh
|
||||
|
||||
image_name=gcr.io/$PROJECT_ID/kubeflow/ner/train
|
||||
image_tag=latest
|
||||
|
||||
full_image_name=${image_name}:${image_tag}
|
||||
base_image_tag=1.12.0-py3
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
docker build --build-arg BASE_IMAGE_TAG=${base_image_tag} -t "${full_image_name}" .
|
||||
docker push "$full_image_name"
|
|
@ -0,0 +1,29 @@
|
|||
name: train
|
||||
description: Trains the NER Bi-LSTM.
|
||||
inputs:
|
||||
- {name: Input x URI, type: GCSPath}
|
||||
- {name: Input y URI, type: GCSPath}
|
||||
- {name: Input job dir URI, type: GCSPath}
|
||||
- {name: Input tags, type: Integer}
|
||||
- {name: Input words, type: Integer}
|
||||
- {name: Input dropout }
|
||||
- {name: Output model URI template, type: GCSPath}
|
||||
outputs:
|
||||
- name: Output model URI
|
||||
type: GCSPath
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/<PROJECT-ID>/kubeflow/ner/train:latest
|
||||
command: [
|
||||
python3, /pipelines/component/src/train.py,
|
||||
--input-x-path, {inputValue: Input x URI},
|
||||
--input-job-dir, {inputValue: Input job dir URI},
|
||||
--input-y-path, {inputValue: Input y URI},
|
||||
--input-tags, {inputValue: Input tags},
|
||||
--input-words, {inputValue: Input words},
|
||||
--input-dropout, {inputValue: Input dropout},
|
||||
--output-model-path, {inputValue: Output model URI template},
|
||||
|
||||
--output-model-path-file, {outputPath: Output model URI},
|
||||
|
||||
]
|
|
@ -0,0 +1,120 @@
|
|||
import argparse
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
from tensorflow import gfile
|
||||
from tensorflow.python.lib.io import file_io
|
||||
from keras.models import Model, Input
|
||||
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
|
||||
from keras.callbacks import TensorBoard
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
MODEL_FILE = 'keras_saved_model.h5'
|
||||
|
||||
|
||||
def load_feature(input_x_path):
|
||||
with gfile.Open(input_x_path, 'rb') as input_x_file:
|
||||
return pickle.loads(input_x_file.read())
|
||||
|
||||
|
||||
def load_label(input_y_path):
|
||||
with gfile.Open(input_y_path, 'rb') as input_y_file:
|
||||
return pickle.loads(input_y_file.read())
|
||||
|
||||
|
||||
# Defining and parsing the command-line arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--input-x-path', type=str, help='')
|
||||
parser.add_argument('--input-y-path', type=str, help='')
|
||||
parser.add_argument('--input-job-dir', type=str, help='')
|
||||
|
||||
parser.add_argument('--input-tags', type=int, help='')
|
||||
parser.add_argument('--input-words', type=int, help='')
|
||||
parser.add_argument('--input-dropout', type=float, help='')
|
||||
|
||||
parser.add_argument('--output-model-path', type=str, help='')
|
||||
parser.add_argument('--output-model-path-file', type=str, help='')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print(os.path.dirname(args.output_model_path))
|
||||
|
||||
print(args.input_x_path)
|
||||
print(args.input_y_path)
|
||||
print(args.input_job_dir)
|
||||
print(args.input_tags)
|
||||
print(args.input_words)
|
||||
print(args.input_dropout)
|
||||
print(args.output_model_path)
|
||||
print(args.output_model_path_file)
|
||||
|
||||
X = load_feature(args.input_x_path)
|
||||
y = load_label(args.input_y_path)
|
||||
|
||||
|
||||
# split data
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
||||
|
||||
# initialize tensorboard
|
||||
tensorboard = TensorBoard(
|
||||
log_dir=os.path.join(args.input_job_dir, 'logs'),
|
||||
histogram_freq=0,
|
||||
write_graph=True,
|
||||
embeddings_freq=0)
|
||||
|
||||
callbacks = [tensorboard]
|
||||
|
||||
# model
|
||||
model_input = Input(shape=(140,))
|
||||
model = Embedding(input_dim=args.input_words,
|
||||
output_dim=140, input_length=140)(model_input)
|
||||
model = Dropout(args.input_dropout)(model)
|
||||
model = Bidirectional(
|
||||
LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
|
||||
out = TimeDistributed(Dense(args.input_tags, activation="softmax"))(
|
||||
model) # softmax output layer
|
||||
model = Model(model_input, out)
|
||||
model.compile(optimizer="adam", loss="categorical_crossentropy",
|
||||
metrics=["accuracy"])
|
||||
model.summary()
|
||||
|
||||
history = model.fit(X_train, np.array(y_train), batch_size=32,
|
||||
epochs=1, validation_split=0.1, verbose=1, callbacks=callbacks)
|
||||
|
||||
loss, accuracy = model.evaluate(X_test, np.array(y_test))
|
||||
|
||||
# save model
|
||||
print('saved model to ', args.output_model_path)
|
||||
model.save(MODEL_FILE)
|
||||
with file_io.FileIO(MODEL_FILE, mode='rb') as input_f:
|
||||
with file_io.FileIO(args.output_model_path + '/' + MODEL_FILE, mode='wb+') as output_f:
|
||||
output_f.write(input_f.read())
|
||||
|
||||
# write out metrics
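# Kubeflow Pipelines picks up /mlpipeline-metrics.json and displays the values in the run UI.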
|
||||
metrics = {
|
||||
'metrics': [{
|
||||
'name': 'accuracy-score',
|
||||
'numberValue': accuracy,
|
||||
'format': "PERCENTAGE",
|
||||
}]
|
||||
}
|
||||
|
||||
with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
|
||||
json.dump(metrics, f)
|
||||
|
||||
# write out TensorBoard viewer
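# The Kubeflow Pipelines UI reads /mlpipeline-ui-metadata.json and offers an
# 'Open TensorBoard' link for the referenced log directory.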
|
||||
metadata = {
|
||||
'outputs': [{
|
||||
'type': 'tensorboard',
|
||||
'source': args.input_job_dir,
|
||||
}]
|
||||
}
|
||||
|
||||
with open('/mlpipeline-ui-metadata.json', 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
|
||||
Path(args.output_model_path_file).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(args.output_model_path_file).write_text(args.output_model_path)
|
|
@ -0,0 +1,33 @@
|
|||
# Setup
|
||||
|
||||
## Deploying Kubeflow to Google Cloud Platform
|
||||
This example requires a running Kubeflow environment (v0.5.0). The easiest way to set up a Kubeflow environment is to use the [Deployment UI](https://www.kubeflow.org/docs/gke/deploy/deploy-ui/).
|
||||
|
||||
## Set environment variables
|
||||
|
||||
Create the following environment variables. Follow the [documentation](https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects) to find your project ID:
|
||||
|
||||
```bash
|
||||
export BUCKET=your-bucket-name
|
||||
export PROJECT_ID=your-gcp-project-id
|
||||
```
|
||||
|
||||
## Create bucket
|
||||
Create a bucket that will contain everything required for our Kubeflow pipeline.
|
||||
|
||||
```bash
|
||||
gsutil mb -c regional -l us-east1 gs://${BUCKET}
|
||||
```
|
||||
|
||||
## Clone this repository
|
||||
Clone the following repository, which contains everything needed for this example.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/kubeflow/examples.git
|
||||
```
|
||||
|
||||
Open a Terminal and navigate to the folder `/examples/named_entity_recognition/`.
|
||||
|
||||
*Next*: [Build the pipeline components](step-2-build-components.md)
|
||||
|
||||
*Previous*: [Index](../README.md)
|
|
@ -0,0 +1,77 @@
|
|||
# Build components
|
||||
|
||||
A component is code that performs one step in the Kubeflow pipeline. It is a containerized implementation of an ML task. **Components can be reused in other pipelines.**
|
||||
|
||||
## Component structure
|
||||
A component follows a specific [structure](https://www.kubeflow.org/docs/pipelines/sdk/component-development/) and contains:
|
||||
|
||||
* `/src` - Component logic.
|
||||
* `component.yaml` - Component specification.
|
||||
* `Dockerfile` - Dockerfile to build the container.
|
||||
* `readme.md` - Readme to explain the component and its inputs and outputs.
|
||||
* `build_image.sh` - Script to build the component image and push it to a Docker registry.
|
||||
|
||||
## Components
|
||||
This Kubeflow project contains 3 components:
|
||||
|
||||
### Preprocess component
|
||||
The preprocess component downloads the training data and performs several preprocessing steps. This preprocessing is required to produce data that the model can use.
|
||||
|
||||
|
||||
### Train component
|
||||
The train component uses the preprocessed training data. It contains the model itself and manages the training process.
|
||||
|
||||
### Deploy component
|
||||
The deploy component takes the trained model and deploys it to AI Platform.
|
||||
|
||||
## Build and push component images
|
||||
To use the components later on in your pipeline, you have to build them and then push the images to a Docker registry. In this example, you are using the
[Google Container Registry](https://cloud.google.com/container-registry/), but it is possible to use any other Docker registry.
|
||||
|
||||
Each component has a dedicated build script, `build_image.sh`, located in its component folder:
|
||||
|
||||
* `/components/preprocess/build_image.sh`
|
||||
* `/components/train/build_image.sh`
|
||||
* `/components/deploy/build_image.sh`
|
||||
|
||||
To build and push the Docker images, open a Terminal, navigate to `/components/`, and run the following command:
|
||||
|
||||
```bash
|
||||
$ ./build_components.sh
|
||||
```
|
||||
|
||||
## Check that the images are successfully pushed to the Google Container Registry
|
||||
|
||||
Navigate to the Google Cloud Container Registry and validate that you see the components.
|
||||
|
||||

|
||||
|
||||
## Upload the component specification
|
||||
The specification contains everything you need to use the component; therefore your pipeline needs access to these files later on.
It also contains the path to the Docker image: open `component.yaml` for each component and set **`<PROJECT-ID>`** to your Google Cloud Platform project ID.
|
||||
|
||||
Upload all three component specifications to your Google Cloud Storage bucket and make them publicly accessible by setting the permission to `allUsers`.
|
||||
|
||||
> It is also possible to upload these files to a storage solution of your choice. Note that this example currently only supports publicly readable objects in GCS.
|
||||
|
||||
Navigate to the components folder `/components/`, open `copy_specification.sh`, set your bucket name (`BUCKET="your-bucket"`), and run the following command:
|
||||
|
||||
```bash
|
||||
$ ./copy_specification.sh
|
||||
```
|
||||
|
||||
The bucket now contains three folders:
|
||||
|
||||

|
||||
|
||||
|
||||
## Troubleshooting
|
||||
Run `gcloud auth configure-docker` to configure Docker in case you get the following error message:
|
||||
|
||||
```
|
||||
You don't have the needed permissions to perform this operation, and you may have invalid credentials. To authenticate your request, follow the steps in: https://cloud.google.com/container-registry/docs/advanced-authentication
|
||||
```
|
||||
|
||||
*Next*: [Upload the dataset](step-3-upload-dataset.md)
|
||||
|
||||
*Previous*: [Setup Kubeflow and clone repository](step-1-setup.md)
|
|
@ -0,0 +1,43 @@
|
|||
# Dataset
|
||||
|
||||
## Dataset description
|
||||
|
||||
This example project uses the popular CoNLL 2002 dataset. The CSV consists of multiple rows, each containing a word with its corresponding tag. Multiple rows build up a single sentence.
|
||||
|
||||
The dataset contains the following tags:
|
||||
* geo = Geographical Entity
|
||||
* org = Organization
|
||||
* per = Person
|
||||
* gpe = Geopolitical Entity
|
||||
* tim = Time indicator
|
||||
* art = Artifact
|
||||
* eve = Event
|
||||
* nat = Natural Phenomenon
|
||||
|
||||
Each tag is defined in IOB format. IOB (short for inside, outside, beginning) is a common format for tagging tokens.
|
||||
|
||||
> B - indicates the beginning of an entity

> I - indicates the inside of an entity

> O - indicates that the token is outside of any entity (not annotated)
|
||||
|
||||
### Example
|
||||
|
||||
```bash
|
||||
"London on Monday evening"
|
||||
"London(B-geo) on(O) Monday(B-tim) evening(I-tim)"
|
||||
```
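For reference, this is how the preprocess component later groups the per-word rows into sentences. The snippet below is condensed from the preprocess component's `component.py`; `data` is assumed to be the CSV loaded into a pandas DataFrame:

```python
# `data` is the dataset CSV loaded with pandas; each row holds one word plus its tag,
# and `sentence_idx` groups the rows that belong to the same sentence.
def agg_func(s):
    return [(w, t) for w, t in zip(s["word"].values.tolist(),
                                   s["tag"].values.tolist())]

grouped = data.groupby("sentence_idx").apply(agg_func)
sentences = [s for s in grouped]                      # list of (word, tag) pairs per sentence
sentences_list = [" ".join([s[0] for s in sent]) for sent in sentences]  # plain-text sentences
```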
|
||||
|
||||
## Data Preparation
|
||||
You can download the dataset from the [Kaggle dataset](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus). For convenience, the dataset is also available at a public GCS location:
|
||||
|
||||
```
|
||||
gs://kubeflow-examples-data/named_entity_recognition_dataset/ner.csv
|
||||
```
|
||||
|
||||
> The training pipeline will use this data directly; no further data preparation steps are required.
|
||||
|
||||
*Next*: [Custom prediction routine](step-4-custom-prediction-routine.md)
|
||||
|
||||
*Previous*: [Build the pipeline components](step-2-build-components.md)
|
|
@ -0,0 +1,38 @@
|
|||
# Custom prediction routine
|
||||
|
||||
Custom prediction routines allow you to specify additional code that runs with every prediction request.
Without a custom prediction routine, the machine learning framework handles the prediction operation on its own.
|
||||
|
||||
## Why custom prediction routine
|
||||
Our model requires numeric inputs, which we convert from text before training (this is the preprocessing step). To perform the same conversion at prediction time, inject the preprocessing code by defining a custom prediction routine.
|
||||
|
||||
> Without a custom prediction routine, you would need to create a wrapper, e.g. with App Engine or Cloud Functions, which would add complexity and latency.
|
||||
|
||||
## How do custom prediction routines work?
|
||||
|
||||
Our custom prediction routine requires six parts (a condensed sketch of the prediction class follows the list):
|
||||
|
||||
* `keras_saved_model.h5` - The model stored as part of our training component (artifact).
|
||||
* `processor_state.pkl` - The preprocessing state stored as part of our training component (artifact).
|
||||
* `model_prediction.py` - The custom prediction routine logic.
|
||||
* `text_preprocessor.py` - The pre-processing logic.
|
||||
* `custom_prediction_routine.tar.gz` - A Python package `tar.gz` which contains our implementation.
|
||||
* `setup.py` - Used to create the Python package.
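The heart of the routine is the prediction class. The following is a condensed sketch of `model_prediction.py` (the full version ships with this example):

```python
import os
import pickle

import numpy as np
import tensorflow.keras as keras


class CustomModelPrediction():

    def __init__(self, model, processor):
        self._model = model          # trained Keras model
        self._processor = processor  # fitted TextPreprocessor

    def predict(self, instances):
        # Apply the same preprocessing as during training, then map the
        # predicted class indices back to their tag names.
        transformed = self._processor.transform(instances)
        predictions = self._model.predict(np.array(transformed))
        predictions = np.argmax(predictions, axis=-1).tolist()
        return [[self._processor.labels[i] for i in p] for p in predictions]

    @classmethod
    def from_path(cls, model_dir):
        # AI Platform calls from_path() with the directory that contains
        # keras_saved_model.h5 and processor_state.pkl.
        model = keras.models.load_model(
            os.path.join(model_dir, 'keras_saved_model.h5'))
        with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
            processor = pickle.load(f)
        return cls(model, processor)
```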
|
||||
|
||||
To build the custom prediction routine, run the build script located at `/routine/build_routine.sh`. This creates a `tar.gz`, which is required when you deploy your model.
|
||||
|
||||
Navigate to the routine folder `/routine/` and run the following build script:
|
||||
|
||||
```bash
|
||||
$ ./build_routine.sh
|
||||
```
|
||||
|
||||
## Upload custom prediction routine to Google Cloud Storage
|
||||
|
||||
```bash
|
||||
gsutil cp custom_prediction_routine-0.2.tar.gz gs://${BUCKET}/routine/custom_prediction_routine-0.2.tar.gz
|
||||
```
|
||||
|
||||
*Next*: [Run the pipeline](step-5-run-pipeline.md)
|
||||
|
||||
*Previous*: [Upload the dataset](step-3-upload-dataset.md)
|
|
@ -0,0 +1,136 @@
|
|||
# Run the pipeline
|
||||
If you are not familiar with pipelines, have a look at the following article: ["Kubeflow Components and Pipelines"](https://towardsdatascience.com/kubeflow-components-and-pipelines-33a1aa3cc338).
|
||||
|
||||
## Open the Kubeflow Notebook
|
||||
The pipeline can be created using our Jupyter notebook. For that, you have to create a Notebook in Kubeflow.
|
||||
|
||||
Open the Jupyter notebook interface and create a new Terminal by clicking on menu, New -> Terminal. In the Terminal, clone this git repo by executing:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/kubeflow/examples.git
|
||||
```
|
||||
|
||||
Now you have all the code required to run the pipeline. Navigate to the `examples/named_entity_recognition/notebooks` folder and open `Pipeline.ipynb`.
|
||||
|
||||
## Configure the pipeline
|
||||
|
||||
The pipeline needs several parameters in order to execute the components. After you have set all the parameters, run the notebook and click on the `Open experiment` link.
|
||||
|
||||
### Configure preprocess component
|
||||
|
||||
* `input_1_uri` - The input data csv
|
||||
* `output_y_uri_template` - Output storage location for our preprocessed labels.
|
||||
* `output_x_uri_template` - Output storage location for our preprocessed features.
|
||||
* `output_preprocessing_state_uri_template` - Output storage location for our preprocessing state.
|
||||
|
||||
### Configure train component
|
||||
|
||||
* `input_x_uri` - Output of the previous pipeline step, contains preprocessed features.
|
||||
* `input_y_uri` - Output of the previous pipeline step, contains preprocessed labels.
|
||||
* `input_job_dir_uri` - Output storage location for the training job files.
|
||||
* `input_tags` - Output of the previous pipeline step, contains the number of tags.
|
||||
* `input_words` - Output of the previous pipeline step, contains the number of words.
|
||||
* `output_model_uri_template` - Output storage location for our trained model.
|
||||
|
||||
|
||||
### Configure deploy component
|
||||
* `model_path` - The model path is the output of the previous pipeline step (the training).
|
||||
* `model_name` - The model name is later displayed in AI Platform.
|
||||
* `model_region` - The region where the model should be deployed.
|
||||
* `model_version` - The version of the trained model.
|
||||
* `model_runtime_version` - The runtime version; in this case you used TensorFlow 1.13.
|
||||
* `model_prediction_class` - The prediction class of our custom prediction routine.
|
||||
* `model_python_version` - The Python version used.
|
||||
* `model_package_uris` - The package which contains our custom prediction routine.
|
||||
|
||||
## What's happening in the notebook?
|
||||
### Load the component
|
||||
Components can be used in pipelines by loading them from a URL. Everyone with access to the Docker repository can use these components.
The component can be loaded via `components.load_component_from_url()`:
|
||||
|
||||
```python
|
||||
preprocess_operation = kfp.components.load_component_from_url(
|
||||
'https://storage.googleapis.com/{}/components/preprocess/component.yaml'.format(BUCKET))
|
||||
help(preprocess_operation)
|
||||
|
||||
train_operation = kfp.components.load_component_from_url(
|
||||
'https://storage.googleapis.com/{}/components/train/component.yaml'.format(BUCKET))
|
||||
help(train_operation)
|
||||
|
||||
ai_platform_deploy_operation = comp.load_component_from_url(
|
||||
"https://storage.googleapis.com/{}/components/deploy/component.yaml".format(BUCKET))
|
||||
help(ai_platform_deploy_operation)
|
||||
```
|
||||
|
||||
Example based on the training component:
|
||||
|
||||
1. `kfp.components.load_component_from_url` loads the pipeline component.
|
||||
2. You then have an operation that runs the container image and accepts arguments for the component inputs.
|
||||
|
||||

|
||||
|
||||
### Create the pipeline
|
||||
The pipeline is created by decorating a Python function. The `dsl.pipeline` decorator is provided by the pipeline SDK and marks the function that defines the pipeline.
|
||||
|
||||
```python
|
||||
@dsl.pipeline(
|
||||
name='Named Entity Recognition Pipeline',
|
||||
description='Performs preprocessing, training and deployment.'
|
||||
)
|
||||
def pipeline():
|
||||
...
|
||||
```
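Inside the pipeline function, the loaded operations are chained together by passing the outputs of one task as inputs to the next. A condensed version taken from the notebook:

```python
def pipeline():
    preprocess_task = preprocess_operation(
        input_1_uri='gs://kubeflow-examples-data/named_entity_recognition_dataset/ner.csv',
        output_y_uri_template="gs://{}/{{workflow.uid}}/preprocess/y/data".format(BUCKET),
        output_x_uri_template="gs://{}/{{workflow.uid}}/preprocess/x/data".format(BUCKET),
        output_preprocessing_state_uri_template="gs://{}/{{workflow.uid}}/model".format(BUCKET)
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    train_task = train_operation(
        input_x_uri=preprocess_task.outputs['output-x-uri'],
        input_y_uri=preprocess_task.outputs['output-y-uri'],
        input_job_dir_uri="gs://{}/{{workflow.uid}}/job".format(BUCKET),
        input_tags=preprocess_task.outputs['output-tags'],
        input_words=preprocess_task.outputs['output-words'],
        input_dropout=0.1,
        output_model_uri_template="gs://{}/{{workflow.uid}}/model".format(BUCKET)
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    deploy_task = ai_platform_deploy_operation(
        model_path=train_task.output,
        model_name="named_entity_recognition_kubeflow",
        model_region="us-central1",
        model_version="version1",
        model_runtime_version="1.13",
        model_prediction_class="model_prediction.CustomModelPrediction",
        model_python_version="3.5",
        model_package_uris="gs://{}/routine/custom_prediction_routine-0.2.tar.gz".format(BUCKET)
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))
```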
|
||||
|
||||
### Compile the pipeline
|
||||
To compile the pipeline you use the `compiler.Compiler().compile()` method, which is part of the pipeline SDK.
The compiler generates a YAML definition, which is used by Kubernetes to create the execution resources.
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'
|
||||
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
### Create an experiment
|
||||
Pipelines are always part of an experiment.
|
||||
They can be created with the Kubeflow pipeline client `kfp.Client()`.
|
||||
Experiments cannot be removed at the moment.
|
||||
|
||||
```python
|
||||
client = kfp.Client()
|
||||
|
||||
try:
|
||||
experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)
|
||||
except:
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
```
|
||||
|
||||
### Run the pipeline
|
||||
Use the experiment ID and the compiled pipeline to run a pipeline. `client.run_pipeline()` runs the pipeline and provides a direct link to the run in the Kubeflow UI.
|
||||
|
||||
```python
|
||||
arguments = {}
|
||||
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id,
|
||||
run_name,
|
||||
pipeline_filename,
|
||||
arguments)
|
||||
```
|
||||
|
||||
## Options to scale your training
|
||||
|
||||
> These are optional extensions and are outside the scope of this example.
|
||||
|
||||
By default, training jobs run within the CPU pool.
If the dataset size or model complexity increases, you have several options:
|
||||
|
||||
* Scale the training with AI Platform.
* Train in Kubeflow by enabling GPU or TPU on the ContainerOp (see the sketch below).
* Convert the Keras model to a TensorFlow estimator and take advantage of distributed training.
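For example, to request a GPU for the training step you could extend the train task inside the pipeline function. This is only a sketch: the methods come from the Kubeflow Pipelines SDK, and the accelerator label and value depend on your GKE node pool:

```python
    # inside pipeline(), after train_task has been created
    train_task.set_gpu_limit(1)
    train_task.add_node_selector_constraint(
        'cloud.google.com/gke-accelerator', 'nvidia-tesla-k80')
```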
|
||||
|
||||
*Next*: [Monitor the training](step-6-monitor-training.md)
|
||||
|
||||
*Previous*: [Custom prediction routine](step-4-custom-prediction-routine.md)
|
|
@ -0,0 +1,41 @@
|
|||
# Monitor your pipeline
|
||||
|
||||
## Pipeline steps
|
||||
Open Kubeflow, go to the `pipeline dashboard`, click `experiments`, and open the `run`. You can see the pipeline graph, which shows each step in the pipeline. As you can see, all of the steps completed successfully.
|
||||
|
||||

|
||||
|
||||
|
||||
## Open TensorBoard
|
||||
During the training of your model, you are interested in how the loss and accuracy change with each iteration. TensorBoard provides a visual representation of these iterations.
|
||||
|
||||
The logs of the training are uploaded to a Google Cloud Storage Bucket. TensorBoard automatically references this log location and displays the corresponding data.
|
||||
|
||||
The training component contains a TensorBoard visualization (TensorBoard viewer), which makes it easy to open a TensorBoard session for training jobs.
|
||||
|
||||
To open TensorBoard, click on the `training` component in your experiment run. On the right side is the artifacts window, which shows a very handy button called `Open TensorBoard`.
|
||||
|
||||
In order to use this visualization, your pipeline component must write a JSON file. Kubeflow provides good documentation on [how visualizations work](https://www.kubeflow.org/docs/pipelines/sdk/output-viewer/) and what types are available.
|
||||
|
||||
```
|
||||
# write out TensorBoard viewer
|
||||
metadata = {
|
||||
'outputs' : [{
|
||||
'type': 'tensorboard',
|
||||
'source': args.input_job_dir,
|
||||
}]
|
||||
}
|
||||
|
||||
with open('/mlpipeline-ui-metadata.json', 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
```
|
||||
|
||||
## Training metrics
|
||||
|
||||
Your training component creates a metric (`accuracy-score`), which is displayed in the experiment UI. With such metrics, you can compare your different runs and model performance.
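The metric is produced by the train component, which writes a `/mlpipeline-metrics.json` file (taken from the train component's `train.py`):

```python
# write out metrics picked up by the Kubeflow Pipelines UI
metrics = {
  'metrics': [{
    'name': 'accuracy-score',   # name displayed in the run UI
    'numberValue': accuracy,    # accuracy returned by model.evaluate()
    'format': "PERCENTAGE",
  }]
}

with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
  json.dump(metrics, f)
```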
|
||||
|
||||

|
||||
|
||||
*Next*: [Predict](step-7-predictions.md)
|
||||
|
||||
*Previous*: [Run the pipeline](step-5-run-pipeline.md)
|
|
@ -0,0 +1,38 @@
|
|||
# Prediction
|
||||
|
||||
Open AI Platform and navigate to your [model](https://console.cloud.google.com/ai-platform/models); there is one model listed:
|
||||
|
||||

|
||||
|
||||
Open the model, choose your version, then click on the `TEST & USE` tab and enter the following input data:
|
||||
|
||||
```
|
||||
{"instances": ["London on Monday evening"]}
|
||||
```
|
||||

|
||||
|
||||
After a couple of seconds, you get the prediction response: `London` is predicted as a geographical entity (B-geo), and `Monday evening` as a time, where Monday is the beginning (B-tim) and evening is inside (I-tim).
|
||||
|
||||
```json
|
||||
{
|
||||
"predictions": [
|
||||
[
|
||||
"B-geo",
|
||||
"O",
|
||||
"B-tim",
|
||||
"I-tim",
|
||||
]
|
||||
]
|
||||
}
|
||||
```
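You can also send the same request programmatically. A minimal sketch using the Google API Python client, assuming your project ID and the model name and version used in the deploy step:

```python
from googleapiclient import discovery

# 'ml' v1 is the AI Platform online prediction API.
service = discovery.build('ml', 'v1')
name = 'projects/{}/models/{}/versions/{}'.format(
    PROJECT_ID, 'named_entity_recognition_kubeflow', 'version1')

response = service.projects().predict(
    name=name,
    body={'instances': ['London on Monday evening']}
).execute()

print(response['predictions'])
```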
|
||||
|
||||
Congratulations, you trained and deployed a Named Entity Recognition model that can extract entities. There are many use cases where such models can be used.
|
||||
|
||||
Examples:
|
||||
|
||||
* Optimize search results by extracting specific entities out of search queries.
|
||||
* Classify large document archives by making entities filterable.
|
||||
* Enhance access for digital research of large document archives.
|
||||
* Route customer support messages by extracting the department or product.
|
||||
|
||||
*Previous*: [Monitor the training](step-6-monitor-training.md)
|
|
@ -0,0 +1,271 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Named Entity Recognition pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"EXPERIMENT_NAME = 'named-entity-recognition'\n",
|
||||
"BUCKET = \"your-bucket-name\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp\n",
|
||||
"from kfp import compiler\n",
|
||||
"import kfp.components as comp\n",
|
||||
"import kfp.dsl as dsl\n",
|
||||
"from kfp import gcp"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load components"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Help on function preprocess:\n",
|
||||
"\n",
|
||||
"preprocess(input_1_uri:'GCSPath', output_x_uri_template:'GCSPath', output_y_uri_template:'GCSPath', output_preprocessing_state_uri_template:'GCSPath')\n",
|
||||
" Performs the IOB preprocessing.\n",
|
||||
"\n",
|
||||
"Help on function train:\n",
|
||||
"\n",
|
||||
"train(input_x_uri:'GCSPath', input_y_uri:'GCSPath', input_job_dir_uri:'GCSPath', input_tags:'Integer', input_words:'Integer', input_dropout, output_model_uri_template:'GCSPath')\n",
|
||||
" Trains the NER Bi-LSTM.\n",
|
||||
"\n",
|
||||
"Help on function deploy:\n",
|
||||
"\n",
|
||||
"deploy(model_path:'GCSPath', model_name:'String', model_region:'String', model_version:'String', model_runtime_version:'String', model_prediction_class:'String', model_python_version:'String', model_package_uris:'String')\n",
|
||||
" Deploy the model with custom prediction route\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"preprocess_operation = kfp.components.load_component_from_url(\n",
|
||||
" 'https://storage.googleapis.com/{}/components/preprocess/component.yaml'.format(BUCKET))\n",
|
||||
"help(preprocess_operation)\n",
|
||||
"\n",
|
||||
"train_operation = kfp.components.load_component_from_url(\n",
|
||||
" 'https://storage.googleapis.com/{}/components/train/component.yaml'.format(BUCKET))\n",
|
||||
"help(train_operation)\n",
|
||||
"\n",
|
||||
"ai_platform_deploy_operation = comp.load_component_from_url(\n",
|
||||
" \"https://storage.googleapis.com/{}/components/deploy/component.yaml\".format(BUCKET))\n",
|
||||
"help(ai_platform_deploy_operation)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Build the Pipeline "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Named Entity Recognition Pipeline',\n",
|
||||
" description='Performs preprocessing, training and deployment.'\n",
|
||||
")\n",
|
||||
"def pipeline():\n",
|
||||
" \n",
|
||||
" preprocess_task = preprocess_operation(\n",
|
||||
" input_1_uri='gs://kubeflow-examples-data/named_entity_recognition_dataset/ner.csv,\n",
|
||||
" output_y_uri_template=\"gs://{}/{{workflow.uid}}/preprocess/y/data\".format(BUCKET),\n",
|
||||
" output_x_uri_template=\"gs://{}/{{workflow.uid}}/preprocess/x/data\".format(BUCKET),\n",
|
||||
" output_preprocessing_state_uri_template=\"gs://{}/{{workflow.uid}}/model\".format(BUCKET)\n",
|
||||
" ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa')) \n",
|
||||
" \n",
|
||||
" \n",
|
||||
" train_task = train_operation(\n",
|
||||
" input_x_uri=preprocess_task.outputs['output-x-uri'],\n",
|
||||
" input_y_uri=preprocess_task.outputs['output-y-uri'],\n",
|
||||
" input_job_dir_uri=\"gs://{}/{{workflow.uid}}/job\".format(BUCKET),\n",
|
||||
" input_tags=preprocess_task.outputs['output-tags'],\n",
|
||||
" input_words=preprocess_task.outputs['output-words'],\n",
|
||||
" input_dropout=0.1,\n",
|
||||
" output_model_uri_template=\"gs://{}/{{workflow.uid}}/model\".format(BUCKET)\n",
|
||||
" ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa')) \n",
|
||||
" \n",
|
||||
" \n",
|
||||
" deploy_task = ai_platform_deploy_operation(\n",
|
||||
" model_path= train_task.output,\n",
|
||||
" model_name=\"named_entity_recognition_kubeflow\",\n",
|
||||
" model_region=\"us-central1\",\n",
|
||||
" model_version=\"version1\",\n",
|
||||
" model_runtime_version=\"1.13\",\n",
|
||||
" model_prediction_class=\"model_prediction.CustomModelPrediction\",\n",
|
||||
" model_python_version=\"3.5\",\n",
|
||||
" model_package_uris=\"gs://{}/routine/custom_prediction_routine-0.2.tar.gz\".format(BUCKET)\n",
|
||||
" ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Compile the Pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'\n",
|
||||
"\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create a Kubeflow Experiment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'created_at': datetime.datetime(2019, 7, 5, 10, 32, 13, tzinfo=tzlocal()),\n",
|
||||
" 'description': None,\n",
|
||||
" 'id': '84e88563-7774-4bae-aa33-4a67649c136a',\n",
|
||||
" 'name': 'named-entity-recognition'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"client = kfp.Client()\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)\n",
|
||||
"except:\n",
|
||||
" experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
" \n",
|
||||
"print(experiment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run the Pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"Run link <a href=\"/pipeline/#/runs/details/705a2bc2-9f1c-11e9-9120-42010a800045\" target=\"_blank\" >here</a>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"84e88563-7774-4bae-aa33-4a67649c136a\n",
|
||||
"pipeline run\n",
|
||||
"pipeline.pipeline.zip\n",
|
||||
"{}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, \n",
|
||||
" run_name, \n",
|
||||
" pipeline_filename, \n",
|
||||
" arguments)\n",
|
||||
"\n",
|
||||
"print(experiment.id)\n",
|
||||
"print(run_name)\n",
|
||||
"print(pipeline_filename)\n",
|
||||
"print(arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
#!/bin/sh
|
||||
|
||||
python setup.py sdist --dist-dir=.
|
|
@ -0,0 +1,39 @@
|
|||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
|
||||
|
||||
class CustomModelPrediction():
|
||||
|
||||
def __init__(self, model, processor):
|
||||
self._model = model
|
||||
self._processor = processor
|
||||
|
||||
def postprocess(self, predictions):
|
||||
labeled_predictions = []
|
||||
|
||||
for prediction in predictions:
|
||||
labeled_prediction = []
|
||||
for word_prediction in prediction:
|
||||
labeled_prediction.append(self._processor.labels[word_prediction])
|
||||
labeled_predictions.append(labeled_prediction)
|
||||
|
||||
return labeled_predictions
|
||||
|
||||
def predict(self, instances):
|
||||
transformed_instances = self._processor.transform(instances)
|
||||
predictions = self._model.predict(np.array(transformed_instances))
|
||||
predictions = np.argmax(predictions, axis=-1).tolist()
|
||||
|
||||
labels = self.postprocess(predictions)
|
||||
return labels
|
||||
|
||||
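# AI Platform calls from_path() when it loads a model version; model_dir points
# to the directory containing keras_saved_model.h5 and processor_state.pkl.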
@classmethod
|
||||
def from_path(cls, model_dir):
|
||||
import tensorflow.keras as keras
|
||||
model = keras.models.load_model(
|
||||
os.path.join(model_dir, 'keras_saved_model.h5'))
|
||||
with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
|
||||
processor = pickle.load(f)
|
||||
|
||||
return cls(model, processor)
|
|
@ -0,0 +1,15 @@
|
|||
from setuptools import find_packages
|
||||
from setuptools import setup
|
||||
|
||||
REQUIRED_PACKAGES = [
|
||||
'Keras==2.2.4'
|
||||
]
|
||||
|
||||
setup(
|
||||
name="custom_prediction_routine",
|
||||
version="0.2",
|
||||
include_package_data=True,
|
||||
install_requires=REQUIRED_PACKAGES,
|
||||
packages=find_packages(),
|
||||
scripts=["model_prediction.py", "text_preprocessor.py"]
|
||||
)
|
|
@ -0,0 +1,27 @@
|
|||
from keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing import text
|
||||
|
||||
|
||||
class TextPreprocessor():
|
||||
|
||||
def __init__(self, max_sequence_length):
|
||||
self._max_sequence_length = max_sequence_length
|
||||
self._labels = None
|
||||
self.number_words = None
|
||||
self._tokenizer = None
|
||||
|
||||
def fit(self, instances):
|
||||
tokenizer = text.Tokenizer(lower=False, filters=[], oov_token=None)
|
||||
tokenizer.fit_on_texts(instances)
|
||||
self._tokenizer = tokenizer
|
||||
self.number_words = len(tokenizer.word_index)
|
||||
print(self.number_words)
|
||||
|
||||
def transform(self, instances):
|
||||
sequences = self._tokenizer.texts_to_sequences(instances)
|
||||
padded_sequences = pad_sequences(
|
||||
maxlen=self._max_sequence_length,
|
||||
sequences=sequences,
|
||||
padding="post",
|
||||
value=self.number_words - 1)
|
||||
return padded_sequences
|