pipelines/components/dataflow/tft/transform.py

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import apache_beam as beam
import argparse
import datetime
import csv
import json
import logging
import os
import tensorflow as tf
import tensorflow_transform as tft
from apache_beam.io import textio
from apache_beam.io import tfrecordio
from apache_beam.options.pipeline_options import PipelineOptions
from tensorflow.contrib.slim.python.slim.nets.inception_v3 import inception_v3
from tensorflow.contrib.slim.python.slim.nets.inception_v3 import inception_v3_arg_scope
from tensorflow.python.lib.io import file_io
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.coders.csv_coder import CsvCoder
from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io

# Inception Checkpoint
INCEPTION_V3_CHECKPOINT = 'gs://cloud-ml-data/img/flower_photos/inception_v3_2016_08_28.ckpt'
INCEPTION_EXCLUDED_VARIABLES = ['InceptionV3/AuxLogits', 'InceptionV3/Logits', 'global_step']

DELIMITERS = '.,!?() '
VOCAB_SIZE = 100000
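
# The --schema file is expected to contain a JSON list of column descriptions
# with 'name' and 'type' fields, where type is one of NUMBER, CATEGORY, TEXT,
# IMAGE_URL or KEY (see make_preprocessing_fn below). An illustrative example;
# the column names are placeholders:
#
#   [
#     {"name": "key", "type": "KEY"},
#     {"name": "age", "type": "NUMBER"},
#     {"name": "occupation", "type": "CATEGORY"},
#     {"name": "review", "type": "TEXT"},
#     {"name": "image_path", "type": "IMAGE_URL"}
#   ]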


def parse_arguments():
  """Parse command line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--output',
                      type=str,
                      required=True,
                      help='GCS or local directory.')
  parser.add_argument('--train',
                      type=str,
                      required=True,
                      help='GCS path of train file patterns.')
  parser.add_argument('--eval',
                      type=str,
                      required=True,
                      help='GCS path of eval file patterns.')
  parser.add_argument('--schema',
                      type=str,
                      required=True,
                      help='GCS json schema file path.')
  parser.add_argument('--project',
                      type=str,
                      required=True,
                      help='The GCP project to run the dataflow job.')
  parser.add_argument('--mode',
                      choices=['local', 'cloud'],
                      help='whether to run the job locally or in Cloud Dataflow.')
  parser.add_argument('--preprocessing-module',
                      type=str,
                      required=False,
                      help=('GCS path to a python file defining '
                            '"preprocess" and "get_feature_columns" functions.'))
  args = parser.parse_args()
  return args
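
# Example invocation (illustrative only; the bucket and file names are
# placeholders):
#
#   python transform.py \
#     --output gs://my-bucket/tft-output \
#     --train gs://my-bucket/data/train.csv \
#     --eval gs://my-bucket/data/eval.csv \
#     --schema gs://my-bucket/data/schema.json \
#     --project my-gcp-project \
#     --mode cloud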


def _image_to_vec(image_str_tensor):

  def _decode_and_resize(image_str_tensor):
    """Decodes jpeg string, resizes it and returns a uint8 tensor."""
    # These constants are set by Inception v3's expectations.
    height = 299
    width = 299
    channels = 3
    image = tf.read_file(image_str_tensor)
    image = tf.image.decode_jpeg(image, channels=channels)
    image = tf.expand_dims(image, 0)
    image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
    image = tf.squeeze(image, squeeze_dims=[0])
    image = tf.cast(image, dtype=tf.uint8)
    return image

  image = tf.map_fn(_decode_and_resize, image_str_tensor, back_prop=False, dtype=tf.uint8)
  image = tf.image.convert_image_dtype(image, dtype=tf.float32)
  image = tf.subtract(image, 0.5)
  inception_input = tf.multiply(image, 2.0)

  # Build Inception layers, which expect a tensor of type float from [-1, 1)
  # and shape [batch_size, height, width, channels].
  with tf.contrib.slim.arg_scope(inception_v3_arg_scope()):
    _, end_points = inception_v3(inception_input, is_training=False)
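
  # 'PreLogits' is the pooled activation just before Inception v3's classifier
  # head, with shape [batch, 1, 1, 2048]; squeezing the two spatial dims below
  # yields a [batch, 2048] embedding per image.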
  embeddings = end_points['PreLogits']
  inception_embeddings = tf.squeeze(embeddings, [1, 2], name='SpatialSqueeze')
  return inception_embeddings


def make_preprocessing_fn(schema):
  """Makes a preprocessing function.

  Args:
    schema: the schema of the training data.

  Returns:
    a preprocessing_fn function used by tft.
  """

  def preprocessing_fn(inputs):
    """TFT preprocessing function.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
        transformed columns.
    """
    features_dict = {}
    for col_schema in schema:
      col_name = col_schema['name']
      if col_schema['type'] == 'NUMBER':
        features_dict[col_name] = inputs[col_name]
      elif col_schema['type'] == 'CATEGORY':
        features_dict[col_name] = tft.string_to_int(inputs[col_name],
                                                    vocab_filename='vocab_' + col_name)
      elif col_schema['type'] == 'TEXT':
        tokens = tf.string_split(inputs[col_name], DELIMITERS)
        # TODO: default_value=0 is wrong here: OOV tokens get index 0. It is a
        # workaround so that the trainer can use the true vocab size; otherwise
        # the trainer would have to use the VOCAB_SIZE defined in this file,
        # which is too large. If TFT offers no better workaround, the user will
        # have to provide a vocab_size.
        indices = tft.string_to_int(tokens,
                                    vocab_filename='vocab_' + col_name,
                                    default_value=0)
        # Add one for the oov bucket created by string_to_int.
        bow_indices, bow_weights = tft.tfidf(indices, VOCAB_SIZE + 1)
        features_dict[col_name + '_indices'] = bow_indices
        features_dict[col_name + '_weights'] = bow_weights
      elif col_schema['type'] == 'IMAGE_URL':
        features_dict[col_name] = tft.apply_function_with_checkpoint(
            _image_to_vec,
            [inputs[col_name]],
            INCEPTION_V3_CHECKPOINT,
            exclude=INCEPTION_EXCLUDED_VARIABLES)
      elif col_schema['type'] == 'KEY':
        features_dict[col_name] = inputs[col_name]
      else:
        raise ValueError('Invalid schema. Unknown type ' + col_schema['type'])
    return features_dict

  return preprocessing_fn
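
# A custom preprocessing module passed via --preprocessing-module replaces the
# schema-derived function above. It must define "preprocess" (and
# "get_feature_columns" for the downstream trainer). A minimal sketch, assuming
# hypothetical columns 'age' and 'occupation':
#
#   import tensorflow_transform as tft
#
#   def preprocess(inputs):
#     return {
#         'age': tft.scale_to_0_1(inputs['age']),
#         'occupation': tft.string_to_int(inputs['occupation'],
#                                         vocab_filename='vocab_occupation'),
#     }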


def make_tft_input_metadata(schema):
  """Makes a TFT input metadata object.

  In the tft framework, this is where default values are recorded for training.

  Args:
    schema: schema list of training data.

  Returns:
    TFT metadata object.
  """
  tft_schema = {}
  for col_schema in schema:
    col_type = col_schema['type']
    col_name = col_schema['name']
    if col_type == 'NUMBER':
      tft_schema[col_name] = dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation(default_value=0.0))
    elif col_type in ['CATEGORY', 'TEXT', 'IMAGE_URL', 'KEY']:
      tft_schema[col_name] = dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation(default_value=''))
  return dataset_metadata.DatasetMetadata(dataset_schema.Schema(tft_schema))
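
# Note: NUMBER columns become scalar tf.float32 features defaulting to 0.0, and
# CATEGORY/TEXT/IMAGE_URL/KEY columns become scalar tf.string features
# defaulting to '', so missing values can fall back to those defaults when
# decoding.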


def run_transform(output_dir, schema, train_data_file, eval_data_file,
                  project, mode, preprocessing_fn=None):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder.
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: 'local' to run with the DirectRunner, or 'cloud' to run on Cloud Dataflow.
    preprocessing_fn: a function used to preprocess the raw data. If not
      specified, a function will be automatically inferred from the schema.
  """
  tft_input_metadata = make_tft_input_metadata(schema)
  temp_dir = os.path.join(output_dir, 'tmp')
  preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'pipeline-tft-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
        'temp_location': temp_dir,
        'project': project,
        'extra_packages': ['gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz']
    }
    pipeline_options = PipelineOptions(flags=[], **options)
    runner = 'DataflowRunner'
  else:
    raise ValueError('Invalid mode %s.' % mode)

  with beam.Pipeline(runner, options=pipeline_options) as p:
    with beam_impl.Context(temp_dir=temp_dir):
      names = [x['name'] for x in schema]
      converter = CsvCoder(names, tft_input_metadata.schema)

      train_data = (
          p
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'DecodeTrainData' >> beam.Map(converter.decode))
      train_dataset = (train_data, tft_input_metadata)
      transformed_dataset, transform_fn = (
          train_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      # Write the transformed_metadata and transform_fn folders.
      _ = (transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(output_dir))

      # Write the raw_metadata.
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(output_dir, 'metadata'))

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'train'),
          coder=ExampleProtoCoder(transformed_metadata.schema))

      eval_data = (
          p
          | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
          | 'DecodeEvalData' >> beam.Map(converter.decode))
      eval_dataset = (eval_data, tft_input_metadata)
      transformed_eval_dataset = (
          (eval_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_eval_data, transformed_metadata = transformed_eval_dataset

      _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
          os.path.join(output_dir, 'eval'),
          coder=ExampleProtoCoder(transformed_metadata.schema))
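
# run_transform leaves roughly this layout under --output:
#   transform_fn/ and transformed_metadata/   (written by WriteTransformFn)
#   metadata/                                 (the raw input metadata)
#   train-*-of-* and eval-*-of-*              (transformed TFRecord shards)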


def main():
  logging.getLogger().setLevel(logging.INFO)
  args = parse_arguments()
  schema = json.loads(file_io.read_file_to_string(args.schema))

  preprocessing_fn = None
  if args.preprocessing_module:
    module_dir = os.path.abspath(os.path.dirname(__file__))
    preprocessing_module_path = os.path.join(module_dir, 'preprocessing.py')
    with open(preprocessing_module_path, 'w+') as preprocessing_file:
      preprocessing_file.write(
          file_io.read_file_to_string(args.preprocessing_module))
    import preprocessing

    def wrapped_preprocessing_fn(inputs):
      outputs = preprocessing.preprocess(inputs)
      for key in outputs:
        if outputs[key].dtype == tf.bool:
          outputs[key] = tft.string_to_int(tf.as_string(outputs[key]),
                                           vocab_filename='vocab_' + key)
      return outputs

    preprocessing_fn = wrapped_preprocessing_fn

  run_transform(args.output, schema, args.train, args.eval,
                args.project, args.mode, preprocessing_fn=preprocessing_fn)

  with open('/output.txt', 'w') as f:
    f.write(args.output)


if __name__ == '__main__':
  main()