# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# TODO: Add Unit or Integration Test
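
# Note: this module relies on the TF 1.x-era stack (tensorflow.contrib.predictor,
# tensorflow_transform) and Python 2 standard-library modules (StringIO).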

import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
import datetime
import json
import logging
import os
from tensorflow.python.lib.io import file_io


def parse_arguments():
  """Parse command line arguments."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--output',
                      type=str,
                      required=True,
                      help='GCS or local directory.')
  parser.add_argument('--data',
                      type=str,
                      required=True,
                      help='GCS or local path of test file patterns.')
  parser.add_argument('--schema',
                      type=str,
                      required=True,
                      help='GCS or local json schema file path.')
  parser.add_argument('--model',
                      type=str,
                      required=True,
                      help='GCS or local path of model trained with tft preprocessed data.')
  parser.add_argument('--target',
                      type=str,
                      required=True,
                      help='Name of the column for prediction target.')
  parser.add_argument('--project',
                      type=str,
                      required=True,
                      help='The GCP project to run the dataflow job.')
  parser.add_argument('--mode',
                      choices=['local', 'cloud'],
                      help='whether to run the job locally or in Cloud Dataflow.')
  parser.add_argument('--batchsize',
                      type=int,
                      default=32,
                      help='Batch size used in prediction.')

  args = parser.parse_args()
  return args


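# Rows are grouped into batches before prediction so that the TensorFlow
# predictor is invoked once per batch of rows rather than once per row.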
class EmitAsBatchDoFn(beam.DoFn):
  """A DoFn that buffers the records and emits them batch by batch."""

  def __init__(self, batch_size):
    self._batch_size = batch_size
    self._cached = []

  def process(self, element):
    self._cached.append(element)
    if len(self._cached) >= self._batch_size:
      emit = self._cached
      self._cached = []
      yield emit

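  # finish_bundle flushes whatever is left in the buffer at the end of a
  # bundle. Because there is no current element at this point, the leftover
  # batch is wrapped in a WindowedValue assigned to the global window.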
  def finish_bundle(self, context=None):
    from apache_beam.transforms import window
    from apache_beam.utils.windowed_value import WindowedValue
    if len(self._cached) > 0:
      yield WindowedValue(self._cached, -1, [window.GlobalWindow()])


class TargetToLastDoFn(beam.DoFn):
  """A DoFn that moves the specified target column to the last position."""

  def __init__(self, names, target_name):
    self._names = names
    self._target_name = target_name
    self._names_no_target = list(names)
    self._names_no_target.remove(target_name)

  def process(self, element):
    import csv
    content = next(csv.DictReader([element], fieldnames=self._names))
    target = content.pop(self._target_name)
    yield [content[x] for x in self._names_no_target] + [target]


class PredictDoFn(beam.DoFn):
  """A DoFn that performs predictions with a given trained model."""

  def __init__(self, model_export_dir):
    self._model_export_dir = model_export_dir

  def start_bundle(self):
    from tensorflow.contrib import predictor

    # We need to import the tensorflow_transform library in order to
    # register all of the ops that might be used by a saved model that
    # incorporates TFT transformations.
    import tensorflow_transform

    self._predict_fn = predictor.from_saved_model(self._model_export_dir)

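  # Each batch element is a list of field values with the target in the last
  # position. The target is dropped and the remaining values are re-serialized
  # into a single CSV line, because the serving signature used here expects
  # CSV-encoded example strings under the "inputs" key.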
  def process(self, element):
    import csv
    import StringIO

    prediction_inputs = []
    for instance in element:
      instance_copy = list(instance)
      instance_copy.pop()  # remove target
      buf = StringIO.StringIO()
      writer = csv.writer(buf, lineterminator='')
      writer.writerow(instance_copy)
      prediction_inputs.append(buf.getvalue())

    return_dict = self._predict_fn({"inputs": prediction_inputs})
    return_dict['source'] = element
    yield return_dict


class ListToCsvDoFn(beam.DoFn):
  """A DoFn that converts a list of values into a single CSV line."""

  def process(self, element):
    import csv
    import StringIO
    buf = StringIO.StringIO()
    writer = csv.writer(buf, lineterminator='')
    writer.writerow(element)
    yield buf.getvalue()


def run_predict(output_dir, data_path, schema, target_name, model_export_dir,
                project, mode, batch_size):
  """Run predictions with the given model using Dataflow.

  Args:
    output_dir: output folder.
    data_path: test data file path.
    schema: schema list.
    target_name: target column name.
    model_export_dir: GCS or local path of exported model trained with tft preprocessed data.
    project: the project to run dataflow in.
    mode: whether the job should be 'local' or 'cloud'.
    batch_size: batch size when running prediction.
  """

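  # Classification vs. regression is inferred from the exported model's
  # assets: a vocabulary file named 'vocab_<target>' is present only when the
  # target column is categorical.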
  target_type = next(x for x in schema if x['name'] == target_name)['type']
  labels_file = os.path.join(model_export_dir, 'assets', 'vocab_' + target_name)
  is_classification = file_io.file_exists(labels_file)

  output_file_prefix = os.path.join(output_dir, 'prediction_results')
  output_schema_file = os.path.join(output_dir, 'schema.json')
  names = [x['name'] for x in schema]

  output_schema = [x for x in schema if x['name'] != target_name]
  if is_classification:
    with file_io.FileIO(labels_file, mode='r') as f:
      labels = [x.strip() for x in f.readlines()]

    output_schema.append({'name': 'target', 'type': 'CATEGORY'})
    output_schema.append({'name': 'predicted', 'type': 'CATEGORY'})
    output_schema.extend([{'name': x, 'type': 'NUMBER'} for x in labels])
  else:
    output_schema.append({'name': 'target', 'type': 'NUMBER'})
    output_schema.append({'name': 'predicted', 'type': 'NUMBER'})

  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'pipeline-predict-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
        'temp_location': os.path.join(output_dir, 'tmp'),
        'project': project,
        'setup_file': './setup.py',
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataFlowRunner'
  else:
    raise ValueError('Invalid mode %s.' % mode)

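  # Pipeline shape: read the CSV test data, move the target column to the end,
  # group rows into batches, run the TensorFlow predictor on each batch, then
  # unbatch the results, append the predicted value(s) to each source row, and
  # write the rows out as CSV along with a schema.json describing the columns.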
  with beam.Pipeline(runner, options=pipeline_options) as p:
    raw_results = (p
                   | 'read data' >> beam.io.ReadFromText(data_path)
                   | 'move target to last' >> beam.ParDo(TargetToLastDoFn(names, target_name))
                   | 'batch' >> beam.ParDo(EmitAsBatchDoFn(batch_size))
                   | 'predict' >> beam.ParDo(PredictDoFn(model_export_dir)))

    if is_classification:
      processed_results = (raw_results
                           | 'unbatch' >> beam.FlatMap(lambda x: zip(x['source'], x['scores']))
                           | 'get predicted' >> beam.Map(
                               lambda x: x[0] + [labels[x[1].argmax()]] + list(x[1])))
    else:
      processed_results = (raw_results
                           | 'unbatch' >> beam.FlatMap(lambda x: zip(x['source'], x['outputs']))
                           | 'get predicted' >> beam.Map(lambda x: x[0] + list(x[1])))

    results_save = (processed_results
                    | 'write csv lines' >> beam.ParDo(ListToCsvDoFn())
                    | 'write file' >> beam.io.WriteToText(output_file_prefix))

    (results_save
     | 'fixed one' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1)
     | 'set schema' >> beam.Map(lambda path: json.dumps(output_schema))
     | 'write schema file' >> beam.io.WriteToText(output_schema_file, shard_name_template=''))


def main():
  logging.getLogger().setLevel(logging.INFO)
  args = parse_arguments()
  # Models trained with an estimator are exported to a base/export/export/123456781 directory.
  # Our trainer exports only one model.
  export_parent_dir = os.path.join(args.model, 'export', 'export')
  model_export_dir = os.path.join(export_parent_dir, file_io.list_directory(export_parent_dir)[0])
  schema = json.loads(file_io.read_file_to_string(args.schema))

  run_predict(args.output, args.data, schema, args.target, model_export_dir,
              args.project, args.mode, args.batchsize)

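  # The remaining steps write component outputs following Kubeflow Pipelines
  # conventions: /output.txt holds the file pattern of the prediction results,
  # and /mlpipeline-ui-metadata.json tells the pipelines UI to render the
  # results as a CSV-backed table.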
  prediction_results = os.path.join(args.output, 'prediction_results-*')
  with open('/output.txt', 'w') as f:
    f.write(prediction_results)

  with file_io.FileIO(os.path.join(args.output, 'schema.json'), 'r') as f:
    schema = json.load(f)

  metadata = {
      'outputs': [{
          'type': 'table',
          'storage': 'gcs',
          'format': 'csv',
          'header': [x['name'] for x in schema],
          'source': prediction_results
      }]
  }
  with open('/mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)


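# Example invocation (script name and paths are illustrative):
#   python predict.py --project=my-gcp-project --mode=cloud \
#     --data=gs://my-bucket/test.csv --schema=gs://my-bucket/schema.json \
#     --model=gs://my-bucket/training-output --target=label \
#     --output=gs://my-bucket/prediction-output --batchsize=32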
if __name__ == "__main__":
  main()