mirror of https://github.com/kubeflow/examples.git
72 lines
3.5 KiB
Python
72 lines
3.5 KiB
Python
#!/usr/bin/env python
|
|
#coding: utf-8
|
|
|
|
#Text classification with an RNN
|
|
#This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) on the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.
|
|
|
|
import importlib
|
|
import argparse
|
|
import tensorflow as tf
|
|
import tensorflow_datasets as tfds
|
|
import os
|
|
|
|
def data_loader(hyperparams, local_data_dir):
    """Load the IMDB subword-encoded reviews and return batched datasets.

    Args:
        hyperparams: Mapping read for the keys ``'BUFFER_SIZE'`` (shuffle
            buffer size applied to the training split) and ``'BATCH_SIZE'``
            (batch size applied to both splits).
        local_data_dir: Directory handed to ``tfds.load`` as ``data_dir``.

    Returns:
        A ``(train_dataset, test_dataset, encoder)`` tuple: the shuffled,
        padded-batched training split, the padded-batched test split, and
        the encoder stored on the dataset's ``'text'`` feature.
    """
    splits, metadata = tfds.load(
        'imdb_reviews/subwords8k',
        data_dir=local_data_dir,
        with_info=True,
        as_supervised=True,
    )
    raw_train = splits['train']
    raw_test = splits['test']
    text_encoder = metadata.features['text'].encoder

    # Only the training split is shuffled; both splits go through
    # padded_batch with the same batch size.
    train_batches = (
        raw_train
        .shuffle(hyperparams['BUFFER_SIZE'])
        .padded_batch(hyperparams['BATCH_SIZE'], padded_shapes=None)
    )
    test_batches = raw_test.padded_batch(hyperparams['BATCH_SIZE'], padded_shapes=None)

    return train_batches, test_batches, text_encoder
|
|
|
|
def define_model(encoder):
    """Build the uncompiled Keras network used for review classification.

    Args:
        encoder: Object whose ``vocab_size`` attribute is used as the
            embedding layer's input dimension.

    Returns:
        A ``tf.keras.Sequential`` made of, in order: an embedding layer
        (64 dimensions), a bidirectional LSTM (64 units), a ReLU dense
        layer (64 units) and a single-unit dense output with no activation.
    """
    layers = tf.keras.layers

    network = tf.keras.Sequential()
    network.add(layers.Embedding(encoder.vocab_size, 64))
    network.add(layers.Bidirectional(layers.LSTM(64)))
    network.add(layers.Dense(64, activation='relu'))
    # The final layer has no activation, so the model emits one raw score
    # per example.
    network.add(layers.Dense(1))
    return network
|
|
|
|
class MovieReviewClassification(object):
    """Train and save the review classifier, printing validation metrics.

    Construction loads the datasets through ``data_loader``; ``train``
    builds the network through ``define_model``, fits it, saves it and
    prints the per-epoch validation loss and accuracy to stdout.
    """

    def __init__(self, learning_rate=0.01, batch_size=64, epochs=2, local_data_dir='/app/tensorflow_datasets'):
        """Store the hyperparameters and load the datasets.

        Args:
            learning_rate: Learning rate passed to the Adam optimizer.
            batch_size: Batch size used for both the train and test splits.
            epochs: Number of epochs passed to ``model.fit``.
            local_data_dir: Directory forwarded to ``data_loader``.
        """
        hyperparams = {'BUFFER_SIZE': 10000, 'BATCH_SIZE': batch_size}
        # Path handed to ``model.save`` at the end of ``train``.
        self.model_file = "lstm_trained"
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.train_dataset, self.test_dataset, self.encoder = data_loader(hyperparams, local_data_dir)

    def train(self):
        """Compile, fit and save the model, then print validation metrics.

        One block of the form ``epoch N:``, ``val_loss=...``,
        ``val_accuracy=...`` is printed for every epoch recorded in the
        training history.
        """
        model = define_model(self.encoder)
        model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.Adam(self.learning_rate),
            # The loss is configured with from_logits=True, i.e. the model
            # output is a raw logit. The decision boundary for a logit is
            # 0.0, not the 0.5 that the plain 'accuracy' shortcut applies,
            # so the threshold is set explicitly. The metric keeps the name
            # 'accuracy' so the history key stays 'val_accuracy'.
            metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
        )

        # Steps per epoch are reduced here to train on limited resources;
        # you are free to remove these arguments.
        # No ``shuffle`` argument is passed: Keras ignores it for
        # tf.data.Dataset inputs.
        history = model.fit(
            self.train_dataset,
            epochs=self.epochs,
            steps_per_epoch=30,
            validation_data=self.test_dataset,
            validation_steps=30,
            verbose=0,
        )
        model.save(self.model_file)

        val_losses = history.history['val_loss']
        val_accuracies = history.history['val_accuracy']
        # Epoch numbers are reported starting from 1.
        for epoch, (val_loss, val_accuracy) in enumerate(zip(val_losses, val_accuracies), start=1):
            print("epoch {}:\nval_loss={:.2f}\nval_accuracy={:.2f}\n".format(epoch, val_loss, val_accuracy))
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the hyperparameters, then train.
    # Numeric options are converted by argparse itself (type=...), so an
    # invalid value produces a usage error instead of a ValueError traceback.
    parser = argparse.ArgumentParser()
    parser.add_argument("-lr", "--learning_rate", type=float, default=1e-4,
                        help="Learning rate for the Keras optimizer")
    parser.add_argument("-bsz", "--batch_size", type=int, default=64,
                        help="Batch size for each step of learning")
    parser.add_argument("-e", "--epochs", type=int, default=2,
                        help="Number of epochs in each trial")
    args = parser.parse_args()

    learning_rate = args.learning_rate
    batch_size = args.batch_size
    epochs = args.epochs

    model = MovieReviewClassification(learning_rate, batch_size, epochs, local_data_dir="/app/tensorflow_datasets")
    model.train()