# Source: examples/tensorflow_cuj/text_classification/text_classification_rnn.py
#!/usr/bin/env python
#coding: utf-8
#Text classification with an RNN
#This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) on the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.
import importlib
import argparse
import tensorflow as tf
import tensorflow_datasets as tfds
import os
def data_loader(hyperparams, local_data_dir):
    """Load the subword-encoded IMDB reviews dataset and build batched pipelines.

    Args:
        hyperparams: dict providing 'BUFFER_SIZE' (shuffle buffer length)
            and 'BATCH_SIZE' (examples per batch).
        local_data_dir: directory where TFDS caches/reads the dataset files.

    Returns:
        Tuple of (train_dataset, test_dataset, encoder): padded-batched
        tf.data pipelines plus the text encoder taken from dataset metadata.
    """
    dataset, info = tfds.load('imdb_reviews/subwords8k',
                              data_dir=local_data_dir,
                              with_info=True,
                              as_supervised=True)
    encoder = info.features['text'].encoder
    batch_size = hyperparams['BATCH_SIZE']
    # Only the training split is shuffled; both splits are padded per batch
    # (padded_shapes=None pads every dimension to the batch maximum).
    train_ds = (dataset['train']
                .shuffle(hyperparams['BUFFER_SIZE'])
                .padded_batch(batch_size, padded_shapes=None))
    test_ds = dataset['test'].padded_batch(batch_size, padded_shapes=None)
    return train_ds, test_ds, encoder
def define_model(encoder):
    """Build the sentiment model: embedding -> bidirectional LSTM -> dense head.

    Args:
        encoder: subword text encoder; its vocab_size fixes the embedding input dim.

    Returns:
        An uncompiled tf.keras.Sequential model whose final layer emits a
        single raw logit (pair it with a from_logits=True loss).
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.Embedding(encoder.vocab_size, 64),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),  # single logit for binary sentiment
    ])
class MovieReviewClassification(object):
    """Train and evaluate an LSTM sentiment classifier on IMDB reviews."""

    def __init__(self, learning_rate=0.01, batch_size=64, epochs=2, local_data_dir='/app/tensorflow_datasets'):
        """Prepare the data pipelines and record training settings.

        Args:
            learning_rate: Adam optimizer step size.
            batch_size: examples per training batch.
            epochs: number of training epochs.
            local_data_dir: where TFDS caches the IMDB dataset.
        """
        hyperparams = {'BUFFER_SIZE': 10000, 'BATCH_SIZE': batch_size}
        self.model_file = "lstm_trained"  # SavedModel output location
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.train_dataset, self.test_dataset, self.encoder = data_loader(hyperparams, local_data_dir)

    def train(self):
        """Compile and fit the model, save it, then print validation metrics."""
        model = define_model(self.encoder)
        model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(self.learning_rate),
                      metrics=['accuracy'])
        # steps_per_epoch / validation_steps are capped at 30 so the example
        # trains quickly on limited resources; remove them to use full data.
        history = model.fit(self.train_dataset,
                            epochs=self.epochs,
                            shuffle=True,
                            steps_per_epoch=30,
                            validation_data=self.test_dataset,
                            validation_steps=30,
                            verbose=0)
        model.save(self.model_file)
        recorded = history.history
        per_epoch = zip(recorded['val_loss'], recorded['val_accuracy'])
        for epoch_num, (loss_val, acc_val) in enumerate(per_epoch, start=1):
            print("epoch {}:\nval_loss={:.2f}\nval_accuracy={:.2f}\n".format(epoch_num, loss_val, acc_val))
if __name__ == "__main__":
    # Let argparse perform the numeric conversion (type=...) so an invalid
    # value produces a clean usage error instead of a ValueError traceback;
    # defaults are stored as real numbers rather than strings.
    parser = argparse.ArgumentParser()
    parser.add_argument("-lr", "--learning_rate", type=float, default=1e-4,
                        help="Learning rate for the Keras optimizer")
    parser.add_argument("-bsz", "--batch_size", type=int, default=64,
                        help="Batch size for each step of learning")
    parser.add_argument("-e", "--epochs", type=int, default=2,
                        help="Number of epochs in each trial")
    args = parser.parse_args()
    model = MovieReviewClassification(args.learning_rate, args.batch_size, args.epochs,
                                      local_data_dir="/app/tensorflow_datasets")
    model.train()