# Text classification with an RNN

This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) on the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.

In [1]:
from kubeflow import fairing
from kubeflow.fairing import TrainJob
import importlib
import argparse
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [2]:
def data_loader(hyperparams, local_data_dir):
    """Load the IMDB subword-encoded reviews and return batched datasets.

    Args:
        hyperparams: Mapping that must provide 'BUFFER_SIZE' (shuffle buffer
            for the training split) and 'BATCH_SIZE' (batch size for both
            splits).
        local_data_dir: Directory handed to ``tfds.load`` as ``data_dir``.

    Returns:
        A ``(train_dataset, test_dataset, encoder)`` tuple: the shuffled and
        padded-batched training split, the padded-batched test split, and the
        text encoder taken from the dataset info.
    """
    splits, metadata = tfds.load(
        'imdb_reviews/subwords8k',
        data_dir=local_data_dir,
        with_info=True,
        as_supervised=True,
    )
    train_split = splits['train']
    test_split = splits['test']
    subword_encoder = metadata.features['text'].encoder

    # Only the training split is shuffled; the test split keeps its order.
    shuffled_train = train_split.shuffle(hyperparams['BUFFER_SIZE'])

    # Both splits share the same batch size. padded_shapes=None is passed
    # explicitly, as in the original call (NOTE(review): presumably lets
    # tf.data infer the padding per batch -- confirm for the TF version used).
    batch_size = hyperparams['BATCH_SIZE']
    train_batches = shuffled_train.padded_batch(batch_size, padded_shapes=None)
    test_batches = test_split.padded_batch(batch_size, padded_shapes=None)
    return train_batches, test_batches, subword_encoder

In [3]:
def define_model(encoder):
    """Build the (uncompiled) Keras model used for review classification.

    Args:
        encoder: Object exposing ``vocab_size``; it sets the input dimension
            of the embedding layer.

    Returns:
        A ``tf.keras.Sequential`` model: embedding -> bidirectional LSTM ->
        dense ReLU -> single-unit dense output with no activation.
    """
    layers = tf.keras.layers
    stack = [
        layers.Embedding(encoder.vocab_size, 64),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        # No activation on the final unit: the output is a raw score.
        layers.Dense(1),
    ]
    return tf.keras.Sequential(stack)

In [4]:
class MovieReviewClassification(object):
    """Train, save and evaluate the movie-review sentiment model.

    The datasets are loaded in the constructor; ``train()`` builds the model,
    fits it, saves it and prints the test loss and accuracy.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        batch_size: Batch size used for both the training and test datasets.
        epochs: Number of epochs passed to ``model.fit``.
        local_data_dir: Directory from which ``data_loader`` reads the dataset.
        buffer_size: Shuffle buffer size for the training dataset.
        validation_steps: Number of validation batches evaluated per epoch
            during ``model.fit``.
        model_file: Path handed to ``model.save`` for the trained model.
    """

    def __init__(self, learning_rate=1e-4, batch_size=64, epochs=2,
                 local_data_dir='/app/tensorflow_datasets',
                 buffer_size=10000, validation_steps=30,
                 model_file='lstm_trained'):
        # Keys match what data_loader reads from the mapping.
        hyperparams = {'BUFFER_SIZE': buffer_size, 'BATCH_SIZE': batch_size}
        self.model_file = model_file
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.validation_steps = validation_steps
        self.train_dataset, self.test_dataset, self.encoder = data_loader(
            hyperparams, local_data_dir)

    def train(self):
        """Fit the model, save it to ``self.model_file`` and print test metrics."""
        model = define_model(self.encoder)
        # from_logits=True because the model's last layer is Dense(1) with no
        # activation, i.e. it emits raw scores rather than probabilities.
        model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(self.learning_rate),
                      metrics=['accuracy'])
        # The History object returned by fit() is not needed here.
        model.fit(self.train_dataset, epochs=self.epochs,
                  validation_data=self.test_dataset,
                  validation_steps=self.validation_steps)
        model.save(self.model_file)
        # Final evaluation runs over the full test dataset.
        test_loss, test_acc = model.evaluate(self.test_dataset)
        print('Test Loss: {}'.format(test_loss))
        print('Test Accuracy: {}'.format(test_acc))

In [5]:
# Submit the training class as a Kubeflow Fairing job from within Jupyter.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
DOCKER_REGISTRY = f'gcr.io/{GCP_PROJECT}/fairing-job'
BuildContext = None
FAIRING_BACKEND = 'KubeflowGKEBackend'
# Resolve the backend class by name from the fairing backends module.
_backends_module = importlib.import_module('kubeflow.fairing.backends')
BackendClass = getattr(_backends_module, FAIRING_BACKEND)

# Files handed to the job as input_files: the downloaded archive (plus its
# .INFO sidecar), the prepared subwords8k dataset files, and requirements.txt.
# NOTE(review): presumably these are copied into the image so the dataset is
# available under /app/tensorflow_datasets at run time -- confirm.
_archive_path = 'tensorflow_datasets/downloads/ai.stanfor.edu_amaas_sentime_aclImdb_v1xA90oY07YfkP66HhdzDg046Ll8Bf3nAIlC6Rkj0WWP4.tar.gz'
_prepared_dir = 'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/'
_prepared_names = (
    'dataset_info.json',
    'imdb_reviews-test.tfrecord-00000-of-00001',
    'imdb_reviews-train.tfrecord-00000-of-00001',
    'imdb_reviews-unsupervised.tfrecord-00000-of-00001',
    'label.labels.txt',
    'text.text.subwords',
)
data_files = [_archive_path, _archive_path + '.INFO']
data_files.extend(_prepared_dir + file_name for file_name in _prepared_names)
data_files.append('requirements.txt')

train_job = TrainJob(
    MovieReviewClassification,
    input_files=data_files,
    docker_registry=DOCKER_REGISTRY,
    backend=BackendClass(build_context_source=BuildContext),
)
train_job.submit()


[W 200703 10:01:34 tasks:54] Using default base docker image: registry.hub.docker.com/library/python:3.6.9
[W 200703 10:01:34 tasks:62] Using builder: <class 'kubeflow.fairing.builders.cluster.cluster.ClusterBuilder'>
[I 200703 10:01:34 tasks:66] Building the docker image.
[I 200703 10:01:34 cluster:46] Building image using cluster builder.
[W 200703 10:01:34 base:94] /home/jovyan/.local/lib/python3.6/site-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 200703 10:01:34 base:107] Creating docker context: /tmp/fairing_context_fh52_czq
[W 200703 10:01:34 base:94] /home/jovyan/.local/lib/python3.6/site-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[W 200703 10:01:48 manager:298] Waiting for fairing-builder-64vxg-8d45k to start...
[W 200703 10:01:48 manager:298] Waiting for fairing-builder-64vxg-8d45k to start...
[W 200703 10:01:48 manager:298] Waiting for fairing-builder-64vxg-8d45k to start...
[I 200703 10:01:5

E0703 10:01:54.197364       1 aws_credentials.go:77] while getting AWS credentials NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
E0703 10:01:54.207742       1 metadata.go:248] Failed to unmarshal scopes: json: cannot unmarshal string into Go value of type []string
[36mINFO[0m[0004] Retrieving image manifest registry.hub.docker.com/library/python:3.6.9
E0703 10:01:54.657863       1 metadata.go:154] while reading 'google-dockercfg' metadata: http status code: 404 while fetching url http://metadata.google.internal./computeMetadata/v1/instance/attributes/google-dockercfg
E0703 10:01:54.659838       1 metadata.go:166] while reading 'google-dockercfg-url' metadata: http status code: 404 while fetching url http://metadata.google.internal./computeMetadata/v1/instance/attributes/google-dockercfg-url
[36mINFO[0m[0007] Retrieving image manifest registry.hub.docker.com/library/python:3.6.9
[36mINFO[0m[0009]

[W 200703 10:03:30 job:101] The job fairing-job-qwn9z launched.
[W 200703 10:03:30 manager:298] Waiting for fairing-job-qwn9z-5qkpt to start...
[W 200703 10:03:30 manager:298] Waiting for fairing-job-qwn9z-5qkpt to start...
[W 200703 10:03:30 manager:298] Waiting for fairing-job-qwn9z-5qkpt to start...
[I 200703 10:04:10 manager:304] Pod started running True


TFDS datasets with text encoding are deprecated and will be removed in a future version. Instead, you should use the plain text version and tokenize the text using `tensorflow_text` (See: https://www.tensorflow.org/tutorials/tensorflow_text/intro#tfdata_example)
Load dataset info from /app/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0
Reusing dataset imdb_reviews (/app/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0)
Constructing tf.data.Dataset for split None, from /app/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0
2020-07-03 10:04:13.872255: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-07-03 10:04:13.872495: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2020-07-03 10:04:13.872534: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be 

2020-07-03 10:21:13.187930: W tensorflow/python/util/util.cc:329] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
From /usr/local/lib/python3.6/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Assets written to: lstm_trained/assets
  6/391 [..............................] - ETA: 1:58 - loss: 0.4125 - accuracy: 0.84
 12/391 [..............................] - ETA: 2:03 - loss: 0.3715 - accuracy: 0.85
 18/391 [>.............................] - ETA: 2:06 - loss: 0.3678 - accuracy: 0.85
 24/391 [>.............................] - ETA: 2:05 - loss: 0.3618 - accuracy: 0.85
 30/391 [=>............................] - ETA: 2:02 - loss: 0.3543 - accuracy: 0.86
 36/391 [=>........

[W 200703 10:23:43 job:173] Cleaning up job fairing-job-qwn9z...


'fairing-job-qwn9z'