### Import required libraries

In [1]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container

from kubeflow.tfjob import constants
from kubeflow.tfjob import utils
from kubeflow.tfjob import V1ReplicaSpec
from kubeflow.tfjob import V1TFJob
from kubeflow.tfjob import V1TFJobSpec
from kubeflow.tfjob import TFJobClient

In [2]:
# Resolve the target namespace once; every later cell passes this same value
# explicitly as `namespace=` when creating, querying and deleting the TFJob.
namespace = utils.get_default_target_namespace()

### Define TFJob

In [3]:
# The one training container used by every replica type in this job.
# The args are forwarded to the image's entrypoint as command-line flags.
container = V1Container(
    name="tensorflow",
    image="gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1",
    args=[
        "--learning_rate=0.01",
        "--epochs=1",
        "--batch_size=64",
    ],
)


def _make_replica_spec(pod_container, replicas=1, restart_policy="OnFailure"):
    """Build the replica spec shared by the Worker, Chief and PS roles.

    All three roles in this notebook use the same pod template, so the
    construction lives in one place instead of being copy-pasted per role.

    Args:
        pod_container: The V1Container placed in the pod's container list.
        replicas: Number of replicas for the role (1 for every role here).
        restart_policy: Restart policy string for the role's replicas.

    Returns:
        A new V1ReplicaSpec whose pod template carries the
        'sidecar.istio.io/inject': 'false' annotation and runs `pod_container`.
    """
    pod_template = V1PodTemplateSpec(
        # NOTE(review): presumably opts the training pods out of Istio
        # sidecar injection - confirm against the cluster setup.
        metadata=V1ObjectMeta(
            annotations={'sidecar.istio.io/inject': 'false'}
        ),
        spec=V1PodSpec(containers=[pod_container]),
    )
    return V1ReplicaSpec(
        replicas=replicas,
        restart_policy=restart_policy,
        template=pod_template,
    )


# One independent spec object per role; each wraps the same `container`.
worker = _make_replica_spec(container)
chief = _make_replica_spec(container)
ps = _make_replica_spec(container)

# Assemble the TFJob resource from the three role specs.
tfjob = V1TFJob(
    api_version="kubeflow.org/v1",
    kind="TFJob",
    metadata=V1ObjectMeta(name="text-classification", namespace=namespace),
    spec=V1TFJobSpec(
        tf_replica_specs={
            "Worker": worker,
            "Chief": chief,
            "PS": ps,
        }
    ),
)

### Create TFJob

In [4]:
# Build the SDK client that every later cell reuses, then submit the TFJob
# defined above. The call's return value is the cell output.
tfjob_client = TFJobClient()
tfjob_client.create(
    tfjob,
    namespace=namespace,
)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2020-07-25T12:03:06Z',
  'generation': 1,
  'name': 'text-classification',
  'namespace': 'kubeflow-mailsforyashj',
  'resourceVersion': '43826614',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/text-classification',
  'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'},
 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--learning_rate=0.01',
         '--epochs=1',
         '--batch_size=64'],
        'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',
        'name': 'tensorflow'}]}}},
   'PS': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--learning_rate=0.01',
 

### Get the created TFJob

In [5]:
# Read the TFJob back by name from the same namespace it was created in.
tfjob_client.get(
    'text-classification',
    namespace=namespace,
)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2020-07-25T12:03:06Z',
  'generation': 1,
  'name': 'text-classification',
  'namespace': 'kubeflow-mailsforyashj',
  'resourceVersion': '43826645',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/text-classification',
  'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'},
 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--learning_rate=0.01',
         '--epochs=1',
         '--batch_size=64'],
        'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',
        'name': 'tensorflow'}]}}},
   'PS': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--learning_rate=0.01',
 

### Get the TFJob status to check whether the TFJob has started

In [6]:
# Query the job's current status; the returned value is the cell output.
tfjob_client.get_job_status(
    'text-classification',
    namespace=namespace,
)

'Created'

### Wait for the specified job to finish

In [7]:
# Wait on the job with watch=True, which is what prints the NAME / STATE / TIME
# rows shown in the output below.
tfjob_client.wait_for_job(
    'text-classification',
    namespace=namespace,
    watch=True,
    timeout_seconds=2 * 60 * 60,  # 7200; presumably the maximum wait, i.e. two hours
)

NAME                           STATE                TIME                          
text-classification            Created              2020-07-25T12:03:06Z          
text-classification            Created              2020-07-25T12:03:06Z          
text-classification            Created              2020-07-25T12:03:06Z          
text-classification            Running              2020-07-25T12:03:55Z          
text-classification            Succeeded            2020-07-25T12:05:05Z          


### Check if the TFJob succeeded

In [8]:
# Ask the SDK whether the job succeeded; the returned value is the cell output.
tfjob_client.is_job_succeeded(
    'text-classification',
    namespace=namespace,
)

True

### Get the TFJob training logs

In [9]:
# Fetch the training logs for the job. The captured output below shows the
# logs of pod text-classification-chief-0.
tfjob_client.get_logs(
    'text-classification',
    namespace=namespace,
)

The logs of Pod text-classification-chief-0:
2020-07-25 12:03:56.128380: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-07-25 12:03:56.128439: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2020-07-25 12:03:56.128472: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (text-classification-chief-0): /proc/driver/nvidia/version does not exist
2020-07-25 12:03:56.128780: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-07-25 12:03:56.137474: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz
2020-07-25 12:03:56.138397: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa5a0000b

### Delete the TFJob

In [10]:
# Clean up: delete the TFJob by name from the namespace it was created in.
tfjob_client.delete(
    'text-classification',
    namespace=namespace,
)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'text-classification',
  'group': 'kubeflow.org',
  'kind': 'tfjobs',
  'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'}}