### Import required libraries

In [1]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container

from kubeflow.tfjob import constants
from kubeflow.tfjob import utils
from kubeflow.tfjob import V1ReplicaSpec
from kubeflow.tfjob import V1TFJob
from kubeflow.tfjob import V1TFJobSpec
from kubeflow.tfjob import TFJobClient

In [2]:
# Namespace that the TFJob metadata and every TFJobClient call below are scoped to.
# NOTE(review): presumably resolves to the notebook's own namespace when run
# in-cluster and to a fallback otherwise -- confirm against the SDK's utils module.
namespace = utils.get_default_target_namespace()

### Define TFJob

In [3]:
# Single container definition reused by the Worker, Chief and PS replica specs:
# the training image plus the flags handed to it as container arguments.
container = V1Container(
    name="tensorflow",
    image="gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0",
    args=["--learning_rate=0.01", "--epochs=2", "--batch_size=64"],
)

def _replica_spec(pod_container, replicas=1, restart_policy="OnFailure"):
    """Build the replica spec shared by every TFJob role in this notebook.

    Args:
        pod_container: The V1Container that the replica's pod runs.
        replicas: Number of pods for this role (defaults to 1).
        restart_policy: Restart policy string for the replica
            (defaults to "OnFailure").

    Returns:
        A new V1ReplicaSpec whose pod template runs only `pod_container`
        and carries the annotation ``sidecar.istio.io/inject: 'false'``.
    """
    return V1ReplicaSpec(
        replicas=replicas,
        restart_policy=restart_policy,
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(
                # Opt the training pods out of Istio sidecar injection.
                annotations={'sidecar.istio.io/inject': 'false'}
            ),
            spec=V1PodSpec(
                containers=[pod_container]
            ),
        ),
    )


# The three roles are configured identically; each call returns its own
# V1ReplicaSpec instance, all of them referencing the same container object.
worker = _replica_spec(container)
chief = _replica_spec(container)
ps = _replica_spec(container)

# Assemble the TFJob resource: identifying metadata plus one replica spec
# per role (Worker, Chief, PS).
tfjob = V1TFJob(
    api_version="kubeflow.org/v1",
    kind="TFJob",
    metadata=V1ObjectMeta(
        name="neural-machine-translation",
        namespace=namespace,
    ),
    spec=V1TFJobSpec(
        tf_replica_specs={
            "Worker": worker,
            "Chief": chief,
            "PS": ps,
        },
    ),
)

### Create TFJob

In [4]:
# Client object used by all remaining cells to manage the TFJob.
# NOTE(review): constructed with no arguments, so it presumably picks up the
# ambient cluster configuration/credentials -- confirm for your environment.
tfjob_client = TFJobClient()
# Submit the TFJob defined above; the returned object is shown as the cell output.
tfjob_client.create(tfjob, namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2020-07-26T17:09:31Z',
  'generation': 1,
  'name': 'neural-machine-translation',
  'namespace': 'kubeflow-mailsforyashj',
  'resourceVersion': '44981017',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/neural-machine-translation',
  'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'},
 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--learning_rate=0.01',
         '--epochs=2',
         '--batch_size=64'],
        'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',
        'name': 'tensorflow'}]}}},
   'PS': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--l

### Get the created TFJob

In [5]:
# Look the job up by the name given in its metadata; the result is displayed as the cell output.
tfjob_client.get('neural-machine-translation', namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2020-07-26T17:09:31Z',
  'generation': 1,
  'name': 'neural-machine-translation',
  'namespace': 'kubeflow-mailsforyashj',
  'resourceVersion': '44981088',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/neural-machine-translation',
  'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'},
 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--learning_rate=0.01',
         '--epochs=2',
         '--batch_size=64'],
        'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',
        'name': 'tensorflow'}]}}},
   'PS': {'replicas': 1,
    'restartPolicy': 'OnFailure',
    'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},
     'spec': {'containers': [{'args': ['--l

### Get the TFJob status to check whether the TFJob has started

In [6]:
# Ask for the job's current status; the recorded run displayed 'Running' here.
tfjob_client.get_job_status('neural-machine-translation', namespace=namespace)

'Running'

### Wait for the specified job to finish

In [7]:
# Wait for the job to finish, giving up after 7200 seconds (2 hours).
# NOTE(review): watch=True appears to print each state change as it happens
# (the NAME/STATE/TIME table in the recorded output) -- confirm against the SDK docs.
tfjob_client.wait_for_job('neural-machine-translation', namespace=namespace, watch=True, timeout_seconds=7200)

NAME                           STATE                TIME                          
neural-machine-translation     Running              2020-07-26T17:09:34Z          
neural-machine-translation     Running              2020-07-26T17:09:34Z          
neural-machine-translation     Succeeded            2020-07-26T17:11:20Z          


### Check if the TFJob succeeded

In [8]:
# Check whether the job succeeded; the recorded run displayed True.
tfjob_client.is_job_succeeded('neural-machine-translation', namespace=namespace)

True

### Get the TFJob training logs

In [9]:
# Fetch the job's training logs; the recorded run shows the logs of pod
# neural-machine-translation-chief-0.
tfjob_client.get_logs('neural-machine-translation', namespace=namespace)

The logs of Pod neural-machine-translation-chief-0:
 2020-07-26 17:09:36.707474: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-07-26 17:09:36.707539: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2020-07-26 17:09:36.707630: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (neural-machine-translation-chief-0): /proc/driver/nvidia/version does not exist
2020-07-26 17:09:36.708220: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-07-26 17:09:36.717866: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz
2020-07-26 17:09:36.718530: I tensorflow/compiler/xla/service/service.cc:168] XLA servi

### Delete the TFJob

In [10]:
# Clean up by deleting the TFJob; the returned status object is displayed as the cell output.
tfjob_client.delete('neural-machine-translation', namespace=namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'neural-machine-translation',
  'group': 'kubeflow.org',
  'kind': 'tfjobs',
  'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'}}