examples/tensorflow_cuj/neural_machine_translation/tfjob-with-python-sdk.ipynb

425 lines
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
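"# Neural machine translation TFJob with the Kubeflow Python SDK\n",
"\n",
"This notebook defines a TFJob with Chief, Worker, and PS replicas, submits it with `TFJobClient`, waits for it to finish, reads the training logs, and finally deletes the job.\n",
"\n",
"### Import required libraries"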
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from kubernetes.client import V1PodTemplateSpec\n",
"from kubernetes.client import V1ObjectMeta\n",
"from kubernetes.client import V1PodSpec\n",
"from kubernetes.client import V1Container\n",
"\n",
"from kubeflow.tfjob import constants\n",
"from kubeflow.tfjob import utils\n",
"from kubeflow.tfjob import V1ReplicaSpec\n",
"from kubeflow.tfjob import V1TFJob\n",
"from kubeflow.tfjob import V1TFJobSpec\n",
"from kubeflow.tfjob import TFJobClient"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"namespace = utils.get_default_target_namespace()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define TFJob"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def make_replica_spec(container, replicas=1):\n",
"    \"\"\"Build the V1ReplicaSpec shared by every replica type in this TFJob.\n",
"\n",
"    Args:\n",
"        container: the V1Container each pod of the replica runs.\n",
"        replicas: number of pods for this replica type (1 in this example).\n",
"\n",
"    Returns:\n",
"        A new V1ReplicaSpec with restart policy \"OnFailure\" whose pod\n",
"        template is annotated with sidecar.istio.io/inject=false.\n",
"    \"\"\"\n",
"    return V1ReplicaSpec(\n",
"        replicas=replicas,\n",
"        restart_policy=\"OnFailure\",\n",
"        template=V1PodTemplateSpec(\n",
"            metadata=V1ObjectMeta(\n",
"                annotations={'sidecar.istio.io/inject':'false'}\n",
"            ),\n",
"            spec=V1PodSpec(\n",
"                containers=[container]\n",
"            )\n",
"        )\n",
"    )\n",
"\n",
"\n",
"# Training container; the same image and arguments are used by all replicas.\n",
"container = V1Container(\n",
"    name=\"tensorflow\",\n",
"    image=\"gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0\",\n",
"    args=[\n",
"        \"--learning_rate=0.01\",\n",
"        \"--epochs=2\",\n",
"        \"--batch_size=64\"\n",
"    ]\n",
")\n",
"\n",
"# The three replica types use identical specs, so build each from the helper\n",
"# instead of repeating the full V1ReplicaSpec literal.\n",
"worker = make_replica_spec(container)\n",
"chief = make_replica_spec(container)\n",
"ps = make_replica_spec(container)\n",
"\n",
"tfjob = V1TFJob(\n",
"    api_version=\"kubeflow.org/v1\",\n",
"    kind=\"TFJob\",\n",
"    metadata=V1ObjectMeta(name=\"neural-machine-translation\",namespace=namespace),\n",
"    spec=V1TFJobSpec(\n",
"        tf_replica_specs={\"Worker\": worker,\n",
"                          \"Chief\": chief,\n",
"                          \"PS\": ps}\n",
"    )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create TFJob"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'apiVersion': 'kubeflow.org/v1',\n",
" 'kind': 'TFJob',\n",
" 'metadata': {'creationTimestamp': '2020-07-26T17:09:31Z',\n",
" 'generation': 1,\n",
" 'name': 'neural-machine-translation',\n",
" 'namespace': 'kubeflow-mailsforyashj',\n",
" 'resourceVersion': '44981017',\n",
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/neural-machine-translation',\n",
" 'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'},\n",
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=2',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'PS': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=2',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'Worker': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=2',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
" 'name': 'tensorflow'}]}}}}}}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client = TFJobClient()\n",
"tfjob_client.create(tfjob, namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the created TFJob"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'apiVersion': 'kubeflow.org/v1',\n",
" 'kind': 'TFJob',\n",
" 'metadata': {'creationTimestamp': '2020-07-26T17:09:31Z',\n",
" 'generation': 1,\n",
" 'name': 'neural-machine-translation',\n",
" 'namespace': 'kubeflow-mailsforyashj',\n",
" 'resourceVersion': '44981088',\n",
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/neural-machine-translation',\n",
" 'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'},\n",
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=2',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'PS': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=2',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'Worker': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=2',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
" 'name': 'tensorflow'}]}}}}},\n",
" 'status': {'conditions': [{'lastTransitionTime': '2020-07-26T17:09:31Z',\n",
" 'lastUpdateTime': '2020-07-26T17:09:31Z',\n",
" 'message': 'TFJob neural-machine-translation is created.',\n",
" 'reason': 'TFJobCreated',\n",
" 'status': 'True',\n",
" 'type': 'Created'},\n",
" {'lastTransitionTime': '2020-07-26T17:09:34Z',\n",
" 'lastUpdateTime': '2020-07-26T17:09:34Z',\n",
" 'message': 'TFJob neural-machine-translation is running.',\n",
" 'reason': 'TFJobRunning',\n",
" 'status': 'True',\n",
" 'type': 'Running'}],\n",
" 'replicaStatuses': {'Chief': {'active': 1},\n",
" 'PS': {'active': 1},\n",
" 'Worker': {'active': 1}},\n",
" 'startTime': '2020-07-26T17:09:32Z'}}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.get('neural-machine-translation', namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the TFJob status to check whether the TFJob has started"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Running'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.get_job_status('neural-machine-translation', namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Wait for the TFJob to finish"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME STATE TIME \n",
"neural-machine-translation Running 2020-07-26T17:09:34Z \n",
"neural-machine-translation Running 2020-07-26T17:09:34Z \n",
"neural-machine-translation Succeeded 2020-07-26T17:11:20Z \n"
]
}
],
"source": [
"tfjob_client.wait_for_job('neural-machine-translation', namespace=namespace, watch=True, timeout_seconds=7200)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check if the TFJob succeeded"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.is_job_succeeded('neural-machine-translation', namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the TFJob training logs"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The logs of Pod neural-machine-translation-chief-0:\n",
" 2020-07-26 17:09:36.707474: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
"2020-07-26 17:09:36.707539: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)\n",
"2020-07-26 17:09:36.707630: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (neural-machine-translation-chief-0): /proc/driver/nvidia/version does not exist\n",
"2020-07-26 17:09:36.708220: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n",
"2020-07-26 17:09:36.717866: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz\n",
"2020-07-26 17:09:36.718530: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f5920000b20 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
"2020-07-26 17:09:36.718589: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n",
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
"Path to stored file: /app/dataset/spa-eng/spa.txt\n",
"epoch 1:\n",
"loss=2.66\n",
"\n",
"epoch 2:\n",
"loss=1.93\n",
"\n",
"\n"
]
}
],
"source": [
"tfjob_client.get_logs('neural-machine-translation', namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete the TFJob"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'kind': 'Status',\n",
" 'apiVersion': 'v1',\n",
" 'metadata': {},\n",
" 'status': 'Success',\n",
" 'details': {'name': 'neural-machine-translation',\n",
" 'group': 'kubeflow.org',\n",
" 'kind': 'tfjobs',\n",
" 'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'}}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.delete('neural-machine-translation', namespace=namespace)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}