mirror of https://github.com/kubeflow/examples.git
425 lines
14 KiB
Plaintext
425 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Import required libraries"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from kubernetes.client import V1PodTemplateSpec\n",
|
|
"from kubernetes.client import V1ObjectMeta\n",
|
|
"from kubernetes.client import V1PodSpec\n",
|
|
"from kubernetes.client import V1Container\n",
|
|
"\n",
|
|
"from kubeflow.tfjob import constants\n",
|
|
"from kubeflow.tfjob import utils\n",
|
|
"from kubeflow.tfjob import V1ReplicaSpec\n",
|
|
"from kubeflow.tfjob import V1TFJob\n",
|
|
"from kubeflow.tfjob import V1TFJobSpec\n",
|
|
"from kubeflow.tfjob import TFJobClient"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"namespace = utils.get_default_target_namespace()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Define TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"container = V1Container(\n",
|
|
" name=\"tensorflow\",\n",
|
|
" image=\"gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0\",\n",
|
|
" args=[\n",
|
|
" \"--learning_rate=0.01\",\n",
|
|
" \"--epochs=2\",\n",
|
|
" \"--batch_size=64\"\n",
|
|
" ]\n",
|
|
")\n",
|
|
"\n",
|
|
"worker = V1ReplicaSpec(\n",
|
|
" replicas=1,\n",
|
|
" restart_policy=\"OnFailure\",\n",
|
|
" template=V1PodTemplateSpec(\n",
|
|
" metadata=V1ObjectMeta(\n",
|
|
" annotations={'sidecar.istio.io/inject':'false'}\n",
|
|
" ),\n",
|
|
" spec=V1PodSpec(\n",
|
|
" containers=[container]\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"chief = V1ReplicaSpec(\n",
|
|
" replicas=1,\n",
|
|
" restart_policy=\"OnFailure\",\n",
|
|
" template=V1PodTemplateSpec(\n",
|
|
" metadata=V1ObjectMeta(\n",
|
|
" annotations={'sidecar.istio.io/inject':'false'}\n",
|
|
" ),\n",
|
|
" spec=V1PodSpec(\n",
|
|
" containers=[container]\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"ps = V1ReplicaSpec(\n",
|
|
" replicas=1,\n",
|
|
" restart_policy=\"OnFailure\",\n",
|
|
" template=V1PodTemplateSpec(\n",
|
|
" metadata=V1ObjectMeta(\n",
|
|
" annotations={'sidecar.istio.io/inject':'false'}\n",
|
|
" ),\n",
|
|
" spec=V1PodSpec(\n",
|
|
" containers=[container]\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"tfjob = V1TFJob(\n",
|
|
" api_version=\"kubeflow.org/v1\",\n",
|
|
" kind=\"TFJob\",\n",
|
|
" metadata=V1ObjectMeta(name=\"neural-machine-translation\",namespace=namespace),\n",
|
|
" spec=V1TFJobSpec(\n",
|
|
" tf_replica_specs={\"Worker\": worker,\n",
|
|
" \"Chief\": chief,\n",
|
|
" \"PS\": ps}\n",
|
|
" )\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Create TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'apiVersion': 'kubeflow.org/v1',\n",
|
|
" 'kind': 'TFJob',\n",
|
|
" 'metadata': {'creationTimestamp': '2020-07-26T17:09:31Z',\n",
|
|
" 'generation': 1,\n",
|
|
" 'name': 'neural-machine-translation',\n",
|
|
" 'namespace': 'kubeflow-mailsforyashj',\n",
|
|
" 'resourceVersion': '44981017',\n",
|
|
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/neural-machine-translation',\n",
|
|
" 'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'},\n",
|
|
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=2',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'PS': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=2',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'Worker': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=2',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
|
|
" 'name': 'tensorflow'}]}}}}}}"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client = TFJobClient()\n",
|
|
"tfjob_client.create(tfjob, namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Get the created TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'apiVersion': 'kubeflow.org/v1',\n",
|
|
" 'kind': 'TFJob',\n",
|
|
" 'metadata': {'creationTimestamp': '2020-07-26T17:09:31Z',\n",
|
|
" 'generation': 1,\n",
|
|
" 'name': 'neural-machine-translation',\n",
|
|
" 'namespace': 'kubeflow-mailsforyashj',\n",
|
|
" 'resourceVersion': '44981088',\n",
|
|
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/neural-machine-translation',\n",
|
|
" 'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'},\n",
|
|
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=2',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'PS': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=2',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'Worker': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=2',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_neural_machine_translation:1.0',\n",
|
|
" 'name': 'tensorflow'}]}}}}},\n",
|
|
" 'status': {'conditions': [{'lastTransitionTime': '2020-07-26T17:09:31Z',\n",
|
|
" 'lastUpdateTime': '2020-07-26T17:09:31Z',\n",
|
|
" 'message': 'TFJob neural-machine-translation is created.',\n",
|
|
" 'reason': 'TFJobCreated',\n",
|
|
" 'status': 'True',\n",
|
|
" 'type': 'Created'},\n",
|
|
" {'lastTransitionTime': '2020-07-26T17:09:34Z',\n",
|
|
" 'lastUpdateTime': '2020-07-26T17:09:34Z',\n",
|
|
" 'message': 'TFJob neural-machine-translation is running.',\n",
|
|
" 'reason': 'TFJobRunning',\n",
|
|
" 'status': 'True',\n",
|
|
" 'type': 'Running'}],\n",
|
|
" 'replicaStatuses': {'Chief': {'active': 1},\n",
|
|
" 'PS': {'active': 1},\n",
|
|
" 'Worker': {'active': 1}},\n",
|
|
" 'startTime': '2020-07-26T17:09:32Z'}}"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.get('neural-machine-translation', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Get the TFJob status, check if the TFJob has been started"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'Running'"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.get_job_status('neural-machine-translation', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Wait for the specified job to finish"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"NAME STATE TIME \n",
|
|
"neural-machine-translation Running 2020-07-26T17:09:34Z \n",
|
|
"neural-machine-translation Running 2020-07-26T17:09:34Z \n",
|
|
"neural-machine-translation Succeeded 2020-07-26T17:11:20Z \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.wait_for_job('neural-machine-translation', namespace=namespace, watch=True, timeout_seconds=7200)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Check if the TFJob succeeded"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.is_job_succeeded('neural-machine-translation', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Get the TFJob training logs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The logs of Pod neural-machine-translation-chief-0:\n",
|
|
" 2020-07-26 17:09:36.707474: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
|
|
"2020-07-26 17:09:36.707539: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)\n",
|
|
"2020-07-26 17:09:36.707630: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (neural-machine-translation-chief-0): /proc/driver/nvidia/version does not exist\n",
|
|
"2020-07-26 17:09:36.708220: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n",
|
|
"2020-07-26 17:09:36.717866: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz\n",
|
|
"2020-07-26 17:09:36.718530: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f5920000b20 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
|
|
"2020-07-26 17:09:36.718589: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n",
|
|
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
|
|
"Path to stored file: /app/dataset/spa-eng/spa.txt\n",
|
|
"epoch 1:\n",
|
|
"loss=2.66\n",
|
|
"\n",
|
|
"epoch 2:\n",
|
|
"loss=1.93\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.get_logs('neural-machine-translation', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Delete the TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'kind': 'Status',\n",
|
|
" 'apiVersion': 'v1',\n",
|
|
" 'metadata': {},\n",
|
|
" 'status': 'Success',\n",
|
|
" 'details': {'name': 'neural-machine-translation',\n",
|
|
" 'group': 'kubeflow.org',\n",
|
|
" 'kind': 'tfjobs',\n",
|
|
" 'uid': 'c567c3d2-cf62-11ea-89e4-42010a8c0018'}}"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.delete('neural-machine-translation', namespace=namespace)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|