examples/tensorflow_cuj/text_classification/tfjob-with-python-sdk.ipynb

425 lines
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import required libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from kubernetes.client import V1PodTemplateSpec\n",
"from kubernetes.client import V1ObjectMeta\n",
"from kubernetes.client import V1PodSpec\n",
"from kubernetes.client import V1Container\n",
"\n",
"from kubeflow.tfjob import constants\n",
"from kubeflow.tfjob import utils\n",
"from kubeflow.tfjob import V1ReplicaSpec\n",
"from kubeflow.tfjob import V1TFJob\n",
"from kubeflow.tfjob import V1TFJobSpec\n",
"from kubeflow.tfjob import TFJobClient"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"namespace = utils.get_default_target_namespace()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define TFJob"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def make_replica_spec(container, replicas=1):\n",
"    \"\"\"Build the replica spec shared by every TFJob role in this example.\n",
"\n",
"    Args:\n",
"        container: V1Container to run in each pod of the replica set.\n",
"        replicas: Number of pods to start for the role.\n",
"\n",
"    Returns:\n",
"        A new V1ReplicaSpec with restart policy OnFailure whose pod\n",
"        template carries the 'sidecar.istio.io/inject': 'false' annotation.\n",
"    \"\"\"\n",
"    return V1ReplicaSpec(\n",
"        replicas=replicas,\n",
"        restart_policy=\"OnFailure\",\n",
"        template=V1PodTemplateSpec(\n",
"            metadata=V1ObjectMeta(\n",
"                annotations={'sidecar.istio.io/inject':'false'}\n",
"            ),\n",
"            spec=V1PodSpec(\n",
"                containers=[container]\n",
"            )\n",
"        )\n",
"    )\n",
"\n",
"\n",
"container = V1Container(\n",
"    name=\"tensorflow\",\n",
"    image=\"gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1\",\n",
"    args=[\n",
"        \"--learning_rate=0.01\",\n",
"        \"--epochs=1\",\n",
"        \"--batch_size=64\"\n",
"    ]\n",
")\n",
"\n",
"# Worker, Chief and PS all run the same container with one replica each,\n",
"# so each role gets its own spec object built by the same helper.\n",
"worker = make_replica_spec(container)\n",
"chief = make_replica_spec(container)\n",
"ps = make_replica_spec(container)\n",
"\n",
"tfjob = V1TFJob(\n",
"    api_version=\"kubeflow.org/v1\",\n",
"    kind=\"TFJob\",\n",
"    metadata=V1ObjectMeta(name=\"text-classification\",namespace=namespace),\n",
"    spec=V1TFJobSpec(\n",
"        tf_replica_specs={\"Worker\": worker,\n",
"                          \"Chief\": chief,\n",
"                          \"PS\": ps}\n",
"    )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create TFJob"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'apiVersion': 'kubeflow.org/v1',\n",
" 'kind': 'TFJob',\n",
" 'metadata': {'creationTimestamp': '2020-07-25T12:03:06Z',\n",
" 'generation': 1,\n",
" 'name': 'text-classification',\n",
" 'namespace': 'kubeflow-mailsforyashj',\n",
" 'resourceVersion': '43826614',\n",
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/text-classification',\n",
" 'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'},\n",
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=1',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'PS': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=1',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'Worker': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=1',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
" 'name': 'tensorflow'}]}}}}}}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client = TFJobClient()\n",
"tfjob_client.create(tfjob, namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the created TFJob"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'apiVersion': 'kubeflow.org/v1',\n",
" 'kind': 'TFJob',\n",
" 'metadata': {'creationTimestamp': '2020-07-25T12:03:06Z',\n",
" 'generation': 1,\n",
" 'name': 'text-classification',\n",
" 'namespace': 'kubeflow-mailsforyashj',\n",
" 'resourceVersion': '43826645',\n",
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/text-classification',\n",
" 'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'},\n",
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=1',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'PS': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=1',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
" 'name': 'tensorflow'}]}}},\n",
" 'Worker': {'replicas': 1,\n",
" 'restartPolicy': 'OnFailure',\n",
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
" '--epochs=1',\n",
" '--batch_size=64'],\n",
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
" 'name': 'tensorflow'}]}}}}},\n",
" 'status': {'conditions': [{'lastTransitionTime': '2020-07-25T12:03:06Z',\n",
" 'lastUpdateTime': '2020-07-25T12:03:06Z',\n",
" 'message': 'TFJob text-classification is created.',\n",
" 'reason': 'TFJobCreated',\n",
" 'status': 'True',\n",
" 'type': 'Created'}],\n",
" 'replicaStatuses': {'Chief': {}, 'PS': {}, 'Worker': {}},\n",
" 'startTime': '2020-07-25T12:03:06Z'}}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.get(tfjob.metadata.name, namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the TFJob status to check whether the TFJob has started"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Created'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.get_job_status(tfjob.metadata.name, namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Wait for the specified job to finish"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME STATE TIME \n",
"text-classification Created 2020-07-25T12:03:06Z \n",
"text-classification Created 2020-07-25T12:03:06Z \n",
"text-classification Created 2020-07-25T12:03:06Z \n",
"text-classification Running 2020-07-25T12:03:55Z \n",
"text-classification Succeeded 2020-07-25T12:05:05Z \n"
]
}
],
"source": [
"tfjob_client.wait_for_job(\n",
"    tfjob.metadata.name,\n",
"    namespace=namespace,\n",
"    watch=True,\n",
"    timeout_seconds=7200,  # 7200 s = 2 hours\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check if the TFJob succeeded"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.is_job_succeeded(tfjob.metadata.name, namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the TFJob training logs"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The logs of Pod text-classification-chief-0:\n",
" WARNING:absl:TFDS datasets with text encoding are deprecated and will be removed in a future version. Instead, you should use the plain text version and tokenize the text using `tensorflow_text` (See: https://www.tensorflow.org/tutorials/tensorflow_text/intro#tfdata_example)\n",
"2020-07-25 12:03:56.128380: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
"2020-07-25 12:03:56.128439: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)\n",
"2020-07-25 12:03:56.128472: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (text-classification-chief-0): /proc/driver/nvidia/version does not exist\n",
"2020-07-25 12:03:56.128780: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n",
"2020-07-25 12:03:56.137474: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz\n",
"2020-07-25 12:03:56.138397: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa5a0000b20 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
"2020-07-25 12:03:56.138442: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n",
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
"2020-07-25 12:04:56.718264: W tensorflow/python/util/util.cc:329] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
"WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"If using Keras pass *_constraint arguments to layers.\n",
"WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"If using Keras pass *_constraint arguments to layers.\n",
"epoch 1:\n",
"val_loss=0.64\n",
"val_accuracy=0.61\n",
"\n",
"\n"
]
}
],
"source": [
"tfjob_client.get_logs(tfjob.metadata.name, namespace=namespace)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete the TFJob"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'kind': 'Status',\n",
" 'apiVersion': 'v1',\n",
" 'metadata': {},\n",
" 'status': 'Success',\n",
" 'details': {'name': 'text-classification',\n",
" 'group': 'kubeflow.org',\n",
" 'kind': 'tfjobs',\n",
" 'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'}}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfjob_client.delete(tfjob.metadata.name, namespace=namespace)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}