mirror of https://github.com/kubeflow/examples.git
425 lines
14 KiB
Plaintext
425 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Import required libraries"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from kubernetes.client import V1PodTemplateSpec\n",
|
|
"from kubernetes.client import V1ObjectMeta\n",
|
|
"from kubernetes.client import V1PodSpec\n",
|
|
"from kubernetes.client import V1Container\n",
|
|
"\n",
|
|
"from kubeflow.tfjob import constants\n",
|
|
"from kubeflow.tfjob import utils\n",
|
|
"from kubeflow.tfjob import V1ReplicaSpec\n",
|
|
"from kubeflow.tfjob import V1TFJob\n",
|
|
"from kubeflow.tfjob import V1TFJobSpec\n",
|
|
"from kubeflow.tfjob import TFJobClient"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"namespace = utils.get_default_target_namespace()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Define TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"container = V1Container(\n",
|
|
" name=\"tensorflow\",\n",
|
|
" image=\"gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1\",\n",
|
|
" args=[\n",
|
|
" \"--learning_rate=0.01\",\n",
|
|
" \"--epochs=1\",\n",
|
|
" \"--batch_size=64\"\n",
|
|
" ]\n",
|
|
")\n",
|
|
"\n",
|
|
"worker = V1ReplicaSpec(\n",
|
|
" replicas=1,\n",
|
|
" restart_policy=\"OnFailure\",\n",
|
|
" template=V1PodTemplateSpec(\n",
|
|
" metadata=V1ObjectMeta(\n",
|
|
" annotations={'sidecar.istio.io/inject':'false'}\n",
|
|
" ),\n",
|
|
" spec=V1PodSpec(\n",
|
|
" containers=[container]\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"chief = V1ReplicaSpec(\n",
|
|
" replicas=1,\n",
|
|
" restart_policy=\"OnFailure\",\n",
|
|
" template=V1PodTemplateSpec(\n",
|
|
" metadata=V1ObjectMeta(\n",
|
|
" annotations={'sidecar.istio.io/inject':'false'}\n",
|
|
" ),\n",
|
|
" spec=V1PodSpec(\n",
|
|
" containers=[container]\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"ps = V1ReplicaSpec(\n",
|
|
" replicas=1,\n",
|
|
" restart_policy=\"OnFailure\",\n",
|
|
" template=V1PodTemplateSpec(\n",
|
|
" metadata=V1ObjectMeta(\n",
|
|
" annotations={'sidecar.istio.io/inject':'false'}\n",
|
|
" ),\n",
|
|
" spec=V1PodSpec(\n",
|
|
" containers=[container]\n",
|
|
" )\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"tfjob = V1TFJob(\n",
|
|
" api_version=\"kubeflow.org/v1\",\n",
|
|
" kind=\"TFJob\",\n",
|
|
" metadata=V1ObjectMeta(name=\"text-classification\",namespace=namespace),\n",
|
|
" spec=V1TFJobSpec(\n",
|
|
" tf_replica_specs={\"Worker\": worker,\n",
|
|
" \"Chief\": chief,\n",
|
|
" \"PS\": ps}\n",
|
|
" )\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Create TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'apiVersion': 'kubeflow.org/v1',\n",
|
|
" 'kind': 'TFJob',\n",
|
|
" 'metadata': {'creationTimestamp': '2020-07-25T12:03:06Z',\n",
|
|
" 'generation': 1,\n",
|
|
" 'name': 'text-classification',\n",
|
|
" 'namespace': 'kubeflow-mailsforyashj',\n",
|
|
" 'resourceVersion': '43826614',\n",
|
|
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/text-classification',\n",
|
|
" 'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'},\n",
|
|
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=1',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'PS': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=1',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'Worker': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=1',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
|
|
" 'name': 'tensorflow'}]}}}}}}"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client = TFJobClient()\n",
|
|
"tfjob_client.create(tfjob, namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Get the created TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'apiVersion': 'kubeflow.org/v1',\n",
|
|
" 'kind': 'TFJob',\n",
|
|
" 'metadata': {'creationTimestamp': '2020-07-25T12:03:06Z',\n",
|
|
" 'generation': 1,\n",
|
|
" 'name': 'text-classification',\n",
|
|
" 'namespace': 'kubeflow-mailsforyashj',\n",
|
|
" 'resourceVersion': '43826645',\n",
|
|
" 'selfLink': '/apis/kubeflow.org/v1/namespaces/kubeflow-mailsforyashj/tfjobs/text-classification',\n",
|
|
" 'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'},\n",
|
|
" 'spec': {'tfReplicaSpecs': {'Chief': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=1',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'PS': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=1',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
|
|
" 'name': 'tensorflow'}]}}},\n",
|
|
" 'Worker': {'replicas': 1,\n",
|
|
" 'restartPolicy': 'OnFailure',\n",
|
|
" 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
|
|
" 'spec': {'containers': [{'args': ['--learning_rate=0.01',\n",
|
|
" '--epochs=1',\n",
|
|
" '--batch_size=64'],\n",
|
|
" 'image': 'gcr.io/gsoc-kf-example/distributed_tf_2_text_classification:1.1',\n",
|
|
" 'name': 'tensorflow'}]}}}}},\n",
|
|
" 'status': {'conditions': [{'lastTransitionTime': '2020-07-25T12:03:06Z',\n",
|
|
" 'lastUpdateTime': '2020-07-25T12:03:06Z',\n",
|
|
" 'message': 'TFJob text-classification is created.',\n",
|
|
" 'reason': 'TFJobCreated',\n",
|
|
" 'status': 'True',\n",
|
|
" 'type': 'Created'}],\n",
|
|
" 'replicaStatuses': {'Chief': {}, 'PS': {}, 'Worker': {}},\n",
|
|
" 'startTime': '2020-07-25T12:03:06Z'}}"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.get('text-classification', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Get the TFJob status to check whether the TFJob has started"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'Created'"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.get_job_status('text-classification', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Wait for the TFJob to finish (up to a 2-hour timeout)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"NAME STATE TIME \n",
|
|
"text-classification Created 2020-07-25T12:03:06Z \n",
|
|
"text-classification Created 2020-07-25T12:03:06Z \n",
|
|
"text-classification Created 2020-07-25T12:03:06Z \n",
|
|
"text-classification Running 2020-07-25T12:03:55Z \n",
|
|
"text-classification Succeeded 2020-07-25T12:05:05Z \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.wait_for_job('text-classification', namespace=namespace, watch=True, timeout_seconds=7200)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Check if the TFJob succeeded"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.is_job_succeeded('text-classification', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Get the TFJob training logs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The logs of Pod text-classification-chief-0:\n",
|
|
" WARNING:absl:TFDS datasets with text encoding are deprecated and will be removed in a future version. Instead, you should use the plain text version and tokenize the text using `tensorflow_text` (See: https://www.tensorflow.org/tutorials/tensorflow_text/intro#tfdata_example)\n",
|
|
"2020-07-25 12:03:56.128380: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
|
|
"2020-07-25 12:03:56.128439: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)\n",
|
|
"2020-07-25 12:03:56.128472: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (text-classification-chief-0): /proc/driver/nvidia/version does not exist\n",
|
|
"2020-07-25 12:03:56.128780: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n",
|
|
"2020-07-25 12:03:56.137474: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz\n",
|
|
"2020-07-25 12:03:56.138397: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa5a0000b20 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
|
|
"2020-07-25 12:03:56.138442: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n",
|
|
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
|
|
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
|
|
"2020-07-25 12:04:56.718264: W tensorflow/python/util/util.cc:329] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n",
|
|
"WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
|
|
"Instructions for updating:\n",
|
|
"If using Keras pass *_constraint arguments to layers.\n",
|
|
"WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
|
|
"Instructions for updating:\n",
|
|
"If using Keras pass *_constraint arguments to layers.\n",
|
|
"epoch 1:\n",
|
|
"val_loss=0.64\n",
|
|
"val_accuracy=0.61\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.get_logs('text-classification', namespace=namespace)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Delete the TFJob"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'kind': 'Status',\n",
|
|
" 'apiVersion': 'v1',\n",
|
|
" 'metadata': {},\n",
|
|
" 'status': 'Success',\n",
|
|
" 'details': {'name': 'text-classification',\n",
|
|
" 'group': 'kubeflow.org',\n",
|
|
" 'kind': 'tfjobs',\n",
|
|
" 'uid': 'cca60e77-ce6e-11ea-89e4-42010a8c0018'}}"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfjob_client.delete('text-classification', namespace=namespace)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|