fix(components/pytorch): PyTorch Lightning argument string pass-through (#5870)

* Updating bert script to use input arguments as string

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Adding utility to parse input arguments

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Fixing tensorboard root and checkpoint dirs

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Adding string pass through in component.yaml

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Fixing pipeline.py file

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Fixing pipeline keys

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Fixing args in component.yaml

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Removing extra comma

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Removing unused code

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Updating cifar10 example

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Uncommenting confusion matrix

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Updating jupyter notebooks

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Fixing cifar10 train component.yaml

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Addressing review comments

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>

* Fixing lint issues

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>
shrinath-suresh 2021-06-24 11:40:19 +05:30 committed by GitHub
parent e958156274
commit 192b8e8756
12 changed files with 440 additions and 371 deletions

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def parse_input_args(input_str: str):
"""Utility to parse input string arguments. Returns a dictionary"""
output_dict = {}
if not input_str:
raise ValueError("Empty input string: {}".format(input_str))
key_pairs: list = input_str.split(",")
key_pairs = [x.strip() for x in key_pairs]
if not key_pairs:
raise ValueError("Incorrect format: {}".format(input_str))
for each_key in key_pairs:
try:
key, value = each_key.split("=")
except ValueError as value_error:
raise ValueError("Expected input format "
"'key1=value1, key2=value2' "
"but received {}".format(input_str)) \
from value_error
if value.isdigit():
value = int(value)
output_dict[key] = value
return output_dict
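
For reference, a minimal usage sketch of the parser (illustrative, not part of the diff): digit-only values are coerced to int, everything else stays a string.

# Hypothetical usage of parse_input_args
parsed = parse_input_args("max_epochs=1,profiler=pytorch,gpus=0,accelerator=None")
assert parsed == {"max_epochs": 1, "profiler": "pytorch", "gpus": 0, "accelerator": "None"}
parse_input_args("no-delimiter")  # raises ValueError with the expected-format message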

View File

@@ -230,7 +230,7 @@
"outputs": [],
"source": [
"@dsl.pipeline(name=\"Training pipeline\", description=\"Sample training job test\")\n",
"def pytorch_bert(\n",
"def pytorch_bert( # pylint: disable=too-many-arguments\n",
" minio_endpoint=MINIO_ENDPOINT,\n",
" log_bucket=LOG_BUCKET,\n",
" log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n",
@@ -241,74 +241,85 @@
" deploy=DEPLOY_NAME,\n",
" namespace=NAMESPACE,\n",
" confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n",
" num_samples=1000\n",
" num_samples=1000,\n",
" max_epochs=1\n",
"):\n",
"\n",
" \"\"\"Thid method defines the pipeline tasks and operations\"\"\"\n",
" prepare_tb_task = prepare_tensorboard_op(\n",
" log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n",
" image=tf_image,\n",
" pod_template_spec=json.dumps(\n",
" {\n",
" \"spec\": {\n",
" \"containers\": [\n",
" pod_template_spec=json.dumps({\n",
" \"spec\": {\n",
" \"containers\": [{\n",
" \"env\": [\n",
" {\n",
" \"env\": [\n",
" {\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\"name\": \"AWS_REGION\", \"value\": \"minio\"},\n",
" {\"name\": \"S3_ENDPOINT\", \"value\": f\"{minio_endpoint}\"},\n",
" {\"name\": \"S3_USE_HTTPS\", \"value\": \"0\"},\n",
" {\"name\": \"S3_VERIFY_SSL\", \"value\": \"0\"},\n",
" ]\n",
" }\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_REGION\",\n",
" \"value\": \"minio\"\n",
" },\n",
" {\n",
" \"name\": \"S3_ENDPOINT\",\n",
" \"value\": f\"{minio_endpoint}\",\n",
" },\n",
" {\n",
" \"name\": \"S3_USE_HTTPS\",\n",
" \"value\": \"0\"\n",
" },\n",
" {\n",
" \"name\": \"S3_VERIFY_SSL\",\n",
" \"value\": \"0\"\n",
" },\n",
" ]\n",
" }\n",
" }]\n",
" }\n",
" ),\n",
" }),\n",
" ).set_display_name(\"Visualization\")\n",
"\n",
" prep_task = prep_op().after(prepare_tb_task).set_display_name(\"Preprocess & Transform\")\n",
" prep_task = (\n",
" prep_op().after(prepare_tb_task\n",
" ).set_display_name(\"Preprocess & Transform\")\n",
" )\n",
" confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n",
" script_args = f\"model_name=bert.pth,\" \\\n",
" f\"num_samples={num_samples},\" \\\n",
" f\"confusion_matrix_url={confusion_matrix_url}\"\n",
" # For GPU , set gpus count and accelerator type\n",
" ptl_args = f\"max_epochs={max_epochs},profiler=pytorch,gpus=0,accelerator=None\"\n",
" train_task = (\n",
" train_op(\n",
" input_data=prep_task.outputs[\"output_data\"],\n",
" profiler=\"pytorch\",\n",
" confusion_matrix_url=f\"minio://{log_bucket}/{confusion_matrix_log_dir}\",\n",
" num_samples=num_samples,\n",
" # For GPU set gpu count and accelerator type\n",
" gpus=0,\n",
" accelerator='None'\n",
" )\n",
" .after(prep_task)\n",
" .set_display_name(\"Training\")\n",
" bert_script_args=script_args,\n",
" ptl_arguments=ptl_args\n",
" ).after(prep_task).set_display_name(\"Training\")\n",
" )\n",
" # For GPU uncomment below line and set GPU limit and node selector\n",
" # ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
" # ).set_gpu_limit(1).add_node_selector_constraint\n",
" # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
"\n",
" minio_tb_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=log_dir,\n",
" input_path=train_task.outputs[\"tensorboard_root\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Tensorboard Events Pusher\")\n",
" ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n",
" )\n",
" minio_mar_upload = (\n",
" minio_op(\n",
@@ -316,22 +327,19 @@
" folder_name=mar_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"bert_test.mar\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Mar Pusher\")\n",
" ).after(train_task).set_display_name(\"Mar Pusher\")\n",
" )\n",
" minio_config_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=config_prop_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"config.properties\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Conifg Pusher\")\n",
" ).after(train_task).set_display_name(\"Conifg Pusher\")\n",
" )\n",
"\n",
" model_uri = str(model_uri)\n",
" # pylint: disable=unused-variable\n",
" isvc_yaml = \"\"\"\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
@@ -346,9 +354,7 @@
" resources:\n",
" limits:\n",
" memory: 4Gi \n",
" \"\"\".format(\n",
" deploy, namespace, model_uri\n",
" )\n",
" \"\"\".format(deploy, namespace, model_uri)\n",
"\n",
" # For GPU inference use below yaml with gpu count and accelerator\n",
" gpu_count = \"1\"\n",
@@ -370,14 +376,11 @@
" nvidia.com/gpu: {}\n",
" nodeSelector:\n",
" cloud.google.com/gke-accelerator: {}\n",
"\"\"\".format(\n",
" deploy, namespace, model_uri, gpu_count, accelerator\n",
" )\n",
"\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n",
" # Update inferenceservice_yaml for GPU inference\n",
" deploy_task = (\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml)\n",
" .after(minio_mar_upload)\n",
" .set_display_name(\"Deployer\")\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n",
" ).after(minio_mar_upload).set_display_name(\"Deployer\")\n",
" )\n",
"\n",
" dsl.get_pipeline_conf().add_op_transformer(\n",
@@ -388,7 +391,7 @@
" \"accesskey\": \"MINIO_ACCESS_KEY\",\n",
" },\n",
" )\n",
" )\n"
" )"
]
},
{
@@ -782,7 +785,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.8.2"
}
},
"nbformat": 4,

View File

@@ -233,8 +233,11 @@
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\")\n",
"def pytorch_cifar10(\n",
"\n",
"@dsl.pipeline(\n",
" name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\"\n",
")\n",
"def pytorch_cifar10( # pylint: disable=too-many-arguments\n",
" minio_endpoint=MINIO_ENDPOINT,\n",
" log_bucket=LOG_BUCKET,\n",
" log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n",
@@ -247,45 +250,54 @@
" model=MODEL_NAME,\n",
" namespace=NAMESPACE,\n",
" confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n",
" checkpoint_dir=f\"checkpoint_dir/cifar10\",\n",
" checkpoint_dir=\"checkpoint_dir/cifar10\",\n",
" input_req=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" ingress_gateway=INGRESS_GATEWAY\n",
" ingress_gateway=INGRESS_GATEWAY,\n",
"):\n",
" pod_template_spec = json.dumps(\n",
" {\n",
" \"spec\": {\n",
" \"containers\": [\n",
" \"\"\"Thid method defines the pipeline tasks and operations\"\"\"\n",
" pod_template_spec = json.dumps({\n",
" \"spec\": {\n",
" \"containers\": [{\n",
" \"env\": [\n",
" {\n",
" \"env\": [\n",
" {\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\"name\": \"AWS_REGION\", \"value\": \"minio\"},\n",
" {\"name\": \"S3_ENDPOINT\", \"value\": f\"{minio_endpoint}\"},\n",
" {\"name\": \"S3_USE_HTTPS\", \"value\": \"0\"},\n",
" {\"name\": \"S3_VERIFY_SSL\", \"value\": \"0\"},\n",
" ]\n",
" }\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_REGION\",\n",
" \"value\": \"minio\"\n",
" },\n",
" {\n",
" \"name\": \"S3_ENDPOINT\",\n",
" \"value\": f\"{minio_endpoint}\",\n",
" },\n",
" {\n",
" \"name\": \"S3_USE_HTTPS\",\n",
" \"value\": \"0\"\n",
" },\n",
" {\n",
" \"name\": \"S3_VERIFY_SSL\",\n",
" \"value\": \"0\"\n",
" },\n",
" ]\n",
" }\n",
" }]\n",
" }\n",
" )\n",
" })\n",
"\n",
" prepare_tb_task = prepare_tensorboard_op(\n",
" log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n",
@@ -293,42 +305,42 @@
" pod_template_spec=pod_template_spec,\n",
" ).set_display_name(\"Visualization\")\n",
"\n",
" prep_task = prep_op().after(prepare_tb_task).set_display_name(\"Preprocess & Transform\")\n",
" prep_task = (\n",
" prep_op().after(prepare_tb_task\n",
" ).set_display_name(\"Preprocess & Transform\")\n",
" )\n",
" confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n",
" script_args = f\"model_name=resnet.pth,\" \\\n",
" f\"confusion_matrix_url={confusion_matrix_url}\"\n",
" # For GPU, set number of gpus and accelerator type\n",
" ptl_args = f\"max_epochs=1, gpus=0, accelerator=None, profiler=pytorch\"\n",
" train_task = (\n",
" train_op(\n",
" input_data=prep_task.outputs[\"output_data\"],\n",
" profiler=\"pytorch\",\n",
" confusion_matrix_url=f\"minio://{log_bucket}/{confusion_matrix_log_dir}\",\n",
" # For GPU set gpu count and accelerator type\n",
" gpus=0,\n",
" accelerator='None'\n",
" )\n",
" .after(prep_task)\n",
" .set_display_name(\"Training\")\n",
" cifar_script_args=script_args,\n",
" ptl_arguments=ptl_args\n",
" ).after(prep_task).set_display_name(\"Training\")\n",
" )\n",
" # For GPU uncomment below line and set GPU limit and node selector\n",
" # ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
" # ).set_gpu_limit(1).add_node_selector_constraint\n",
" # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
"\n",
" minio_tb_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=log_dir,\n",
" input_path=train_task.outputs[\"tensorboard_root\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Tensorboard Events Pusher\")\n",
" ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n",
" )\n",
"\n",
" minio_checkpoint_dir_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=checkpoint_dir,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"checkpoint_dir Pusher\")\n",
" ).after(train_task).set_display_name(\"checkpoint_dir Pusher\")\n",
" )\n",
"\n",
" minio_mar_upload = (\n",
@@ -337,23 +349,20 @@
" folder_name=mar_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"cifar10_test.mar\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Mar Pusher\")\n",
" ).after(train_task).set_display_name(\"Mar Pusher\")\n",
" )\n",
" \n",
" minio_config_upload = (\n",
"\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=config_prop_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"config.properties\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Conifg Pusher\")\n",
" ).after(train_task).set_display_name(\"Conifg Pusher\")\n",
" )\n",
"\n",
" model_uri = str(model_uri)\n",
" # pylint: disable=unused-variable\n",
" isvc_yaml = \"\"\"\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
@@ -379,7 +388,7 @@
" # For GPU inference use below yaml with gpu count and accelerator\n",
" gpu_count = \"1\"\n",
" accelerator = \"nvidia-tesla-p4\"\n",
" isvc_gpu_yaml = \"\"\"\n",
" isvc_gpu_yaml = \"\"\"# pylint: disable=unused-variable\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
" metadata:\n",
@@ -396,40 +405,33 @@
" nvidia.com/gpu: {}\n",
" nodeSelector:\n",
" cloud.google.com/gke-accelerator: {}\n",
"\"\"\".format(\n",
" deploy, namespace, model_uri, gpu_count, accelerator\n",
" )\n",
"\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n",
" # Update inferenceservice_yaml for GPU inference\n",
" deploy_task = (\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml)\n",
" .after(minio_mar_upload)\n",
" .set_display_name(\"Deployer\")\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n",
" ).after(minio_mar_upload).set_display_name(\"Deployer\")\n",
" )\n",
" pred_task = (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"predict\",\n",
" )\n",
" .after(deploy_task)\n",
" .set_display_name(\"Prediction\")\n",
" ).after(deploy_task).set_display_name(\"Prediction\")\n",
" )\n",
" explain_task = (\n",
" (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"explain\",\n",
" )\n",
" .after(pred_task)\n",
" .set_display_name(\"Explanation\")\n",
" ).after(pred_task).set_display_name(\"Explanation\")\n",
" )\n",
" \n",
"\n",
" dsl.get_pipeline_conf().add_op_transformer(\n",
" use_k8s_secret(\n",
" secret_name=\"mlpipeline-minio-artifact\",\n",
@@ -1138,9 +1140,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}

View File

@@ -235,8 +235,11 @@
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\")\n",
"def pytorch_cifar10(\n",
"\n",
"@dsl.pipeline(\n",
" name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\"\n",
")\n",
"def pytorch_cifar10( # pylint: disable=too-many-arguments\n",
" minio_endpoint=MINIO_ENDPOINT,\n",
" log_bucket=LOG_BUCKET,\n",
" log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n",
@@ -249,45 +252,54 @@
" model=MODEL_NAME,\n",
" namespace=NAMESPACE,\n",
" confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n",
" checkpoint_dir=f\"checkpoint_dir/cifar10\",\n",
" checkpoint_dir=\"checkpoint_dir/cifar10\",\n",
" input_req=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" ingress_gateway=INGRESS_GATEWAY\n",
" ingress_gateway=INGRESS_GATEWAY,\n",
"):\n",
" pod_template_spec = json.dumps(\n",
" {\n",
" \"spec\": {\n",
" \"containers\": [\n",
" \"\"\"Thid method defines the pipeline tasks and operations\"\"\"\n",
" pod_template_spec = json.dumps({\n",
" \"spec\": {\n",
" \"containers\": [{\n",
" \"env\": [\n",
" {\n",
" \"env\": [\n",
" {\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\"name\": \"AWS_REGION\", \"value\": \"minio\"},\n",
" {\"name\": \"S3_ENDPOINT\", \"value\": f\"{minio_endpoint}\"},\n",
" {\"name\": \"S3_USE_HTTPS\", \"value\": \"0\"},\n",
" {\"name\": \"S3_VERIFY_SSL\", \"value\": \"0\"},\n",
" ]\n",
" }\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_REGION\",\n",
" \"value\": \"minio\"\n",
" },\n",
" {\n",
" \"name\": \"S3_ENDPOINT\",\n",
" \"value\": f\"{minio_endpoint}\",\n",
" },\n",
" {\n",
" \"name\": \"S3_USE_HTTPS\",\n",
" \"value\": \"0\"\n",
" },\n",
" {\n",
" \"name\": \"S3_VERIFY_SSL\",\n",
" \"value\": \"0\"\n",
" },\n",
" ]\n",
" }\n",
" }]\n",
" }\n",
" )\n",
" })\n",
"\n",
" prepare_tb_task = prepare_tensorboard_op(\n",
" log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n",
@@ -295,42 +307,42 @@
" pod_template_spec=pod_template_spec,\n",
" ).set_display_name(\"Visualization\")\n",
"\n",
" prep_task = prep_op().after(prepare_tb_task).set_display_name(\"Preprocess & Transform\")\n",
" prep_task = (\n",
" prep_op().after(prepare_tb_task\n",
" ).set_display_name(\"Preprocess & Transform\")\n",
" )\n",
" confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n",
" script_args = f\"model_name=resnet.pth,\" \\\n",
" f\"confusion_matrix_url={confusion_matrix_url}\"\n",
" # For GPU, set number of gpus and accelerator type\n",
" ptl_args = f\"max_epochs=1, gpus=0, accelerator=None, profiler=pytorch\"\n",
" train_task = (\n",
" train_op(\n",
" input_data=prep_task.outputs[\"output_data\"],\n",
" profiler=\"pytorch\",\n",
" confusion_matrix_url=f\"minio://{log_bucket}/{confusion_matrix_log_dir}\",\n",
" # For GPU set gpu count and accelerator type\n",
" gpus=0,\n",
" accelerator='None'\n",
" )\n",
" .after(prep_task)\n",
" .set_display_name(\"Training\")\n",
" cifar_script_args=script_args,\n",
" ptl_arguments=ptl_args\n",
" ).after(prep_task).set_display_name(\"Training\")\n",
" )\n",
" # For GPU uncomment below line and set GPU limit and node selector\n",
" # ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
" # ).set_gpu_limit(1).add_node_selector_constraint\n",
" # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
"\n",
" minio_tb_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=log_dir,\n",
" input_path=train_task.outputs[\"tensorboard_root\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Tensorboard Events Pusher\")\n",
" ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n",
" )\n",
"\n",
" minio_checkpoint_dir_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=checkpoint_dir,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"checkpoint_dir Pusher\")\n",
" ).after(train_task).set_display_name(\"checkpoint_dir Pusher\")\n",
" )\n",
"\n",
" minio_mar_upload = (\n",
@@ -339,23 +351,20 @@
" folder_name=mar_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"cifar10_test.mar\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Mar Pusher\")\n",
" ).after(train_task).set_display_name(\"Mar Pusher\")\n",
" )\n",
" \n",
" minio_config_upload = (\n",
"\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=config_prop_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"config.properties\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Conifg Pusher\")\n",
" ).after(train_task).set_display_name(\"Conifg Pusher\")\n",
" )\n",
"\n",
" model_uri = str(model_uri)\n",
" # pylint: disable=unused-variable\n",
" isvc_yaml = \"\"\"\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
@@ -370,14 +379,12 @@
" resources:\n",
" limits:\n",
" memory: 4Gi\n",
" \"\"\".format(\n",
" deploy, namespace, model_uri\n",
" )\n",
" \n",
" \"\"\".format(deploy, namespace, model_uri)\n",
"\n",
" # For GPU inference use below yaml with gpu count and accelerator\n",
" gpu_count = \"1\"\n",
" accelerator = \"nvidia-tesla-p4\"\n",
" isvc_gpu_yaml = \"\"\"\n",
" isvc_gpu_yaml = \"\"\"# pylint: disable=unused-variable\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
" metadata:\n",
@@ -394,40 +401,33 @@
" nvidia.com/gpu: {}\n",
" nodeSelector:\n",
" cloud.google.com/gke-accelerator: {}\n",
"\"\"\".format(\n",
" deploy, namespace, model_uri, gpu_count, accelerator\n",
" )\n",
"\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n",
" # Update inferenceservice_yaml for GPU inference\n",
" deploy_task = (\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml)\n",
" .after(minio_mar_upload)\n",
" .set_display_name(\"Deployer\")\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n",
" ).after(minio_mar_upload).set_display_name(\"Deployer\")\n",
" )\n",
" pred_task = (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"predict\",\n",
" )\n",
" .after(deploy_task)\n",
" .set_display_name(\"Prediction\")\n",
" ).after(deploy_task).set_display_name(\"Prediction\")\n",
" )\n",
" explain_task = (\n",
" (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"explain\",\n",
" )\n",
" .after(pred_task)\n",
" .set_display_name(\"Explanation\")\n",
" ).after(pred_task).set_display_name(\"Explanation\")\n",
" )\n",
" \n",
"\n",
" dsl.get_pipeline_conf().add_op_transformer(\n",
" use_k8s_secret(\n",
" secret_name=\"mlpipeline-minio-artifact\",\n",
@@ -625,7 +625,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.2"
}
},
"nbformat": 4,

View File

@@ -15,7 +15,6 @@
import os
from argparse import ArgumentParser
from pathlib import Path
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import (
EarlyStopping,
@@ -25,89 +24,89 @@ from pytorch_lightning.callbacks import (
from pytorch_kfp_components.components.visualization.component import Visualization
from pytorch_kfp_components.components.trainer.component import Trainer
from pytorch_kfp_components.components.mar.component import MarGeneration
from pytorch_kfp_components.components.utils.argument_parsing import parse_input_args
# Argument parser for user defined paths
import pytorch_lightning
print("Using Pytorch Lighting: {}".format(pytorch_lightning.__version__))
parser = ArgumentParser()
parser.add_argument(
"--tensorboard_root",
type=str,
default="output/tensorboard",
help="Tensorboard Root path (default: output/tensorboard)",
)
parser.add_argument(
"--checkpoint_dir",
type=str,
default="output/train/models",
help="Path to save model checkpoints (default: output/train/models)",
)
parser.add_argument(
"--dataset_path",
type=str,
default="output/processing",
help="Cifar10 Dataset path (default: output/processing)",
)
parser.add_argument(
"--model_name",
type=str,
default="bert.pth",
help="Name of the model to be saved as (default: bert.pth)",
)
parser.add_argument(
"--num_samples",
type=int,
default=1000,
help="Number of samples to use for training",
help="Path to input dataset",
)
parser.add_argument(
"--mlpipeline_ui_metadata",
type=str,
default="mlpipeline-ui-metadata.json",
help="Path to write mlpipeline-ui-metadata.json",
)
parser.add_argument(
"--mlpipeline_metrics",
type=str,
default="mlpipeline-metrics",
help="Path to write mlpipeline-metrics.json",
)
parser.add_argument(
"--confusion_matrix_url",
"--script_args",
type=str,
help="Minio url to generate confusion matrix",
help="Arguments for bert agnews classification script",
)
parser = pl.Trainer.add_argparse_args(parent_parser=parser)
parser.add_argument(
"--ptl_args",
type=str,
help="Arguments specific to PTL trainer",
)
parser.add_argument(
"--checkpoint_dir",
default="output/train/models",
type=str,
help="Arguments specific to PTL trainer",
)
parser.add_argument(
"--tensorboard_root",
default="output/tensorboard",
type=str,
help="Arguments specific to PTL trainer",
)
args = vars(parser.parse_args())
script_args = args["script_args"]
ptl_args = args["ptl_args"]
TENSORBOARD_ROOT = args["tensorboard_root"]
CHECKPOINT_DIR = args["checkpoint_dir"]
DATASET_PATH = args["dataset_path"]
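# Rebuild dicts from the flat "key1=value1,key2=value2" strings passed through the component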
script_dict: dict = parse_input_args(input_str=script_args)
script_dict["checkpoint_dir"] = CHECKPOINT_DIR
ptl_dict: dict = parse_input_args(input_str=ptl_args)
# Enabling Tensorboard Logger, ModelCheckpoint, Earlystopping
lr_logger = LearningRateMonitor()
tboard = TensorBoardLogger(args["tensorboard_root"])
tboard = TensorBoardLogger(TENSORBOARD_ROOT)
early_stopping = EarlyStopping(
monitor="val_loss", mode="min", patience=5, verbose=True
)
checkpoint_callback = ModelCheckpoint(
dirpath=args["checkpoint_dir"],
filename="cifar10_{epoch:02d}",
dirpath=CHECKPOINT_DIR,
filename="bert_{epoch:02d}",
save_top_k=1,
verbose=True,
monitor="val_loss",
mode="min",
)
if not args["max_epochs"]:
args["max_epochs"] = 1
if args["accelerator"] and args["accelerator"] == "None":
args["accelerator"] = None
if "accelerator" in ptl_dict and ptl_dict["accelerator"] == "None":
ptl_dict["accelerator"] = None
# Setting the trainer specific arguments
trainer_args = {
@@ -116,24 +115,29 @@ trainer_args = {
"callbacks": [lr_logger, early_stopping, checkpoint_callback],
}
if "profiler" in args and args["profiler"] != "":
trainer_args["profiler"] = args["profiler"]
if not ptl_dict["max_epochs"]:
trainer_args["max_epochs"] = 1
else:
trainer_args["max_epochs"] = ptl_dict["max_epochs"]
if "profiler" in ptl_dict and ptl_dict["profiler"] != "":
trainer_args["profiler"] = ptl_dict["profiler"]
# Setting the datamodule specific arguments
data_module_args = {
"train_glob": args["dataset_path"],
"num_samples": args["num_samples"]
"train_glob": DATASET_PATH,
"num_samples": script_dict["num_samples"]
}
# Creating parent directories
Path(args["tensorboard_root"]).mkdir(parents=True, exist_ok=True)
Path(args["checkpoint_dir"]).mkdir(parents=True, exist_ok=True)
Path(TENSORBOARD_ROOT).mkdir(parents=True, exist_ok=True)
Path(CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)
# Initiating the training process
trainer = Trainer(
module_file="bert_train.py",
data_module_file="bert_datamodule.py",
module_file_args=args,
module_file_args=script_dict,
data_module_args=data_module_args,
trainer_args=trainer_args,
)
@@ -158,11 +162,11 @@ if trainer.ptl_trainer.global_rank == 0:
"HANDLER":
os.path.join(bert_dir, "bert_handler.py"),
"SERIALIZED_FILE":
os.path.join(args["checkpoint_dir"], args["model_name"]),
os.path.join(CHECKPOINT_DIR, script_dict["model_name"]),
"VERSION":
"1",
"EXPORT_PATH":
args["checkpoint_dir"],
CHECKPOINT_DIR,
"CONFIG_PROPERTIES":
os.path.join(bert_dir, "config.properties"),
"EXTRA_FILES":
@@ -175,7 +179,7 @@ if trainer.ptl_trainer.global_rank == 0:
os.path.join(bert_dir, "requirements.txt")
}
MarGeneration(mar_config=mar_config, mar_save_path=args["checkpoint_dir"])
MarGeneration(mar_config=mar_config, mar_save_path=CHECKPOINT_DIR)
classes = [
"World",
@@ -196,7 +200,7 @@ if trainer.ptl_trainer.global_rank == 0:
"actuals": model.target,
"preds": model.preds,
"classes": class_list,
"url": args["confusion_matrix_url"],
"url": script_dict["confusion_matrix_url"],
}
test_accuracy = round(float(model.test_acc.compute()), 2)
@@ -205,11 +209,11 @@ if trainer.ptl_trainer.global_rank == 0:
visualization_arguments = {
"input": {
"tensorboard_root": args["tensorboard_root"],
"checkpoint_dir": args["checkpoint_dir"],
"dataset_path": args["dataset_path"],
"model_name": args["model_name"],
"confusion_matrix_url": args["confusion_matrix_url"],
"tensorboard_root": TENSOBOARD_ROOT,
"checkpoint_dir": CHECKPOINT_DIR,
"dataset_path": DATASET_PATH,
"model_name": script_dict["model_name"],
"confusion_matrix_url": script_dict["confusion_matrix_url"],
},
"output": {
"mlpipeline_ui_metadata": args["mlpipeline_ui_metadata"],

View File

@@ -76,6 +76,7 @@ def pytorch_bert(  # pylint: disable=too-many-arguments
namespace=NAMESPACE,
confusion_matrix_log_dir=f"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/",
num_samples=1000,
max_epochs=1
):
"""Thid method defines the pipeline tasks and operations"""
prepare_tb_task = prepare_tensorboard_op(
@@ -129,16 +130,20 @@ def pytorch_bert(  # pylint: disable=too-many-arguments
prep_op().after(prepare_tb_task
).set_display_name("Preprocess & Transform")
)
confusion_matrix_url = f"minio://{log_bucket}/{confusion_matrix_log_dir}"
script_args = f"model_name=bert.pth," \
f"num_samples={num_samples}," \
f"confusion_matrix_url={confusion_matrix_url}"
# For gpus, set number of gpus and accelerator type
ptl_args = f"max_epochs={max_epochs}," \
"profiler=pytorch," \
"gpus=0," \
"accelerator=None"
train_task = (
train_op(
input_data=prep_task.outputs["output_data"],
profiler="pytorch",
confusion_matrix_url=f"minio://{log_bucket}/"
f"{confusion_matrix_log_dir}",
num_samples=num_samples,
# For GPU set gpu count and accelerator type
gpus=0,
accelerator="None",
bert_script_args=script_args,
ptl_arguments=ptl_args
).after(prep_task).set_display_name("Training")
)
# For GPU uncomment below line and set GPU limit and node selector

View File

@@ -16,11 +16,8 @@ description: |
Pytorch training
inputs:
- {name: input_data, description: 'Input dataset path'}
- {name: profiler, description: 'Pytorch profiler type'}
- {name: confusion_matrix_url, description: 'Minio url to upload confusion matrix'}
- {name: num_samples, default: 1000, description: 'Number of samples to train'}
- {name: gpus, type: Integer, default: 0, description: 'Number of gpus to use for training'}
- {name: accelerator, type: String, default: 'None', description: 'PTL accelerator type'}
- {name: bert_script_args, description: 'Arguments to the bert script'}
- {name: ptl_arguments, description: 'Arguments to pytorch lightning Trainer'}
outputs:
- {name: tensorboard_root, description: "Tensorboard output path"}
@@ -36,21 +33,15 @@
args:
- --dataset_path
- {inputPath: input_data}
- --script_args
- { inputValue: bert_script_args }
- --ptl_args
- { inputValue: ptl_arguments }
- --tensorboard_root
- {outputPath: tensorboard_root}
- --checkpoint_dir
- {outputPath: checkpoint_dir}
- --profiler
- {inputValue: profiler}
- --mlpipeline_ui_metadata
- {outputPath: MLPipeline UI Metadata}
- --mlpipeline_metrics
- {outputPath: MLPipeline Metrics}
- --confusion_matrix_url
- { inputValue: confusion_matrix_url}
- --num_samples
- { inputValue: num_samples}
- --gpus
- { inputValue: gpus}
- --accelerator
- { inputValue: accelerator}
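
To make the pass-through concrete, here is a hedged sketch of the round trip (values illustrative, not from the diff): the pipeline composes flat strings, component.yaml forwards them verbatim via inputValue, and the train script rebuilds dicts with parse_input_args. Note the format cannot carry values that themselves contain ',' or '='.

# Illustrative only: what the bert train script receives and reconstructs
script_dict = parse_input_args(
    "model_name=bert.pth,num_samples=1000,"
    "confusion_matrix_url=minio://mlpipeline/confusion_matrix/run-1/"
)
ptl_dict = parse_input_args("max_epochs=1,profiler=pytorch,gpus=0,accelerator=None")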

samples/contrib/pytorch-samples/build.sh Normal file → Executable file
View File

View File

@@ -15,7 +15,6 @@
import os
from pathlib import Path
from argparse import ArgumentParser
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import (
EarlyStopping,
@@ -27,6 +26,7 @@ from pytorch_kfp_components.components.visualization.component import (
)
from pytorch_kfp_components.components.trainer.component import Trainer
from pytorch_kfp_components.components.mar.component import MarGeneration
from pytorch_kfp_components.components.utils.argument_parsing import parse_input_args
# Argument parser for user defined paths
parser = ArgumentParser()
@@ -79,27 +79,40 @@ parser.add_argument(
default=None,
help="Minio url to generate confusion matrix",
)
parser.add_argument(
"--pod_template_spec",
"--script_args",
type=str,
default=None,
help="Pod template spec",
help="Arguments for bert agnews classification script",
)
parser = pl.Trainer.add_argparse_args(parent_parser=parser)
parser.add_argument(
"--ptl_args",
type=str,
help="Arguments specific to PTL trainer",
)
# parser = pl.Trainer.add_argparse_args(parent_parser=parser)
args = vars(parser.parse_args())
script_args = args["script_args"]
ptl_args = args["ptl_args"]
TENSORBOARD_ROOT = args["tensorboard_root"]
CHECKPOINT_DIR = args["checkpoint_dir"]
DATASET_PATH = args["dataset_path"]
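# Parse the pass-through strings back into per-script and PTL Trainer dicts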
script_dict: dict = parse_input_args(input_str=script_args)
script_dict["checkpoint_dir"] = CHECKPOINT_DIR
ptl_dict: dict = parse_input_args(input_str=ptl_args)
# Enabling Tensorboard Logger, ModelCheckpoint, Earlystopping
lr_logger = LearningRateMonitor()
tboard = TensorBoardLogger(args["tensorboard_root"])
tboard = TensorBoardLogger(TENSORBOARD_ROOT)
early_stopping = EarlyStopping(
monitor="val_loss", mode="min", patience=5, verbose=True
)
checkpoint_callback = ModelCheckpoint(
dirpath=args["checkpoint_dir"],
dirpath=CHECKPOINT_DIR,
filename="cifar10_{epoch:02d}",
save_top_k=1,
verbose=True,
@@ -107,11 +120,8 @@ checkpoint_callback = ModelCheckpoint(
mode="min",
)
if not args["max_epochs"]:
args["max_epochs"] = 1
if args["accelerator"] and args["accelerator"] == "None":
args["accelerator"] = None
if "accelerator" in ptl_dict and ptl_dict["accelerator"] == "None":
ptl_dict["accelerator"] = None
# Setting the trainer specific arguments
trainer_args = {
@@ -120,15 +130,20 @@ trainer_args = {
"callbacks": [lr_logger, early_stopping, checkpoint_callback],
}
if "profiler" in args and args["profiler"] != "":
trainer_args["profiler"] = args["profiler"]
if not ptl_dict["max_epochs"]:
trainer_args["max_epochs"] = 1
else:
trainer_args["max_epochs"] = ptl_dict["max_epochs"]
if "profiler" in ptl_dict and ptl_dict["profiler"] != "":
trainer_args["profiler"] = ptl_dict["profiler"]
# Setting the datamodule specific arguments
data_module_args = {"train_glob": args["dataset_path"]}
data_module_args = {"train_glob": DATASET_PATH}
# Creating parent directories
Path(args["tensorboard_root"]).mkdir(parents=True, exist_ok=True)
Path(args["checkpoint_dir"]).mkdir(parents=True, exist_ok=True)
Path(TENSORBOARD_ROOT).mkdir(parents=True, exist_ok=True)
Path(CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)
# Initiating the training process
trainer = Trainer(
@@ -154,11 +169,11 @@ if trainer.ptl_trainer.global_rank == 0:
"HANDLER":
os.path.join(cifar_dir, "cifar10_handler.py"),
"SERIALIZED_FILE":
os.path.join(args["checkpoint_dir"], args["model_name"]),
os.path.join(CHECKPOINT_DIR, script_dict["model_name"]),
"VERSION":
"1",
"EXPORT_PATH":
args["checkpoint_dir"],
CHECKPOINT_DIR,
"CONFIG_PROPERTIES":
os.path.join(cifar_dir, "config.properties"),
"EXTRA_FILES":
@@ -170,7 +185,7 @@ if trainer.ptl_trainer.global_rank == 0:
os.path.join(cifar_dir, "requirements.txt"),
}
MarGeneration(mar_config=mar_config, mar_save_path=args["checkpoint_dir"])
MarGeneration(mar_config=mar_config, mar_save_path=CHECKPOINT_DIR)
classes = [
"airplane",
@@ -207,11 +222,11 @@ if trainer.ptl_trainer.global_rank == 0:
visualization_arguments = {
"input": {
"tensorboard_root": args["tensorboard_root"],
"checkpoint_dir": args["checkpoint_dir"],
"dataset_path": args["dataset_path"],
"model_name": args["model_name"],
"confusion_matrix_url": args["confusion_matrix_url"],
"tensorboard_root": TENSORBOARD_ROOT,
"checkpoint_dir": CHECKPOINT_DIR,
"dataset_path": DATASET_PATH,
"model_name": script_dict["model_name"],
"confusion_matrix_url": script_dict["confusion_matrix_url"],
},
"output": {
"mlpipeline_ui_metadata": args["mlpipeline_ui_metadata"],
@@ -231,5 +246,5 @@ if trainer.ptl_trainer.global_rank == 0:
markdown=markdown_dict,
)
checpoint_dir_contents = os.listdir(args["checkpoint_dir"])
checkpoint_dir_contents = os.listdir(CHECKPOINT_DIR)
print(f"Checkpoint Directory Contents: {checkpoint_dir_contents}")

View File

@@ -137,15 +137,16 @@ def pytorch_cifar10(  # pylint: disable=too-many-arguments
prep_op().after(prepare_tb_task
).set_display_name("Preprocess & Transform")
)
confusion_matrix_url = f"minio://{log_bucket}/{confusion_matrix_log_dir}"
script_args = f"model_name=resnet.pth," \
f"confusion_matrix_url={confusion_matrix_url}"
# For gpus, set number of gpus and accelerator type
ptl_args = "max_epochs=1, gpus=0, accelerator=None, profiler=pytorch"
train_task = (
train_op(
input_data=prep_task.outputs["output_data"],
profiler="pytorch",
confusion_matrix_url=f"minio://{log_bucket}/"
f"{confusion_matrix_log_dir}",
# For GPU set gpu count and accelerator type
gpus=0,
accelerator="None",
cifar_script_args=script_args,
ptl_arguments=ptl_args
).after(prep_task).set_display_name("Training")
)
# For GPU uncomment below line and set GPU limit and node selector

View File

@@ -17,10 +17,8 @@ description: |
inputs:
- {name: input_data, description: 'Input dataset path'}
- {name: profiler, description: 'Pytorch profiler type'}
- {name: confusion_matrix_url, description: 'Minio url to upload confusion matrix'}
- {name: gpus, type: Integer, default: 0, description: 'Number of gpus to use for training'}
- {name: accelerator, type: String, default: 'None', description: 'PTL accelerator type'}
- {name: cifar_script_args, description: 'Arguments to the cifar script'}
- {name: ptl_arguments, description: 'Arguments to pytorch lightning Trainer'}
outputs:
- {name: tensorboard_root, description: 'Tensorboard output path'}
@@ -42,15 +40,11 @@
- {outputPath: tensorboard_root}
- --checkpoint_dir
- {outputPath: checkpoint_dir}
- --profiler
- {inputValue: profiler}
- --mlpipeline_ui_metadata
- {outputPath: MLPipeline UI Metadata}
- --mlpipeline_metrics
- { outputPath: MLPipeline Metrics}
- --confusion_matrix_url
- { inputValue: confusion_matrix_url}
- --gpus
- { inputValue: gpus}
- --accelerator
- { inputValue: accelerator}
- --script_args
- { inputValue: cifar_script_args }
- --ptl_args
- { inputValue: ptl_arguments }
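
As with the bert component, inputValue inlines the raw strings into the container command, while inputPath/outputPath are materialized as file paths by the KFP launcher. A hedged sketch of the resulting argv (entrypoint name and paths are illustrative, not from the diff):

# Hypothetical rendered command for the cifar10 train component
argv = [
    "cifar10_pytorch.py",
    "--dataset_path", "/tmp/inputs/input_data/data",
    "--script_args", "model_name=resnet.pth,confusion_matrix_url=minio://mlpipeline/confusion_matrix/run-1/",
    "--ptl_args", "max_epochs=1, gpus=0, accelerator=None, profiler=pytorch",
    "--tensorboard_root", "/tmp/outputs/tensorboard_root/data",
    "--checkpoint_dir", "/tmp/outputs/checkpoint_dir/data",
    "--mlpipeline_ui_metadata", "/tmp/outputs/mlpipeline_ui_metadata/data",
    "--mlpipeline_metrics", "/tmp/outputs/mlpipeline_metrics/data",
]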