fix(components/pytorch) Pytorch Lightning Arguments string pass through (#5870)
* Updating bert script to use input arguments as string
* Adding utility to parse input arguments
* Fixing tensorboard root and checkpoint dirs
* Adding string pass through in component.yaml
* Fixing pipeline.py file
* Fixing pipeline keys
* Fixing args in component.yaml
* Removing extra comma
* Removing unused code
* Updating cifar10 example
* Uncommenting confusion matrix
* Updating jupyter notebooks
* Fixing cifar10 train component.yaml
* Addressing review comments
* Fixing lint issues

Signed-off-by: Shrinath Suresh <shrinath@ideas2it.com>
This commit is contained in:
parent e958156274
commit 192b8e8756
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def parse_input_args(input_str: str):
    """Utility to parse input string arguments. Returns a dictionary."""
    output_dict = {}
    if not input_str:
        raise ValueError("Empty input string: {}".format(input_str))

    key_pairs: list = input_str.split(",")

    key_pairs = [x.strip() for x in key_pairs]

    if not key_pairs:
        raise ValueError("Incorrect format: {}".format(input_str))

    for each_key in key_pairs:
        try:
            key, value = each_key.split("=")
        except ValueError as value_error:
            raise ValueError("Expected input format "
                             "'key1=value1, key2=value2' "
                             "but received {}".format(input_str)) \
                from value_error
        if value.isdigit():
            value = int(value)
        output_dict[key] = value

    return output_dict
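The utility above turns the comma-separated "key=value" pass-through string into a dict, converting purely numeric values to int and leaving everything else (including the literal string "None") as str. A minimal usage sketch with an illustrative argument string; the import path is the module added above:

# Sketch only, not part of the commit.
from pytorch_kfp_components.components.utils.argument_parsing import parse_input_args

ptl_dict = parse_input_args(input_str="max_epochs=1, gpus=0, accelerator=None, profiler=pytorch")
print(ptl_dict)  # {'max_epochs': 1, 'gpus': 0, 'accelerator': 'None', 'profiler': 'pytorch'}

Note that "accelerator" comes back as the string "None"; the training scripts below normalize it to a real None before handing it to the PTL Trainer.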
@@ -230,7 +230,7 @@
"outputs": [],
"source": [
"@dsl.pipeline(name=\"Training pipeline\", description=\"Sample training job test\")\n",
"def pytorch_bert(\n",
"def pytorch_bert( # pylint: disable=too-many-arguments\n",
" minio_endpoint=MINIO_ENDPOINT,\n",
" log_bucket=LOG_BUCKET,\n",
" log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n",
@@ -241,74 +241,85 @@
" deploy=DEPLOY_NAME,\n",
" namespace=NAMESPACE,\n",
" confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n",
" num_samples=1000\n",
" num_samples=1000,\n",
" max_epochs=1\n",
"):\n",
"\n",
" \"\"\"This method defines the pipeline tasks and operations\"\"\"\n",
" prepare_tb_task = prepare_tensorboard_op(\n",
" log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n",
" image=tf_image,\n",
" pod_template_spec=json.dumps(\n",
" {\n",
" \"spec\": {\n",
" \"containers\": [\n",
" pod_template_spec=json.dumps({\n",
" \"spec\": {\n",
" \"containers\": [{\n",
" \"env\": [\n",
" {\n",
" \"env\": [\n",
" {\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\"name\": \"AWS_REGION\", \"value\": \"minio\"},\n",
" {\"name\": \"S3_ENDPOINT\", \"value\": f\"{minio_endpoint}\"},\n",
" {\"name\": \"S3_USE_HTTPS\", \"value\": \"0\"},\n",
" {\"name\": \"S3_VERIFY_SSL\", \"value\": \"0\"},\n",
" ]\n",
" }\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_REGION\",\n",
" \"value\": \"minio\"\n",
" },\n",
" {\n",
" \"name\": \"S3_ENDPOINT\",\n",
" \"value\": f\"{minio_endpoint}\",\n",
" },\n",
" {\n",
" \"name\": \"S3_USE_HTTPS\",\n",
" \"value\": \"0\"\n",
" },\n",
" {\n",
" \"name\": \"S3_VERIFY_SSL\",\n",
" \"value\": \"0\"\n",
" },\n",
" ]\n",
" }\n",
" }]\n",
" }\n",
" ),\n",
" }),\n",
" ).set_display_name(\"Visualization\")\n",
"\n",
" prep_task = prep_op().after(prepare_tb_task).set_display_name(\"Preprocess & Transform\")\n",
" prep_task = (\n",
" prep_op().after(prepare_tb_task\n",
" ).set_display_name(\"Preprocess & Transform\")\n",
" )\n",
" confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n",
" script_args = f\"model_name=bert.pth,\" \\\n",
" f\"num_samples={num_samples},\" \\\n",
" f\"confusion_matrix_url={confusion_matrix_url}\"\n",
" # For GPU, set gpus count and accelerator type\n",
" ptl_args = f\"max_epochs={max_epochs},profiler=pytorch,gpus=0,accelerator=None\"\n",
" train_task = (\n",
" train_op(\n",
" input_data=prep_task.outputs[\"output_data\"],\n",
" profiler=\"pytorch\",\n",
" confusion_matrix_url=f\"minio://{log_bucket}/{confusion_matrix_log_dir}\",\n",
" num_samples=num_samples,\n",
" # For GPU set gpu count and accelerator type\n",
" gpus=0,\n",
" accelerator='None'\n",
" )\n",
" .after(prep_task)\n",
" .set_display_name(\"Training\")\n",
" bert_script_args=script_args,\n",
" ptl_arguments=ptl_args\n",
" ).after(prep_task).set_display_name(\"Training\")\n",
" )\n",
" # For GPU uncomment below line and set GPU limit and node selector\n",
" # ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
" # ).set_gpu_limit(1).add_node_selector_constraint\n",
" # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
"\n",
" minio_tb_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=log_dir,\n",
" input_path=train_task.outputs[\"tensorboard_root\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Tensorboard Events Pusher\")\n",
" ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n",
" )\n",
" minio_mar_upload = (\n",
" minio_op(\n",
@@ -316,22 +327,19 @@
" folder_name=mar_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"bert_test.mar\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Mar Pusher\")\n",
" ).after(train_task).set_display_name(\"Mar Pusher\")\n",
" )\n",
" minio_config_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=config_prop_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"config.properties\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Config Pusher\")\n",
" ).after(train_task).set_display_name(\"Config Pusher\")\n",
" )\n",
"\n",
" model_uri = str(model_uri)\n",
" # pylint: disable=unused-variable\n",
" isvc_yaml = \"\"\"\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
@@ -346,9 +354,7 @@
" resources:\n",
" limits:\n",
" memory: 4Gi \n",
" \"\"\".format(\n",
" deploy, namespace, model_uri\n",
" )\n",
" \"\"\".format(deploy, namespace, model_uri)\n",
"\n",
" # For GPU inference use below yaml with gpu count and accelerator\n",
" gpu_count = \"1\"\n",
@@ -370,14 +376,11 @@
" nvidia.com/gpu: {}\n",
" nodeSelector:\n",
" cloud.google.com/gke-accelerator: {}\n",
"\"\"\".format(\n",
" deploy, namespace, model_uri, gpu_count, accelerator\n",
" )\n",
"\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n",
" # Update inferenceservice_yaml for GPU inference\n",
" deploy_task = (\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml)\n",
" .after(minio_mar_upload)\n",
" .set_display_name(\"Deployer\")\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n",
" ).after(minio_mar_upload).set_display_name(\"Deployer\")\n",
" )\n",
"\n",
" dsl.get_pipeline_conf().add_op_transformer(\n",
@@ -388,7 +391,7 @@
" \"accesskey\": \"MINIO_ACCESS_KEY\",\n",
" },\n",
" )\n",
" )\n"
" )"
]
},
{
@@ -782,7 +785,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.8.2"
}
},
"nbformat": 4,
@@ -233,8 +233,11 @@
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\")\n",
"def pytorch_cifar10(\n",
"\n",
"@dsl.pipeline(\n",
" name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\"\n",
")\n",
"def pytorch_cifar10( # pylint: disable=too-many-arguments\n",
" minio_endpoint=MINIO_ENDPOINT,\n",
" log_bucket=LOG_BUCKET,\n",
" log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n",
@@ -247,45 +250,54 @@
" model=MODEL_NAME,\n",
" namespace=NAMESPACE,\n",
" confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n",
" checkpoint_dir=f\"checkpoint_dir/cifar10\",\n",
" checkpoint_dir=\"checkpoint_dir/cifar10\",\n",
" input_req=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" ingress_gateway=INGRESS_GATEWAY\n",
" ingress_gateway=INGRESS_GATEWAY,\n",
"):\n",
" pod_template_spec = json.dumps(\n",
" {\n",
" \"spec\": {\n",
" \"containers\": [\n",
" \"\"\"This method defines the pipeline tasks and operations\"\"\"\n",
" pod_template_spec = json.dumps({\n",
" \"spec\": {\n",
" \"containers\": [{\n",
" \"env\": [\n",
" {\n",
" \"env\": [\n",
" {\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\"name\": \"AWS_REGION\", \"value\": \"minio\"},\n",
" {\"name\": \"S3_ENDPOINT\", \"value\": f\"{minio_endpoint}\"},\n",
" {\"name\": \"S3_USE_HTTPS\", \"value\": \"0\"},\n",
" {\"name\": \"S3_VERIFY_SSL\", \"value\": \"0\"},\n",
" ]\n",
" }\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_REGION\",\n",
" \"value\": \"minio\"\n",
" },\n",
" {\n",
" \"name\": \"S3_ENDPOINT\",\n",
" \"value\": f\"{minio_endpoint}\",\n",
" },\n",
" {\n",
" \"name\": \"S3_USE_HTTPS\",\n",
" \"value\": \"0\"\n",
" },\n",
" {\n",
" \"name\": \"S3_VERIFY_SSL\",\n",
" \"value\": \"0\"\n",
" },\n",
" ]\n",
" }\n",
" }]\n",
" }\n",
" )\n",
" })\n",
"\n",
" prepare_tb_task = prepare_tensorboard_op(\n",
" log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n",
@@ -293,42 +305,42 @@
" pod_template_spec=pod_template_spec,\n",
" ).set_display_name(\"Visualization\")\n",
"\n",
" prep_task = prep_op().after(prepare_tb_task).set_display_name(\"Preprocess & Transform\")\n",
" prep_task = (\n",
" prep_op().after(prepare_tb_task\n",
" ).set_display_name(\"Preprocess & Transform\")\n",
" )\n",
" confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n",
" script_args = f\"model_name=resnet.pth,\" \\\n",
" f\"confusion_matrix_url={confusion_matrix_url}\"\n",
" # For GPU, set number of gpus and accelerator type\n",
" ptl_args = f\"max_epochs=1, gpus=0, accelerator=None, profiler=pytorch\"\n",
" train_task = (\n",
" train_op(\n",
" input_data=prep_task.outputs[\"output_data\"],\n",
" profiler=\"pytorch\",\n",
" confusion_matrix_url=f\"minio://{log_bucket}/{confusion_matrix_log_dir}\",\n",
" # For GPU set gpu count and accelerator type\n",
" gpus=0,\n",
" accelerator='None'\n",
" )\n",
" .after(prep_task)\n",
" .set_display_name(\"Training\")\n",
" cifar_script_args=script_args,\n",
" ptl_arguments=ptl_args\n",
" ).after(prep_task).set_display_name(\"Training\")\n",
" )\n",
" # For GPU uncomment below line and set GPU limit and node selector\n",
" # ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
" # ).set_gpu_limit(1).add_node_selector_constraint\n",
" # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
"\n",
" minio_tb_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=log_dir,\n",
" input_path=train_task.outputs[\"tensorboard_root\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Tensorboard Events Pusher\")\n",
" ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n",
" )\n",
"\n",
" minio_checkpoint_dir_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=checkpoint_dir,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"checkpoint_dir Pusher\")\n",
" ).after(train_task).set_display_name(\"checkpoint_dir Pusher\")\n",
" )\n",
"\n",
" minio_mar_upload = (\n",
@@ -337,23 +349,20 @@
" folder_name=mar_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"cifar10_test.mar\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Mar Pusher\")\n",
" ).after(train_task).set_display_name(\"Mar Pusher\")\n",
" )\n",
" \n",
" minio_config_upload = (\n",
"\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=config_prop_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"config.properties\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Config Pusher\")\n",
" ).after(train_task).set_display_name(\"Config Pusher\")\n",
" )\n",
"\n",
" model_uri = str(model_uri)\n",
" # pylint: disable=unused-variable\n",
" isvc_yaml = \"\"\"\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
@@ -379,7 +388,7 @@
" # For GPU inference use below yaml with gpu count and accelerator\n",
" gpu_count = \"1\"\n",
" accelerator = \"nvidia-tesla-p4\"\n",
" isvc_gpu_yaml = \"\"\"\n",
" isvc_gpu_yaml = \"\"\"# pylint: disable=unused-variable\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
" metadata:\n",
@@ -396,40 +405,33 @@
" nvidia.com/gpu: {}\n",
" nodeSelector:\n",
" cloud.google.com/gke-accelerator: {}\n",
"\"\"\".format(\n",
" deploy, namespace, model_uri, gpu_count, accelerator\n",
" )\n",
"\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n",
" # Update inferenceservice_yaml for GPU inference\n",
" deploy_task = (\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml)\n",
" .after(minio_mar_upload)\n",
" .set_display_name(\"Deployer\")\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n",
" ).after(minio_mar_upload).set_display_name(\"Deployer\")\n",
" )\n",
" pred_task = (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"predict\",\n",
" )\n",
" .after(deploy_task)\n",
" .set_display_name(\"Prediction\")\n",
" ).after(deploy_task).set_display_name(\"Prediction\")\n",
" )\n",
" explain_task = (\n",
" (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"explain\",\n",
" )\n",
" .after(pred_task)\n",
" .set_display_name(\"Explanation\")\n",
" ).after(pred_task).set_display_name(\"Explanation\")\n",
" )\n",
" \n",
"\n",
" dsl.get_pipeline_conf().add_op_transformer(\n",
" use_k8s_secret(\n",
" secret_name=\"mlpipeline-minio-artifact\",\n",
@@ -1138,9 +1140,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
@@ -235,8 +235,11 @@
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\")\n",
"def pytorch_cifar10(\n",
"\n",
"@dsl.pipeline(\n",
" name=\"Training Cifar10 pipeline\", description=\"Cifar 10 dataset pipeline\"\n",
")\n",
"def pytorch_cifar10( # pylint: disable=too-many-arguments\n",
" minio_endpoint=MINIO_ENDPOINT,\n",
" log_bucket=LOG_BUCKET,\n",
" log_dir=f\"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}\",\n",
@@ -249,45 +252,54 @@
" model=MODEL_NAME,\n",
" namespace=NAMESPACE,\n",
" confusion_matrix_log_dir=f\"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/\",\n",
" checkpoint_dir=f\"checkpoint_dir/cifar10\",\n",
" checkpoint_dir=\"checkpoint_dir/cifar10\",\n",
" input_req=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" ingress_gateway=INGRESS_GATEWAY\n",
" ingress_gateway=INGRESS_GATEWAY,\n",
"):\n",
" pod_template_spec = json.dumps(\n",
" {\n",
" \"spec\": {\n",
" \"containers\": [\n",
" \"\"\"This method defines the pipeline tasks and operations\"\"\"\n",
" pod_template_spec = json.dumps({\n",
" \"spec\": {\n",
" \"containers\": [{\n",
" \"env\": [\n",
" {\n",
" \"env\": [\n",
" {\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\"name\": \"AWS_REGION\", \"value\": \"minio\"},\n",
" {\"name\": \"S3_ENDPOINT\", \"value\": f\"{minio_endpoint}\"},\n",
" {\"name\": \"S3_USE_HTTPS\", \"value\": \"0\"},\n",
" {\"name\": \"S3_VERIFY_SSL\", \"value\": \"0\"},\n",
" ]\n",
" }\n",
" \"name\": \"AWS_ACCESS_KEY_ID\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"accesskey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_SECRET_ACCESS_KEY\",\n",
" \"valueFrom\": {\n",
" \"secretKeyRef\": {\n",
" \"name\": \"mlpipeline-minio-artifact\",\n",
" \"key\": \"secretkey\",\n",
" }\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"AWS_REGION\",\n",
" \"value\": \"minio\"\n",
" },\n",
" {\n",
" \"name\": \"S3_ENDPOINT\",\n",
" \"value\": f\"{minio_endpoint}\",\n",
" },\n",
" {\n",
" \"name\": \"S3_USE_HTTPS\",\n",
" \"value\": \"0\"\n",
" },\n",
" {\n",
" \"name\": \"S3_VERIFY_SSL\",\n",
" \"value\": \"0\"\n",
" },\n",
" ]\n",
" }\n",
" }]\n",
" }\n",
" )\n",
" })\n",
"\n",
" prepare_tb_task = prepare_tensorboard_op(\n",
" log_dir_uri=f\"s3://{log_bucket}/{log_dir}\",\n",
@@ -295,42 +307,42 @@
" pod_template_spec=pod_template_spec,\n",
" ).set_display_name(\"Visualization\")\n",
"\n",
" prep_task = prep_op().after(prepare_tb_task).set_display_name(\"Preprocess & Transform\")\n",
" prep_task = (\n",
" prep_op().after(prepare_tb_task\n",
" ).set_display_name(\"Preprocess & Transform\")\n",
" )\n",
" confusion_matrix_url = f\"minio://{log_bucket}/{confusion_matrix_log_dir}\"\n",
" script_args = f\"model_name=resnet.pth,\" \\\n",
" f\"confusion_matrix_url={confusion_matrix_url}\"\n",
" # For GPU, set number of gpus and accelerator type\n",
" ptl_args = f\"max_epochs=1, gpus=0, accelerator=None, profiler=pytorch\"\n",
" train_task = (\n",
" train_op(\n",
" input_data=prep_task.outputs[\"output_data\"],\n",
" profiler=\"pytorch\",\n",
" confusion_matrix_url=f\"minio://{log_bucket}/{confusion_matrix_log_dir}\",\n",
" # For GPU set gpu count and accelerator type\n",
" gpus=0,\n",
" accelerator='None'\n",
" )\n",
" .after(prep_task)\n",
" .set_display_name(\"Training\")\n",
" cifar_script_args=script_args,\n",
" ptl_arguments=ptl_args\n",
" ).after(prep_task).set_display_name(\"Training\")\n",
" )\n",
" # For GPU uncomment below line and set GPU limit and node selector\n",
" # ).set_gpu_limit(1).add_node_selector_constraint('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
" # ).set_gpu_limit(1).add_node_selector_constraint\n",
" # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')\n",
"\n",
" minio_tb_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=log_dir,\n",
" input_path=train_task.outputs[\"tensorboard_root\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Tensorboard Events Pusher\")\n",
" ).after(train_task).set_display_name(\"Tensorboard Events Pusher\")\n",
" )\n",
"\n",
" minio_checkpoint_dir_upload = (\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=checkpoint_dir,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"checkpoint_dir Pusher\")\n",
" ).after(train_task).set_display_name(\"checkpoint_dir Pusher\")\n",
" )\n",
"\n",
" minio_mar_upload = (\n",
@@ -339,23 +351,20 @@
" folder_name=mar_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"cifar10_test.mar\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Mar Pusher\")\n",
" ).after(train_task).set_display_name(\"Mar Pusher\")\n",
" )\n",
" \n",
" minio_config_upload = (\n",
"\n",
" (\n",
" minio_op(\n",
" bucket_name=\"mlpipeline\",\n",
" folder_name=config_prop_path,\n",
" input_path=train_task.outputs[\"checkpoint_dir\"],\n",
" filename=\"config.properties\",\n",
" )\n",
" .after(train_task)\n",
" .set_display_name(\"Config Pusher\")\n",
" ).after(train_task).set_display_name(\"Config Pusher\")\n",
" )\n",
"\n",
" model_uri = str(model_uri)\n",
" # pylint: disable=unused-variable\n",
" isvc_yaml = \"\"\"\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
@@ -370,14 +379,12 @@
" resources:\n",
" limits:\n",
" memory: 4Gi\n",
" \"\"\".format(\n",
" deploy, namespace, model_uri\n",
" )\n",
" \n",
" \"\"\".format(deploy, namespace, model_uri)\n",
"\n",
" # For GPU inference use below yaml with gpu count and accelerator\n",
" gpu_count = \"1\"\n",
" accelerator = \"nvidia-tesla-p4\"\n",
" isvc_gpu_yaml = \"\"\"\n",
" isvc_gpu_yaml = \"\"\"# pylint: disable=unused-variable\n",
" apiVersion: \"serving.kubeflow.org/v1beta1\"\n",
" kind: \"InferenceService\"\n",
" metadata:\n",
@@ -394,40 +401,33 @@
" nvidia.com/gpu: {}\n",
" nodeSelector:\n",
" cloud.google.com/gke-accelerator: {}\n",
"\"\"\".format(\n",
" deploy, namespace, model_uri, gpu_count, accelerator\n",
" )\n",
"\"\"\".format(deploy, namespace, model_uri, gpu_count, accelerator)\n",
" # Update inferenceservice_yaml for GPU inference\n",
" deploy_task = (\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml)\n",
" .after(minio_mar_upload)\n",
" .set_display_name(\"Deployer\")\n",
" deploy_op(action=\"apply\", inferenceservice_yaml=isvc_yaml\n",
" ).after(minio_mar_upload).set_display_name(\"Deployer\")\n",
" )\n",
" pred_task = (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"predict\",\n",
" )\n",
" .after(deploy_task)\n",
" .set_display_name(\"Prediction\")\n",
" ).after(deploy_task).set_display_name(\"Prediction\")\n",
" )\n",
" explain_task = (\n",
" (\n",
" pred_op(\n",
" host_name=ISVC_NAME,\n",
" input_request=INPUT_REQUEST,\n",
" cookie=COOKIE,\n",
" url=INGRESS_GATEWAY,\n",
" model=MODEL_NAME,\n",
" host_name=isvc_name,\n",
" input_request=input_req,\n",
" cookie=cookie,\n",
" url=ingress_gateway,\n",
" model=model,\n",
" inference_type=\"explain\",\n",
" )\n",
" .after(pred_task)\n",
" .set_display_name(\"Explanation\")\n",
" ).after(pred_task).set_display_name(\"Explanation\")\n",
" )\n",
" \n",
"\n",
" dsl.get_pipeline_conf().add_op_transformer(\n",
" use_k8s_secret(\n",
" secret_name=\"mlpipeline-minio-artifact\",\n",
@@ -625,7 +625,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.2"
}
},
"nbformat": 4,
@@ -15,7 +15,6 @@
import os
from argparse import ArgumentParser
from pathlib import Path
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import (
    EarlyStopping,
@@ -25,89 +24,89 @@ from pytorch_lightning.callbacks import (
from pytorch_kfp_components.components.visualization.component import Visualization
from pytorch_kfp_components.components.trainer.component import Trainer
from pytorch_kfp_components.components.mar.component import MarGeneration
from pytorch_kfp_components.components.utils.argument_parsing import parse_input_args
# Argument parser for user defined paths
import pytorch_lightning
print("Using PyTorch Lightning: {}".format(pytorch_lightning.__version__))
parser = ArgumentParser()

parser.add_argument(
    "--tensorboard_root",
    type=str,
    default="output/tensorboard",
    help="Tensorboard Root path (default: output/tensorboard)",
)

parser.add_argument(
    "--checkpoint_dir",
    type=str,
    default="output/train/models",
    help="Path to save model checkpoints (default: output/train/models)",
)

parser.add_argument(
    "--dataset_path",
    type=str,
    default="output/processing",
    help="Cifar10 Dataset path (default: output/processing)",
)

parser.add_argument(
    "--model_name",
    type=str,
    default="bert.pth",
    help="Name of the model to be saved as (default: bert.pth)",
)

parser.add_argument(
    "--num_samples",
    type=int,
    default=1000,
    help="Number of samples to use for training",
    help="Path to input dataset",
)

parser.add_argument(
    "--mlpipeline_ui_metadata",
    type=str,
    default="mlpipeline-ui-metadata.json",
    help="Path to write mlpipeline-ui-metadata.json",
)

parser.add_argument(
    "--mlpipeline_metrics",
    type=str,
    default="mlpipeline-metrics",
    help="Path to write mlpipeline-metrics.json",
)

parser.add_argument(
    "--confusion_matrix_url",
    "--script_args",
    type=str,
    help="Minio url to generate confusion matrix",
    help="Arguments for bert agnews classification script",
)

parser = pl.Trainer.add_argparse_args(parent_parser=parser)
parser.add_argument(
    "--ptl_args",
    type=str,
    help="Arguments specific to PTL trainer",
)

parser.add_argument(
    "--checkpoint_dir",
    default="output/train/models",
    type=str,
    help="Path to save model checkpoints (default: output/train/models)",
)

parser.add_argument(
    "--tensorboard_root",
    default="output/tensorboard",
    type=str,
    help="Tensorboard root path (default: output/tensorboard)",
)
args = vars(parser.parse_args())
script_args = args["script_args"]
ptl_args = args["ptl_args"]

TENSORBOARD_ROOT = args["tensorboard_root"]
CHECKPOINT_DIR = args["checkpoint_dir"]
DATASET_PATH = args["dataset_path"]

script_dict: dict = parse_input_args(input_str=script_args)
script_dict["checkpoint_dir"] = CHECKPOINT_DIR

ptl_dict: dict = parse_input_args(input_str=ptl_args)

# Enabling Tensorboard Logger, ModelCheckpoint, Earlystopping

lr_logger = LearningRateMonitor()
tboard = TensorBoardLogger(args["tensorboard_root"])
tboard = TensorBoardLogger(TENSORBOARD_ROOT)
early_stopping = EarlyStopping(
    monitor="val_loss", mode="min", patience=5, verbose=True
)
checkpoint_callback = ModelCheckpoint(
    dirpath=args["checkpoint_dir"],
    filename="cifar10_{epoch:02d}",
    dirpath=CHECKPOINT_DIR,
    filename="bert_{epoch:02d}",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

if not args["max_epochs"]:
    args["max_epochs"] = 1

if args["accelerator"] and args["accelerator"] == "None":
    args["accelerator"] = None
if "accelerator" in ptl_dict and ptl_dict["accelerator"] == "None":
    ptl_dict["accelerator"] = None

# Setting the trainer specific arguments
trainer_args = {
@@ -116,24 +115,29 @@ trainer_args = {
    "callbacks": [lr_logger, early_stopping, checkpoint_callback],
}

if "profiler" in args and args["profiler"] != "":
    trainer_args["profiler"] = args["profiler"]
if not ptl_dict.get("max_epochs"):
    trainer_args["max_epochs"] = 1
else:
    trainer_args["max_epochs"] = ptl_dict["max_epochs"]

if "profiler" in ptl_dict and ptl_dict["profiler"] != "":
    trainer_args["profiler"] = ptl_dict["profiler"]

# Setting the datamodule specific arguments
data_module_args = {
    "train_glob": args["dataset_path"],
    "num_samples": args["num_samples"]
    "train_glob": DATASET_PATH,
    "num_samples": script_dict["num_samples"]
}

# Creating parent directories
Path(args["tensorboard_root"]).mkdir(parents=True, exist_ok=True)
Path(args["checkpoint_dir"]).mkdir(parents=True, exist_ok=True)
Path(TENSORBOARD_ROOT).mkdir(parents=True, exist_ok=True)
Path(CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)

# Initiating the training process
trainer = Trainer(
    module_file="bert_train.py",
    data_module_file="bert_datamodule.py",
    module_file_args=args,
    module_file_args=script_dict,
    data_module_args=data_module_args,
    trainer_args=trainer_args,
)
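For orientation, a minimal runnable sketch (not part of the commit) of how the two pass-through strings end up as Trainer arguments; the values are illustrative defaults taken from the pipelines above:

# Sketch only; parse_input_args is the utility added earlier in this commit.
from pytorch_kfp_components.components.utils.argument_parsing import parse_input_args

script_dict = parse_input_args(input_str="model_name=bert.pth,num_samples=1000")
ptl_dict = parse_input_args(input_str="max_epochs=1,profiler=pytorch,gpus=0,accelerator=None")

# The literal string "None" coming off the wire means "no accelerator".
if ptl_dict.get("accelerator") == "None":
    ptl_dict["accelerator"] = None

trainer_args = {"max_epochs": ptl_dict.get("max_epochs") or 1}
if ptl_dict.get("profiler"):
    trainer_args["profiler"] = ptl_dict["profiler"]

data_module_args = {"train_glob": "output/processing",
                    "num_samples": script_dict["num_samples"]}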
@@ -158,11 +162,11 @@ if trainer.ptl_trainer.global_rank == 0:
        "HANDLER":
            os.path.join(bert_dir, "bert_handler.py"),
        "SERIALIZED_FILE":
            os.path.join(args["checkpoint_dir"], args["model_name"]),
            os.path.join(CHECKPOINT_DIR, script_dict["model_name"]),
        "VERSION":
            "1",
        "EXPORT_PATH":
            args["checkpoint_dir"],
            CHECKPOINT_DIR,
        "CONFIG_PROPERTIES":
            os.path.join(bert_dir, "config.properties"),
        "EXTRA_FILES":
@@ -175,7 +179,7 @@ if trainer.ptl_trainer.global_rank == 0:
            os.path.join(bert_dir, "requirements.txt")
    }

    MarGeneration(mar_config=mar_config, mar_save_path=args["checkpoint_dir"])
    MarGeneration(mar_config=mar_config, mar_save_path=CHECKPOINT_DIR)

    classes = [
        "World",
@@ -196,7 +200,7 @@ if trainer.ptl_trainer.global_rank == 0:
        "actuals": model.target,
        "preds": model.preds,
        "classes": class_list,
        "url": args["confusion_matrix_url"],
        "url": script_dict["confusion_matrix_url"],
    }

    test_accuracy = round(float(model.test_acc.compute()), 2)
@@ -205,11 +209,11 @@ if trainer.ptl_trainer.global_rank == 0:

    visualization_arguments = {
        "input": {
            "tensorboard_root": args["tensorboard_root"],
            "checkpoint_dir": args["checkpoint_dir"],
            "dataset_path": args["dataset_path"],
            "model_name": args["model_name"],
            "confusion_matrix_url": args["confusion_matrix_url"],
            "tensorboard_root": TENSORBOARD_ROOT,
            "checkpoint_dir": CHECKPOINT_DIR,
            "dataset_path": DATASET_PATH,
            "model_name": script_dict["model_name"],
            "confusion_matrix_url": script_dict["confusion_matrix_url"],
        },
        "output": {
            "mlpipeline_ui_metadata": args["mlpipeline_ui_metadata"],
@@ -76,6 +76,7 @@ def pytorch_bert( # pylint: disable=too-many-arguments
    namespace=NAMESPACE,
    confusion_matrix_log_dir=f"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/",
    num_samples=1000,
    max_epochs=1
):
    """This method defines the pipeline tasks and operations"""
    prepare_tb_task = prepare_tensorboard_op(
@@ -129,16 +130,20 @@ def pytorch_bert( # pylint: disable=too-many-arguments
        prep_op().after(prepare_tb_task
                        ).set_display_name("Preprocess & Transform")
    )
    confusion_matrix_url = f"minio://{log_bucket}/{confusion_matrix_log_dir}"
    script_args = f"model_name=bert.pth," \
                  f"num_samples={num_samples}," \
                  f"confusion_matrix_url={confusion_matrix_url}"
    # For gpus, set number of gpus and accelerator type
    ptl_args = f"max_epochs={max_epochs}," \
               "profiler=pytorch," \
               "gpus=0," \
               "accelerator=None"
    train_task = (
        train_op(
            input_data=prep_task.outputs["output_data"],
            profiler="pytorch",
            confusion_matrix_url=f"minio://{log_bucket}/"
                                 f"{confusion_matrix_log_dir}",
            num_samples=num_samples,
            # For GPU set gpu count and accelerator type
            gpus=0,
            accelerator="None",
            bert_script_args=script_args,
            ptl_arguments=ptl_args
        ).after(prep_task).set_display_name("Training")
    )
    # For GPU uncomment below line and set GPU limit and node selector
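With the defaults above (num_samples=1000, max_epochs=1), the two pass-through strings evaluate, for an illustrative run id, to:

# Illustrative values; the run id placeholder is resolved by KFP at runtime.
script_args = ("model_name=bert.pth,num_samples=1000,"
               "confusion_matrix_url=minio://mlpipeline/confusion_matrix/<run-id>/")
ptl_args = "max_epochs=1,profiler=pytorch,gpus=0,accelerator=None"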
@@ -16,11 +16,8 @@ description: |
  Pytorch training
inputs:
- {name: input_data, description: 'Input dataset path'}
- {name: profiler, description: 'Pytorch profiler type'}
- {name: confusion_matrix_url, description: 'Minio url to upload confusion matrix'}
- {name: num_samples, default: 1000, description: 'Number of samples to train'}
- {name: gpus, type: Integer, default: 0, description: 'Number of gpus to use for training'}
- {name: accelerator, type: String, default: 'None', description: 'PTL accelerator type'}
- {name: bert_script_args, description: 'Arguments to the bert script'}
- {name: ptl_arguments, description: 'Arguments to pytorch lightning Trainer'}

outputs:
- {name: tensorboard_root, description: "Tensorboard output path"}
@@ -36,21 +33,15 @@ implementation:
    args:
      - --dataset_path
      - {inputPath: input_data}
      - --script_args
      - {inputValue: bert_script_args}
      - --ptl_args
      - {inputValue: ptl_arguments}
      - --tensorboard_root
      - {outputPath: tensorboard_root}
      - --checkpoint_dir
      - {outputPath: checkpoint_dir}
      - --profiler
      - {inputValue: profiler}
      - --mlpipeline_ui_metadata
      - {outputPath: MLPipeline UI Metadata}
      - --mlpipeline_metrics
      - {outputPath: MLPipeline Metrics}
      - --confusion_matrix_url
      - {inputValue: confusion_matrix_url}
      - --num_samples
      - {inputValue: num_samples}
      - --gpus
      - {inputValue: gpus}
      - --accelerator
      - {inputValue: accelerator}
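After KFP resolves the {inputValue: ...} and {outputPath: ...} placeholders above, the training container receives plain strings. Roughly the argv it sees, as a sketch (mount paths and values are illustrative, not from the commit):

# Illustrative argv after placeholder resolution; actual mount paths vary.
argv = [
    "--dataset_path", "/tmp/inputs/input_data/data",
    "--script_args", "model_name=bert.pth,num_samples=1000,"
                     "confusion_matrix_url=minio://mlpipeline/confusion_matrix/<run-id>/",
    "--ptl_args", "max_epochs=1,profiler=pytorch,gpus=0,accelerator=None",
    "--tensorboard_root", "/tmp/outputs/tensorboard_root/data",
    "--checkpoint_dir", "/tmp/outputs/checkpoint_dir/data",
    "--mlpipeline_ui_metadata", "/tmp/outputs/mlpipeline_ui_metadata/data",
    "--mlpipeline_metrics", "/tmp/outputs/mlpipeline_metrics/data",
]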
@@ -15,7 +15,6 @@
import os
from pathlib import Path
from argparse import ArgumentParser
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import (
    EarlyStopping,
@@ -27,6 +26,7 @@ from pytorch_kfp_components.components.visualization.component import (
)
from pytorch_kfp_components.components.trainer.component import Trainer
from pytorch_kfp_components.components.mar.component import MarGeneration
from pytorch_kfp_components.components.utils.argument_parsing import parse_input_args

# Argument parser for user defined paths
parser = ArgumentParser()
@@ -79,27 +79,40 @@ parser.add_argument(
    default=None,
    help="Minio url to generate confusion matrix",
)

parser.add_argument(
    "--pod_template_spec",
    "--script_args",
    type=str,
    default=None,
    help="Pod template spec",
    help="Arguments for cifar10 training script",
)

parser = pl.Trainer.add_argparse_args(parent_parser=parser)

parser.add_argument(
    "--ptl_args",
    type=str,
    help="Arguments specific to PTL trainer",
)
# parser = pl.Trainer.add_argparse_args(parent_parser=parser)
args = vars(parser.parse_args())
script_args = args["script_args"]
ptl_args = args["ptl_args"]

TENSORBOARD_ROOT = args["tensorboard_root"]
CHECKPOINT_DIR = args["checkpoint_dir"]
DATASET_PATH = args["dataset_path"]

script_dict: dict = parse_input_args(input_str=script_args)
script_dict["checkpoint_dir"] = CHECKPOINT_DIR

ptl_dict: dict = parse_input_args(input_str=ptl_args)

# Enabling Tensorboard Logger, ModelCheckpoint, Earlystopping

lr_logger = LearningRateMonitor()
tboard = TensorBoardLogger(args["tensorboard_root"])
tboard = TensorBoardLogger(TENSORBOARD_ROOT)
early_stopping = EarlyStopping(
    monitor="val_loss", mode="min", patience=5, verbose=True
)
checkpoint_callback = ModelCheckpoint(
    dirpath=args["checkpoint_dir"],
    dirpath=CHECKPOINT_DIR,
    filename="cifar10_{epoch:02d}",
    save_top_k=1,
    verbose=True,
@@ -107,11 +120,8 @@ checkpoint_callback = ModelCheckpoint(
    mode="min",
)

if not args["max_epochs"]:
    args["max_epochs"] = 1

if args["accelerator"] and args["accelerator"] == "None":
    args["accelerator"] = None
if "accelerator" in ptl_dict and ptl_dict["accelerator"] == "None":
    ptl_dict["accelerator"] = None

# Setting the trainer specific arguments
trainer_args = {
@@ -120,15 +130,20 @@ trainer_args = {
    "callbacks": [lr_logger, early_stopping, checkpoint_callback],
}

if "profiler" in args and args["profiler"] != "":
    trainer_args["profiler"] = args["profiler"]
if not ptl_dict.get("max_epochs"):
    trainer_args["max_epochs"] = 1
else:
    trainer_args["max_epochs"] = ptl_dict["max_epochs"]

if "profiler" in ptl_dict and ptl_dict["profiler"] != "":
    trainer_args["profiler"] = ptl_dict["profiler"]

# Setting the datamodule specific arguments
data_module_args = {"train_glob": args["dataset_path"]}
data_module_args = {"train_glob": DATASET_PATH}

# Creating parent directories
Path(args["tensorboard_root"]).mkdir(parents=True, exist_ok=True)
Path(args["checkpoint_dir"]).mkdir(parents=True, exist_ok=True)
Path(TENSORBOARD_ROOT).mkdir(parents=True, exist_ok=True)
Path(CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)

# Initiating the training process
trainer = Trainer(
@@ -154,11 +169,11 @@ if trainer.ptl_trainer.global_rank == 0:
        "HANDLER":
            os.path.join(cifar_dir, "cifar10_handler.py"),
        "SERIALIZED_FILE":
            os.path.join(args["checkpoint_dir"], args["model_name"]),
            os.path.join(CHECKPOINT_DIR, script_dict["model_name"]),
        "VERSION":
            "1",
        "EXPORT_PATH":
            args["checkpoint_dir"],
            CHECKPOINT_DIR,
        "CONFIG_PROPERTIES":
            os.path.join(cifar_dir, "config.properties"),
        "EXTRA_FILES":
@@ -170,7 +185,7 @@ if trainer.ptl_trainer.global_rank == 0:
            os.path.join(cifar_dir, "requirements.txt"),
    }

    MarGeneration(mar_config=mar_config, mar_save_path=args["checkpoint_dir"])
    MarGeneration(mar_config=mar_config, mar_save_path=CHECKPOINT_DIR)

    classes = [
        "airplane",
@@ -207,11 +222,11 @@ if trainer.ptl_trainer.global_rank == 0:

    visualization_arguments = {
        "input": {
            "tensorboard_root": args["tensorboard_root"],
            "checkpoint_dir": args["checkpoint_dir"],
            "dataset_path": args["dataset_path"],
            "model_name": args["model_name"],
            "confusion_matrix_url": args["confusion_matrix_url"],
            "tensorboard_root": TENSORBOARD_ROOT,
            "checkpoint_dir": CHECKPOINT_DIR,
            "dataset_path": DATASET_PATH,
            "model_name": script_dict["model_name"],
            "confusion_matrix_url": script_dict["confusion_matrix_url"],
        },
        "output": {
            "mlpipeline_ui_metadata": args["mlpipeline_ui_metadata"],
@@ -231,5 +246,5 @@ if trainer.ptl_trainer.global_rank == 0:
        markdown=markdown_dict,
    )

    checkpoint_dir_contents = os.listdir(args["checkpoint_dir"])
    checkpoint_dir_contents = os.listdir(CHECKPOINT_DIR)
    print(f"Checkpoint Directory Contents: {checkpoint_dir_contents}")
@@ -137,15 +137,16 @@ def pytorch_cifar10( # pylint: disable=too-many-arguments
        prep_op().after(prepare_tb_task
                        ).set_display_name("Preprocess & Transform")
    )
    confusion_matrix_url = f"minio://{log_bucket}/{confusion_matrix_log_dir}"
    script_args = f"model_name=resnet.pth," \
                  f"confusion_matrix_url={confusion_matrix_url}"
    # For gpus, set number of gpus and accelerator type
    ptl_args = "max_epochs=1, gpus=0, accelerator=None, profiler=pytorch"
    train_task = (
        train_op(
            input_data=prep_task.outputs["output_data"],
            profiler="pytorch",
            confusion_matrix_url=f"minio://{log_bucket}/"
                                 f"{confusion_matrix_log_dir}",
            # For GPU set gpu count and accelerator type
            gpus=0,
            accelerator="None",
            cifar_script_args=script_args,
            ptl_arguments=ptl_args
        ).after(prep_task).set_display_name("Training")
    )
    # For GPU uncomment below line and set GPU limit and node selector
|
|||
|
|
@ -17,10 +17,8 @@ description: |
|
|||
|
||||
inputs:
|
||||
- {name: input_data, description: 'Input dataset path'}
|
||||
- {name: profiler, description: 'Pytorch profiler type'}
|
||||
- {name: confusion_matrix_url, description: 'Minio url to upload confusion matrix'}
|
||||
- {name: gpus, type: Integer, default: 0, description: 'Number of gpus to use for training'}
|
||||
- {name: accelerator, type: String, default: 'None', description: 'PTL accelerator type'}
|
||||
- {name: cifar_script_args, description: 'Arguments to the cifar script'}
|
||||
- {name: ptl_arguments, description: 'Arguments to pytorch lightning Trainer'}
|
||||
|
||||
outputs:
|
||||
- {name: tensorboard_root, description: 'Tensorboard output path'}
|
||||
|
|
@@ -42,15 +40,11 @@ implementation:
      - {outputPath: tensorboard_root}
      - --checkpoint_dir
      - {outputPath: checkpoint_dir}
      - --profiler
      - {inputValue: profiler}
      - --mlpipeline_ui_metadata
      - {outputPath: MLPipeline UI Metadata}
      - --mlpipeline_metrics
      - {outputPath: MLPipeline Metrics}
      - --confusion_matrix_url
      - {inputValue: confusion_matrix_url}
      - --gpus
      - {inputValue: gpus}
      - --accelerator
      - {inputValue: accelerator}
      - --script_args
      - {inputValue: cifar_script_args}
      - --ptl_args
      - {inputValue: ptl_arguments}