pipelines/samples/contrib/aws-samples/robomaker_simulation/sagemaker_robomaker_rl_job.py

197 lines
6.7 KiB
Python

#!/usr/bin/env python3
# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
import kfp
import os
from kfp import components
from kfp import dsl
import random
import string
from kfp.aws import use_aws_secret
cur_file_dir = os.path.dirname(__file__)
components_dir = os.path.join(cur_file_dir, "../../../../components/aws/sagemaker/")
robomaker_create_sim_app_op = components.load_component_from_file(
components_dir + "/create_simulation_app/component.yaml"
)
robomaker_sim_job_op = components.load_component_from_file(
components_dir + "/simulation_job/component.yaml"
)
robomaker_delete_sim_app_op = components.load_component_from_file(
components_dir + "/delete_simulation_app/component.yaml"
)
sagemaker_rlestimator_op = components.load_component_from_file(
components_dir + "/rlestimator/component.yaml"
)
metric_definitions = [
{"Name": "reward-training", "Regex": "^Training>.*Total reward=(.*?),"},
{"Name": "ppo-surrogate-loss", "Regex": "^Policy training>.*Surrogate loss=(.*?),"},
{"Name": "ppo-entropy", "Regex": "^Policy training>.*Entropy=(.*?),"},
{"Name": "reward-testing", "Regex": "^Testing>.*Total reward=(.*?),"},
]
# Simulation Application Inputs
region = "us-east-1"
simulation_software_name = "Gazebo"
simulation_software_version = "7"
robot_software_name = "ROS"
robot_software_version = "Kinetic"
rendering_engine_name = "OGRE"
rendering_engine_version = "1.x"
simulation_app_name = "robomaker-pipeline-objecttracker-sim-app" + "".join(
random.choice(string.ascii_lowercase) for i in range(10)
)
sources_bucket = "your_sagemaker_bucket_name"
sources_key = "object-tracker/simulation_ws.tar.gz"
sources_architecture = "X86_64"
sources = [
{
"s3Bucket": sources_bucket,
"s3Key": sources_key,
"architecture": sources_architecture,
}
]
# RLEstimator Inputs
entry_point = "training_worker.py"
rl_sources_key = "rl-object-tracker-sagemaker-201123-042019/source/sourcedir.tar.gz"
source_dir = "s3://{}/{}".format(sources_bucket, rl_sources_key)
rl_output_path = "s3://{}/".format(sources_bucket)
train_instance_type = "ml.c5.2xlarge"
train_instance_count = 1
toolkit = "coach"
toolkit_version = "0.11"
framework = "tensorflow"
job_name = "rl-kf-pipeline-objecttracker" + "".join(
random.choice(string.ascii_lowercase) for i in range(10)
)
max_run = 300
s3_prefix = "rl-object-tracker-sagemaker-201123-042019"
hyperparameters = {
"s3_bucket": sources_bucket,
"s3_prefix": s3_prefix,
"aws_region": "us-east-1",
"RLCOACH_PRESET": "object_tracker",
}
role = "your_sagemaker_role_name"
security_groups = ["sg-0490601e83f220e82"]
subnets = [
"subnet-0efc73526db16a4a4",
"subnet-0b8af626f39e7d462",
]
# Simulation Job Inputs
output_bucket = "kf-pipelines-robomaker-output"
output_key = "test-output-key"
@dsl.pipeline(
name="SageMaker & RoboMaker pipeline",
description="SageMaker & RoboMaker Reinforcement Learning job where the jobs work together to train an RL model",
)
def sagemaker_robomaker_rl_job(
region=region,
role=role,
name=simulation_app_name,
sources=sources,
simulation_software_name=simulation_software_name,
simulation_software_version=simulation_software_version,
robot_software_name=robot_software_name,
robot_software_version=robot_software_version,
rendering_engine_name=rendering_engine_name,
rendering_engine_version=rendering_engine_version,
output_bucket=output_bucket,
robomaker_output_path=output_key,
vpc_security_group_ids=security_groups,
vpc_subnets=subnets,
entry_point=entry_point,
source_dir=source_dir,
toolkit=toolkit,
toolkit_version=toolkit_version,
framework=framework,
assume_role=role,
instance_type=train_instance_type,
instance_count=train_instance_count,
output_path=rl_output_path,
job_name=job_name,
metric_definitions=metric_definitions,
max_run=max_run,
hyperparameters=hyperparameters,
sources_bucket=sources_bucket,
s3_prefix=s3_prefix,
):
robomaker_create_sim_app = robomaker_create_sim_app_op(
region=region,
app_name=name,
sources=sources,
simulation_software_name=simulation_software_name,
simulation_software_version=simulation_software_version,
robot_software_name=robot_software_name,
robot_software_version=robot_software_version,
rendering_engine_name=rendering_engine_name,
rendering_engine_version=rendering_engine_version,
)
# .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
rlestimator_training_toolkit_coach = sagemaker_rlestimator_op(
region=region,
entry_point=entry_point,
source_dir=source_dir,
toolkit=toolkit,
toolkit_version=toolkit_version,
framework=framework,
role=assume_role,
instance_type=instance_type,
instance_count=instance_count,
model_artifact_path=output_path,
job_name=job_name,
max_run=max_run,
hyperparameters=hyperparameters,
metric_definitions=metric_definitions,
vpc_subnets=vpc_subnets,
vpc_security_group_ids=vpc_security_group_ids,
)
# .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
robomaker_simulation_job = robomaker_sim_job_op(
region=region,
role=role,
output_bucket=output_bucket,
output_path=robomaker_output_path,
max_run=3800,
failure_behavior="Continue",
sim_app_arn=robomaker_create_sim_app.outputs["arn"],
sim_app_launch_config={
"packageName": "object_tracker_simulation",
"launchFile": "evaluation.launch",
"environmentVariables": {
"MODEL_S3_BUCKET": sources_bucket,
"MODEL_S3_PREFIX": s3_prefix,
"ROS_AWS_REGION": region,
"NUMBER_OF_ROLLOUT_WORKERS": "1",
"MARKOV_PRESET_FILE": "object_tracker.py",
},
"streamUI": True,
},
vpc_security_group_ids=vpc_security_group_ids,
vpc_subnets=vpc_subnets,
use_public_ip="True",
)
# .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
robomaker_delete_sim_app = robomaker_delete_sim_app_op(
region=region, arn=robomaker_create_sim_app.outputs["arn"],
).after(robomaker_simulation_job, robomaker_create_sim_app)
# .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
if __name__ == "__main__":
kfp.compiler.Compiler().compile(sagemaker_robomaker_rl_job, __file__ + ".zip")