197 lines
6.7 KiB
Python
197 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
|
|
# Uncomment the `.apply(use_aws_secret(...))` lines below if you are not using OIDC.
# More info: https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
|
|
|
|
import kfp
|
|
import os
|
|
from kfp import components
|
|
from kfp import dsl
|
|
import random
|
|
import string
|
|
from kfp.aws import use_aws_secret
|
|
|
|
# Locate the shared AWS SageMaker/RoboMaker component definitions relative to
# the directory that contains this file.
cur_file_dir = os.path.dirname(__file__)
components_dir = os.path.join(cur_file_dir, "../../../../components/aws/sagemaker/")

# Each component path is assembled with os.path.join: components_dir already
# ends with a separator, so concatenating "/<name>/component.yaml" onto it
# would yield a doubled "//" in the resulting path.

# Creates a RoboMaker simulation application.
robomaker_create_sim_app_op = components.load_component_from_file(
    os.path.join(components_dir, "create_simulation_app", "component.yaml")
)

# Runs a RoboMaker simulation job.
robomaker_sim_job_op = components.load_component_from_file(
    os.path.join(components_dir, "simulation_job", "component.yaml")
)

# Deletes a RoboMaker simulation application.
robomaker_delete_sim_app_op = components.load_component_from_file(
    os.path.join(components_dir, "delete_simulation_app", "component.yaml")
)

# Runs a SageMaker RLEstimator training job.
sagemaker_rlestimator_op = components.load_component_from_file(
    os.path.join(components_dir, "rlestimator", "component.yaml")
)
|
|
|
|
# Metric definitions handed to the RLEstimator component. Each entry pairs a
# metric name with a regex whose single capture group extracts the value
# (presumably matched against the training job's log output -- confirm against
# the component documentation).
_METRIC_REGEXES = (
    ("reward-training", "^Training>.*Total reward=(.*?),"),
    ("ppo-surrogate-loss", "^Policy training>.*Surrogate loss=(.*?),"),
    ("ppo-entropy", "^Policy training>.*Entropy=(.*?),"),
    ("reward-testing", "^Testing>.*Total reward=(.*?),"),
)
metric_definitions = [
    {"Name": metric_name, "Regex": pattern}
    for metric_name, pattern in _METRIC_REGEXES
]
|
|
|
|
# Simulation Application Inputs
region = "us-east-1"

# Simulation, robot and rendering software suites for the simulation app.
simulation_software_name = "Gazebo"
simulation_software_version = "7"
robot_software_name = "ROS"
robot_software_version = "Kinetic"
rendering_engine_name = "OGRE"
rendering_engine_version = "1.x"

# The app name gets ten random lowercase letters appended, so every import of
# this module produces a different name.
_sim_app_suffix = "".join(random.choice(string.ascii_lowercase) for _ in range(10))
simulation_app_name = "robomaker-pipeline-objecttracker-sim-app" + _sim_app_suffix

# S3 location of the simulation workspace bundle. The bucket name is a
# placeholder that must be replaced before running the sample.
sources_bucket = "your_sagemaker_bucket_name"
sources_key = "object-tracker/simulation_ws.tar.gz"
sources_architecture = "X86_64"
sources = [
    dict(
        s3Bucket=sources_bucket,
        s3Key=sources_key,
        architecture=sources_architecture,
    )
]
|
|
|
|
# RLEstimator Inputs
entry_point = "training_worker.py"

# Single definition of the S3 prefix. It is passed to the training job through
# `hyperparameters`, is the default for the pipeline's `s3_prefix` parameter,
# and is also the folder that holds the training source bundle, so the source
# key is derived from it instead of repeating the literal.
s3_prefix = "rl-object-tracker-sagemaker-201123-042019"
rl_sources_key = "{}/source/sourcedir.tar.gz".format(s3_prefix)
source_dir = "s3://{}/{}".format(sources_bucket, rl_sources_key)
rl_output_path = "s3://{}/".format(sources_bucket)

train_instance_type = "ml.c5.2xlarge"
train_instance_count = 1
toolkit = "coach"
toolkit_version = "0.11"
framework = "tensorflow"

# The job name gets ten random lowercase letters appended, so every import of
# this module produces a different name.
job_name = "rl-kf-pipeline-objecttracker" + "".join(
    random.choice(string.ascii_lowercase) for _ in range(10)
)
max_run = 300

hyperparameters = {
    "s3_bucket": sources_bucket,
    "s3_prefix": s3_prefix,
    # Reuse the module-level region so the training job cannot drift from the
    # region the rest of the sample is configured for.
    "aws_region": region,
    "RLCOACH_PRESET": "object_tracker",
}

# Placeholder: replace with your own role before running the sample.
role = "your_sagemaker_role_name"

# NOTE(review): these look like account-specific VPC identifiers -- replace
# them with the security group and subnets of your own VPC.
security_groups = ["sg-0490601e83f220e82"]
subnets = [
    "subnet-0efc73526db16a4a4",
    "subnet-0b8af626f39e7d462",
]
|
|
|
|
# Simulation Job Inputs
# Defaults for the pipeline's `output_bucket` and `robomaker_output_path`
# parameters, which the simulation job step receives as its output location.
output_bucket = "kf-pipelines-robomaker-output"
output_key = "test-output-key"
|
|
|
|
|
|
@dsl.pipeline(
    name="SageMaker & RoboMaker pipeline",
    description="SageMaker & RoboMaker Reinforcement Learning job where the jobs work together to train an RL model",
)
def sagemaker_robomaker_rl_job(
    region=region,
    role=role,
    name=simulation_app_name,
    sources=sources,
    simulation_software_name=simulation_software_name,
    simulation_software_version=simulation_software_version,
    robot_software_name=robot_software_name,
    robot_software_version=robot_software_version,
    rendering_engine_name=rendering_engine_name,
    rendering_engine_version=rendering_engine_version,
    output_bucket=output_bucket,
    robomaker_output_path=output_key,
    vpc_security_group_ids=security_groups,
    vpc_subnets=subnets,
    entry_point=entry_point,
    source_dir=source_dir,
    toolkit=toolkit,
    toolkit_version=toolkit_version,
    framework=framework,
    assume_role=role,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    output_path=rl_output_path,
    job_name=job_name,
    metric_definitions=metric_definitions,
    max_run=max_run,
    hyperparameters=hyperparameters,
    sources_bucket=sources_bucket,
    s3_prefix=s3_prefix,
):
    """Define the RoboMaker simulation + SageMaker RL training pipeline.

    Steps, in the order they are declared:

    1. Create a RoboMaker simulation application from ``sources``.
    2. Start a SageMaker RLEstimator training job.
    3. Start a RoboMaker simulation job that uses the application created in
       step 1 (no explicit dependency on step 2 is declared).
    4. Delete the simulation application, ordered after steps 3 and 1.

    Every parameter defaults to a module-level constant defined above.
    ``role`` is forwarded to the simulation job and ``assume_role`` to the
    training job; ``max_run`` applies to the training job only, the simulation
    job uses a fixed value.
    """
    # Step 1: register the simulation application.
    create_app_task = robomaker_create_sim_app_op(
        region=region,
        app_name=name,
        sources=sources,
        simulation_software_name=simulation_software_name,
        simulation_software_version=simulation_software_version,
        robot_software_name=robot_software_name,
        robot_software_version=robot_software_version,
        rendering_engine_name=rendering_engine_name,
        rendering_engine_version=rendering_engine_version,
    )
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
    sim_app_arn = create_app_task.outputs["arn"]

    # Step 2: RL training job.
    training_task = sagemaker_rlestimator_op(
        region=region,
        entry_point=entry_point,
        source_dir=source_dir,
        toolkit=toolkit,
        toolkit_version=toolkit_version,
        framework=framework,
        role=assume_role,
        instance_type=instance_type,
        instance_count=instance_count,
        model_artifact_path=output_path,
        job_name=job_name,
        max_run=max_run,
        hyperparameters=hyperparameters,
        metric_definitions=metric_definitions,
        vpc_subnets=vpc_subnets,
        vpc_security_group_ids=vpc_security_group_ids,
    )
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    # Step 3: simulation job. Its environment points at the same bucket and
    # prefix that the training job receives through its hyperparameters.
    launch_environment = {
        "MODEL_S3_BUCKET": sources_bucket,
        "MODEL_S3_PREFIX": s3_prefix,
        "ROS_AWS_REGION": region,
        "NUMBER_OF_ROLLOUT_WORKERS": "1",
        "MARKOV_PRESET_FILE": "object_tracker.py",
    }
    launch_config = {
        "packageName": "object_tracker_simulation",
        "launchFile": "evaluation.launch",
        "environmentVariables": launch_environment,
        "streamUI": True,
    }
    simulation_task = robomaker_sim_job_op(
        region=region,
        role=role,
        output_bucket=output_bucket,
        output_path=robomaker_output_path,
        max_run=3800,
        failure_behavior="Continue",
        sim_app_arn=sim_app_arn,
        sim_app_launch_config=launch_config,
        vpc_security_group_ids=vpc_security_group_ids,
        vpc_subnets=vpc_subnets,
        use_public_ip="True",
    )
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    # Step 4: clean up the simulation application once the simulation job step
    # is done.
    cleanup_task = robomaker_delete_sim_app_op(region=region, arn=sim_app_arn)
    cleanup_task.after(simulation_task, create_app_task)
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
|
|
|
|
|
if __name__ == "__main__":
    # Compile the pipeline into a package whose path is this script's path
    # with ".zip" appended.
    package_path = __file__ + ".zip"
    kfp.compiler.Compiler().compile(sagemaker_robomaker_rl_job, package_path)
|