pipelines/samples/contrib/aws-samples/robomaker_simulation/sagemaker_robomaker_rl_job.py

#!/usr/bin/env python3

# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md

import kfp
import os
from kfp import components
from kfp import dsl
import random
import string
from kfp.aws import use_aws_secret

cur_file_dir = os.path.dirname(__file__)
components_dir = os.path.join(cur_file_dir, "../../../../components/aws/sagemaker/")

robomaker_create_sim_app_op = components.load_component_from_file(
    components_dir + "/create_simulation_app/component.yaml"
)

robomaker_sim_job_op = components.load_component_from_file(
    components_dir + "/simulation_job/component.yaml"
)

robomaker_delete_sim_app_op = components.load_component_from_file(
    components_dir + "/delete_simulation_app/component.yaml"
)

sagemaker_rlestimator_op = components.load_component_from_file(
    components_dir + "/rlestimator/component.yaml"
)

metric_definitions = [
    {"Name": "reward-training", "Regex": "^Training>.*Total reward=(.*?),"},
    {"Name": "ppo-surrogate-loss", "Regex": "^Policy training>.*Surrogate loss=(.*?),"},
    {"Name": "ppo-entropy", "Regex": "^Policy training>.*Entropy=(.*?),"},
    {"Name": "reward-testing", "Regex": "^Testing>.*Total reward=(.*?),"},
]

# Simulation Application Inputs
region = "us-east-1"
simulation_software_name = "Gazebo"
simulation_software_version = "7"
robot_software_name = "ROS"
robot_software_version = "Kinetic"
rendering_engine_name = "OGRE"
rendering_engine_version = "1.x"
simulation_app_name = "robomaker-pipeline-objecttracker-sim-app" + "".join(
    random.choice(string.ascii_lowercase) for i in range(10)
)
sources_bucket = "your_sagemaker_bucket_name"
sources_key = "object-tracker/simulation_ws.tar.gz"
sources_architecture = "X86_64"
sources = [
    {
        "s3Bucket": sources_bucket,
        "s3Key": sources_key,
        "architecture": sources_architecture,
    }
]

# RLEstimator Inputs
entry_point = "training_worker.py"
rl_sources_key = "rl-object-tracker-sagemaker-201123-042019/source/sourcedir.tar.gz"
source_dir = "s3://{}/{}".format(sources_bucket, rl_sources_key)
rl_output_path = "s3://{}/".format(sources_bucket)
train_instance_type = "ml.c5.2xlarge"
train_instance_count = 1
toolkit = "coach"
toolkit_version = "0.11"
framework = "tensorflow"
job_name = "rl-kf-pipeline-objecttracker" + "".join(
    random.choice(string.ascii_lowercase) for i in range(10)
)
max_run = 300
s3_prefix = "rl-object-tracker-sagemaker-201123-042019"
hyperparameters = {
    "s3_bucket": sources_bucket,
    "s3_prefix": s3_prefix,
    "aws_region": "us-east-1",
    "RLCOACH_PRESET": "object_tracker",
}
role = "your_sagemaker_role_name"
security_groups = ["sg-0490601e83f220e82"]
subnets = [
    "subnet-0efc73526db16a4a4",
    "subnet-0b8af626f39e7d462",
]

# Simulation Job Inputs
output_bucket = "kf-pipelines-robomaker-output"
output_key = "test-output-key"


@dsl.pipeline(
    name="SageMaker & RoboMaker pipeline",
    description="SageMaker & RoboMaker Reinforcement Learning job where the jobs work together to train an RL model",
)
def sagemaker_robomaker_rl_job(
    region=region,
    role=role,
    name=simulation_app_name,
    sources=sources,
    simulation_software_name=simulation_software_name,
    simulation_software_version=simulation_software_version,
    robot_software_name=robot_software_name,
    robot_software_version=robot_software_version,
    rendering_engine_name=rendering_engine_name,
    rendering_engine_version=rendering_engine_version,
    output_bucket=output_bucket,
    robomaker_output_path=output_key,
    vpc_security_group_ids=security_groups,
    vpc_subnets=subnets,
    entry_point=entry_point,
    source_dir=source_dir,
    toolkit=toolkit,
    toolkit_version=toolkit_version,
    framework=framework,
    assume_role=role,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    output_path=rl_output_path,
    job_name=job_name,
    metric_definitions=metric_definitions,
    max_run=max_run,
    hyperparameters=hyperparameters,
    sources_bucket=sources_bucket,
    s3_prefix=s3_prefix,
):
    robomaker_create_sim_app = robomaker_create_sim_app_op(
        region=region,
        app_name=name,
        sources=sources,
        simulation_software_name=simulation_software_name,
        simulation_software_version=simulation_software_version,
        robot_software_name=robot_software_name,
        robot_software_version=robot_software_version,
        rendering_engine_name=rendering_engine_name,
        rendering_engine_version=rendering_engine_version,
    )
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    rlestimator_training_toolkit_coach = sagemaker_rlestimator_op(
        region=region,
        entry_point=entry_point,
        source_dir=source_dir,
        toolkit=toolkit,
        toolkit_version=toolkit_version,
        framework=framework,
        role=assume_role,
        instance_type=instance_type,
        instance_count=instance_count,
        model_artifact_path=output_path,
        job_name=job_name,
        max_run=max_run,
        hyperparameters=hyperparameters,
        metric_definitions=metric_definitions,
        vpc_subnets=vpc_subnets,
        vpc_security_group_ids=vpc_security_group_ids,
    )
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    robomaker_simulation_job = robomaker_sim_job_op(
        region=region,
        role=role,
        output_bucket=output_bucket,
        output_path=robomaker_output_path,
        max_run=3800,
        failure_behavior="Continue",
        sim_app_arn=robomaker_create_sim_app.outputs["arn"],
        sim_app_launch_config={
            "packageName": "object_tracker_simulation",
            "launchFile": "evaluation.launch",
            "environmentVariables": {
                "MODEL_S3_BUCKET": sources_bucket,
                "MODEL_S3_PREFIX": s3_prefix,
                "ROS_AWS_REGION": region,
                "NUMBER_OF_ROLLOUT_WORKERS": "1",
                "MARKOV_PRESET_FILE": "object_tracker.py",
            },
            "streamUI": True,
        },
        vpc_security_group_ids=vpc_security_group_ids,
        vpc_subnets=vpc_subnets,
        use_public_ip="True",
    )
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

    robomaker_delete_sim_app = robomaker_delete_sim_app_op(
        region=region, arn=robomaker_create_sim_app.outputs["arn"],
    ).after(robomaker_simulation_job, robomaker_create_sim_app)
    # .apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == "__main__":
    kfp.compiler.Compiler().compile(sagemaker_robomaker_rl_job, __file__ + ".zip")