AWS SageMaker : Use IAM Roles for Service Account (#3719)
* don't use aws-secret and update readme for sample pipelines
* Addressed comments on PR and few more readme changes
* small changes to readme
* nit change
* Address comments
parent 291f5b3d7a
commit d18ad7a563
@ -44,9 +44,7 @@ def create_parser():
|
|||
parser.add_argument('--input_filter', type=str, required=False, help='A JSONPath expression used to select a portion of the input data to pass to the algorithm.', default='')
|
||||
parser.add_argument('--output_filter', type=str, required=False, help='A JSONPath expression used to select a portion of the joined dataset to save in the output file for a batch transform job.', default='')
|
||||
parser.add_argument('--join_source', choices=['None', 'Input', ''], type=str, required=False, help='Specifies the source of the data to join with the transformed data.', default='None')
|
||||
parser.add_argument('--instance_type', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
|
||||
'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
|
||||
'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, required=True, help='The ML compute instance type for the transform job.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_type', type=str, required=False, help='The ML compute instance type for the transform job.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_count', type=int, required=False, help='The number of ML compute instances to use in the transform job.')
|
||||
parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')
|
||||
parser.add_argument('--tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={})
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ Argument | Description | Optional (in pipeline definition
|
|||
:--- | :---------- | :---------- | :---------- | :----------| :---------- | :----------|
|
||||
model_name_[1, 3] | The name of the model that you want to host. This is the name that you specified when creating the model | No | No | String | | |
|
||||
variant_name_[1, 3] | The name of the production variant | Yes | Yes | String | | variant_name_[1, 3] |
|
||||
instance_type_[1, 3] | The ML compute instance type | Yes | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge |
|
||||
instance_type_[1, 3] | The ML compute instance type | Yes | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/)| ml.m4.xlarge |
|
||||
initial_instance_count_[1, 3] | Number of instances to launch initially | Yes | Yes | Integer | ≥ 1 | 1 |
|
||||
initial_variant_weight_[1, 3] | Determines initial traffic distribution among all of the models that you specify in the endpoint configuration. The traffic to a production variant is determined by the ratio of the VariantWeight to the sum of all VariantWeight values across all ProductionVariants. | Yes | Yes | Float | Minimum value of 0 | |
|
||||
accelerator_type_[1, 3] | The size of the Elastic Inference (EI) instance to use for the production variant | Yes | Yes | String| ml.eia1.medium, ml.eia1.large, ml.eia1.xlarge | |
|
||||
|
|
|
|||
|
|
@ -23,30 +23,23 @@ def create_parser():
|
|||
parser.add_argument('--variant_name_1', type=str, required=False, help='The name of the production variant.', default='variant-name-1')
|
||||
parser.add_argument('--model_name_1', type=str, required=True, help='The model name used for endpoint deployment.')
|
||||
parser.add_argument('--initial_instance_count_1', type=int, required=False, help='Number of instances to launch initially.', default=1)
|
||||
parser.add_argument('--instance_type_1', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
|
||||
'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
|
||||
'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_type_1', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--initial_variant_weight_1', type=float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0)
|
||||
parser.add_argument('--accelerator_type_1', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='')
|
||||
parser.add_argument('--variant_name_2', type=str, required=False, help='The name of the production variant.', default='variant-name-2')
|
||||
parser.add_argument('--model_name_2', type=str, required=False, help='The model name used for endpoint deployment.', default='')
|
||||
parser.add_argument('--initial_instance_count_2', type=int, required=False, help='Number of instances to launch initially.', default=1)
|
||||
parser.add_argument('--instance_type_2', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
|
||||
'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
|
||||
'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_type_2', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--initial_variant_weight_2', type=float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0)
|
||||
parser.add_argument('--accelerator_type_2', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='')
|
||||
parser.add_argument('--variant_name_3', type=str, required=False, help='The name of the production variant.', default='variant-name-3')
|
||||
parser.add_argument('--model_name_3', type=str, required=False, help='The model name used for endpoint deployment.', default='')
|
||||
parser.add_argument('--initial_instance_count_3', type=int, required=False, help='Number of instances to launch initially.', default=1)
|
||||
parser.add_argument('--instance_type_3', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
|
||||
'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
|
||||
'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_type_3', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--initial_variant_weight_3', type=float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0)
|
||||
parser.add_argument('--accelerator_type_3', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='')
|
||||
parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')
|
||||
parser.add_argument('--endpoint_config_tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={})
|
||||
|
||||
parser.add_argument('--endpoint_name', type=str, required=False, help='The name of the endpoint.', default='')
|
||||
parser.add_argument('--endpoint_tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={})
|
||||
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ categorical_parameters | The array of CategoricalParameterRange objects that spe
|
|||
channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | |
|
||||
output_location | The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job | No | No | String | | |
|
||||
output_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts | Yes | Yes | String | | |
|
||||
instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge |
|
||||
instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/)| ml.m4.xlarge |
|
||||
instance_count | The number of ML compute instances to use in each training job | Yes | Yes | Int | ≥ 1 | 1 |
|
||||
volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Yes | Int | ≥ 1 | 30 |
|
||||
max_num_jobs | The maximum number of training jobs that a hyperparameter tuning job can launch | No | No | Int | [1, 500] | |
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ from common import _utils
|
|||
def create_parser():
|
||||
parser = argparse.ArgumentParser(description='SageMaker Hyperparameter Tuning Job')
|
||||
_utils.add_default_client_arguments(parser)
|
||||
|
||||
|
||||
parser.add_argument('--job_name', type=str, required=False, help='The name of the tuning job. Must be unique within the same AWS account and AWS region.')
|
||||
parser.add_argument('--role', type=str, required=True, help='The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.')
|
||||
parser.add_argument('--image', type=str, required=True, help='The registry path of the Docker image that contains the training algorithm.', default='')
|
||||
|
|
@ -37,9 +37,7 @@ def create_parser():
|
|||
parser.add_argument('--channels', type=_utils.yaml_or_json_str, required=True, help='A list of dicts specifying the input channels. Must have at least one.')
|
||||
parser.add_argument('--output_location', type=str, required=True, help='The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job.')
|
||||
parser.add_argument('--output_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.', default='')
|
||||
parser.add_argument('--instance_type', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
|
||||
'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
|
||||
'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_type', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_count', type=int, required=False, help='The number of ML compute instances to use in each training job.', default=1)
|
||||
parser.add_argument('--volume_size', type=int, required=False, help='The size of the ML storage volume that you want to provision.', default=30)
|
||||
parser.add_argument('--max_num_jobs', type=int, required=True, help='The maximum number of training jobs that a hyperparameter tuning job can launch.')
|
||||
|
|
|
|||
|
|
@ -20,8 +20,8 @@ algorithm_name | The name of the algorithm resource to use for the hyperparamete
|
|||
metric_definitions | The dictionary of name-regex pairs specify the metrics that the algorithm emits | Yes | Dict | | {} |
|
||||
training_input_mode | The input mode that the algorithm supports | No | String | File, Pipe | File |
|
||||
hyperparameters | Hyperparameters for the selected algorithm | No | Dict | [Depends on Algo](https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html)| |
|
||||
channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | |
|
||||
instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge |
|
||||
channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | List of Dicts | | |
|
||||
instance_type | The ML compute instance type | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/) | ml.m4.xlarge |
|
||||
instance_count | The number of ML compute instances to use in each training job | Yes | Int | ≥ 1 | 1 |
|
||||
volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Int | ≥ 1 | 30 |
|
||||
resource_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s) | Yes | String | | |
|
||||
|
|
@ -42,7 +42,7 @@ tags | Key-value pairs to categorize AWS resources | Yes | Dict | | {} |
|
|||
Stores the model in the S3 bucket you specified
|
||||
|
||||
# Example code
|
||||
Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/documents/samples/contrib/aws-samples/simple_train_pipeline)
|
||||
Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline)
|
||||
|
||||
# Resources
|
||||
* [Using Amazon built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)
|
||||
|
|
|
|||
|
|
@ -28,9 +28,7 @@ def create_parser():
|
|||
parser.add_argument('--training_input_mode', choices=['File', 'Pipe'], type=str, help='The input mode that the algorithm supports. File or Pipe.', default='File')
|
||||
parser.add_argument('--hyperparameters', type=_utils.yaml_or_json_str, help='Dictionary of hyperparameters for the the algorithm.', default={})
|
||||
parser.add_argument('--channels', type=_utils.yaml_or_json_str, required=True, help='A list of dicts specifying the input channels. Must have at least one.')
|
||||
parser.add_argument('--instance_type', required=True, choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
|
||||
'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
|
||||
'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_type', required=False, type=str, help='The ML compute instance type.', default='ml.m4.xlarge')
|
||||
parser.add_argument('--instance_count', required=True, type=int, help='The number of ML compute instances to use in the training job.', default=1)
|
||||
parser.add_argument('--volume_size', type=int, required=True, help='The size of the ML storage volume that you want to provision.', default=30)
|
||||
parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')
|
||||
|
|
|
|||
|
|
@ -0,0 +1,197 @@
# Sample AWS SageMaker Kubeflow Pipelines

This folder contains example pipelines that use the [AWS SageMaker Components for KFP](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker). The following sections explain the setup needed to run these pipelines. Once you are done with the setup, [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) is a good place to start if you have never used these components before.


## Prerequisites

1. You need a cluster with Kubeflow installed on it. [Install Kubeflow on AWS cluster](https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/)
2. Install the following on your local machine or EC2 instance (these are recommended tools; not all of them are strictly required):
    1. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html). If you are using an IAM user, configure your [Access Key ID, Secret Access Key](https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-secret-access-keys) and preferred AWS Region by running: `aws configure`
    2. [aws-iam-authenticator](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) version 0.1.31 and above
    3. [eksctl](https://github.com/weaveworks/eksctl) version above 0.15
    4. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) version within one minor version of your cluster's Kubernetes version
    5. [KFP SDK](https://www.kubeflow.org/docs/pipelines/sdk/install-sdk/#install-the-kubeflow-pipelines-sdk) (installs the `dsl-compile` and `kfp` CLIs)


## IAM Permissions

To use the AWS KFP components, the KFP component pods need access to AWS SageMaker.
There are two ways to give them this access.
(Option 1 requires an EKS cluster.)

**Option 1** (Recommended) [IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html).
1. Enable OIDC support on the EKS cluster
```bash
eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
    --region <cluster_region> --approve
```
2. Take note of the OIDC issuer URL. It has the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
```bash
aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
```
3. Create a file named trust.json with the following content.
Replace `<OIDC_URL>` with your OIDC issuer URL **(don't include https://)** and `<AWS_ACCOUNT_NUMBER>` with your AWS account number.
```bash
# Replace these two with proper values
OIDC_URL="<OIDC_URL>"
AWS_ACC_NUM="<AWS_ACCOUNT_NUMBER>"

# Run this to create the trust.json file
cat <<EOF > trust.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::$AWS_ACC_NUM:oidc-provider/$OIDC_URL"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "$OIDC_URL:aud": "sts.amazonaws.com",
          "$OIDC_URL:sub": "system:serviceaccount:kubeflow:pipeline-runner"
        }
      }
    }
  ]
}
EOF
```
4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
```bash
aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
```
5. Edit your pipeline-runner service account.
```bash
kubectl edit -n kubeflow serviceaccount pipeline-runner
```
Add `eks.amazonaws.com/role-arn: <role_arn>` to the annotations, then save the file. Example: **(add only line 5)** An equivalent one-line command is shown after this example.
```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  annotations:
    eks.amazonaws.com/role-arn: <role_arn>
  creationTimestamp: "2020-04-16T05:48:06Z"
  labels:
    app: pipeline-runner
    app.kubernetes.io/component: pipelines-runner
    app.kubernetes.io/instance: pipelines-runner-0.2.0
    app.kubernetes.io/managed-by: kfctl
    app.kubernetes.io/name: pipelines-runner
    app.kubernetes.io/part-of: kubeflow
    app.kubernetes.io/version: 0.2.0
  name: pipeline-runner
  namespace: kubeflow
  resourceVersion: "11787"
  selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
  uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
secrets:
- name: pipeline-runner-token-dkjrk
```
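If you prefer not to edit the ServiceAccount by hand, the same annotation can be applied with a single command. This is a sketch assuming the default `kubeflow` namespace and the `pipeline-runner` service account shown above; substitute the role ARN returned in step 4.
```bash
# Hypothetical one-liner equivalent of the edit in step 5; adjust names if yours differ.
kubectl annotate -n kubeflow serviceaccount pipeline-runner \
    eks.amazonaws.com/role-arn=arn:aws:iam::<AWS_ACCOUNT_NUMBER>:role/kfp-example-pod-role --overwrite
```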

**Option 2** Store the IAM credentials as an `aws-secret` in the Kubernetes cluster, then use it in the components.
1. You need credentials for an IAM user with SageMakerFullAccess. Apply them to the k8s cluster.
Replace `AWS_ACCESS_KEY_IN_BASE64` and `AWS_SECRET_ACCESS_IN_BASE64`.
> Note: To get the base64 string you can do `echo -n $AWS_ACCESS_KEY_ID | base64`
```bash
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Secret
metadata:
  name: aws-secret
  namespace: kubeflow
type: Opaque
data:
  AWS_ACCESS_KEY_ID: <AWS_ACCESS_KEY_IN_BASE64>
  AWS_SECRET_ACCESS_KEY: <AWS_SECRET_ACCESS_IN_BASE64>
EOF
```
2. Use the stored `aws-secret` in pipeline code by adding this line to each component in your pipeline: `.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))`. A minimal sketch is shown after the links below.
[Kubeflow Document](https://www.kubeflow.org/docs/aws/pipeline/)
[Example Code](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py#L76) (uncomment this line)
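For reference, a minimal sketch of what step 2 looks like in a pipeline definition. The stand-in container op below is only illustrative (it is not one of the SageMaker components); the pattern is the same for any op in your pipeline.
```python
#!/usr/bin/env python3
# Minimal sketch: attaching the aws-secret credentials to a pipeline step.
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret

@dsl.pipeline(name='aws-secret example', description='Sketch of using aws-secret with a component')
def example_pipeline():
    # Any SageMaker component op is used the same way; this trivial op is just a placeholder.
    op = dsl.ContainerOp(name='stand-in', image='amazon/aws-cli', command=['aws', '--version'])
    op.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(example_pipeline, __file__ + '.zip')
```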

## Inputs to the pipeline

### Sample MNIST dataset

Use the following Python script to copy train_data, test_data, and valid_data to your bucket.
[Create a bucket](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) in the `us-east-1` region if you don't have one already.
For the purposes of this demonstration, all resources will be created in the us-east-1 region.

Create a new file named s3_sample_data_creator.py with the following content:
```python
import pickle, gzip, numpy, urllib.request, json
from urllib.parse import urlparse

###################################################################
# This is the only thing that you need to change to run this code
# Give the name of your S3 bucket
bucket = '<bucket-name>'

# If you are going to use the default values of the pipeline then
# give a bucket name which is in the us-east-1 region
###################################################################


# Load the dataset
urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')


# Upload dataset to S3
from sagemaker.amazon.common import write_numpy_to_dense_tensor
import io
import boto3

train_data_key = 'mnist_kmeans_example/train_data'
test_data_key = 'mnist_kmeans_example/test_data'
train_data_location = 's3://{}/{}'.format(bucket, train_data_key)
test_data_location = 's3://{}/{}'.format(bucket, test_data_key)
print('training data will be uploaded to: {}'.format(train_data_location))
print('test data will be uploaded to: {}'.format(test_data_location))

# Convert the training data into the format required by the SageMaker KMeans algorithm
buf = io.BytesIO()
write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])
buf.seek(0)

boto3.resource('s3').Bucket(bucket).Object(train_data_key).upload_fileobj(buf)

# Convert the test data into the format required by the SageMaker KMeans algorithm
buf = io.BytesIO()  # use a fresh buffer so the test file does not also contain the training records
write_numpy_to_dense_tensor(buf, test_set[0], test_set[1])
buf.seek(0)

boto3.resource('s3').Bucket(bucket).Object(test_data_key).upload_fileobj(buf)

# Save the validation data as CSV and upload it (used as inference input by the sample pipelines)
numpy.savetxt('valid-data.csv', valid_set[0], delimiter=',', fmt='%g')
s3_client = boto3.client('s3')
input_key = "{}/valid_data.csv".format("mnist_kmeans_example/input")
s3_client.upload_file('valid-data.csv', bucket, input_key)
```
Run this file: `python s3_sample_data_creator.py`

### Role Input

This role is used by the SageMaker jobs created by the KFP to access the S3 buckets and other AWS resources.
Run these commands to create the sagemaker-execution-role.
Note down the Role ARN. You need to give this Role ARN as an input to the pipeline.

```bash
TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
aws iam create-role --role-name kfp-example-sagemaker-execution-role --assume-role-policy-document "$TRUST"
aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
aws iam get-role --role-name kfp-example-sagemaker-execution-role --output text --query 'Role.Arn'

# note down the Role ARN.
```
@ -4,6 +4,10 @@ This sample is based on [this example](https://github.com/awslabs/amazon-sagemak
|
|||
|
||||
The sample goes through the workflow of creating a private workteam, creating data labeling jobs for that team, and running a training job using the new labeled data.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md)
(This pipeline does not use the MNIST dataset. Follow the instructions below to get the sample dataset.)
|
||||
|
||||
## Prep the dataset, label categories, and UI template
|
||||
|
||||
|
|
@ -34,26 +38,6 @@ client_ID = App client
|
|||
|
||||
> Note: Once you start a run of the pipeline, you will receive the Ground Truth labeling jobs at the "Labeling portal sign-in URL" link
|
||||
|
||||
## SageMaker permission
|
||||
|
||||
In order to run this pipeline, we need to prepare an IAM Role to run Sagemaker jobs. You need this `role_arn` to run a pipeline. Check [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) for details.
|
||||
|
||||
This pipeline also use aws-secret to get access to Sagemaker services, please also make sure you have a `aws-secret` in the kubeflow namespace.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: aws-secret
|
||||
namespace: kubeflow
|
||||
type: Opaque
|
||||
data:
|
||||
AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
|
||||
AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
|
||||
```
|
||||
|
||||
> Note: To get base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64`
|
||||
|
||||
|
||||
## Compiling the pipeline template
|
||||
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ def ground_truth_test(region='us-west-2',
|
|||
training_input_mode='Pipe',
|
||||
training_hyperparameters={"num_classes": "2", "num_training_samples": "14", "mini_batch_size": "2"},
|
||||
training_output_location='s3://your-bucket-name/mini-image-classification/training-output',
|
||||
training_instance_type='ml.p2.xlarge',
|
||||
training_instance_type='ml.m5.2xlarge',
|
||||
training_instance_count=1,
|
||||
training_volume_size=50,
|
||||
training_max_run_time=3600,
|
||||
|
|
@ -73,7 +73,7 @@ def ground_truth_test(region='us-west-2',
|
|||
user_pool=user_pool,
|
||||
user_groups=user_groups,
|
||||
client_id=client_id
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
ground_truth_train = sagemaker_gt_op(
|
||||
region=region,
|
||||
|
|
@ -93,7 +93,7 @@ def ground_truth_test(region='us-west-2',
|
|||
time_limit=ground_truth_time_limit,
|
||||
task_availibility=ground_truth_task_availibility,
|
||||
max_concurrent_tasks=ground_truth_max_concurrent_tasks
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
ground_truth_validation = sagemaker_gt_op(
|
||||
region=region,
|
||||
|
|
@ -113,7 +113,7 @@ def ground_truth_test(region='us-west-2',
|
|||
time_limit=ground_truth_time_limit,
|
||||
task_availibility=ground_truth_task_availibility,
|
||||
max_concurrent_tasks=ground_truth_max_concurrent_tasks
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
channelObj['ChannelName'] = 'train'
|
||||
channelObj['DataSource']['S3DataSource']['S3Uri'] = str(ground_truth_train.outputs['output_manifest_location'])
|
||||
|
|
@ -134,7 +134,8 @@ def ground_truth_test(region='us-west-2',
|
|||
max_run_time=training_max_run_time,
|
||||
model_artifact_path=training_output_location,
|
||||
role=role_arn
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
kfp.compiler.Compiler().compile(ground_truth_test, __file__ + '.zip')
|
||||
|
|
|
|||
|
|
@ -1,93 +1,10 @@
|
|||
The `mnist-classification-pipeline.py` sample runs a pipeline to train a classification model using K-Means with the MNIST dataset on SageMaker.
The `kmeans-hpo-pipeline.py` is a single-component hyperparameter optimisation pipeline with default values set to use K-Means.

If you do not have `train_data`, `test_data`, and `valid_data`, you can use the following code to get sample data.
(This data can be used for both of these pipelines.)
|
||||
|
||||
## The sample dataset
|
||||
## Prerequisites
|
||||
|
||||
This sample is based on the [Train a Model with a Built-in Algorithm and Deploy it](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1.html).
|
||||
|
||||
The sample trains and deploy a model based on the [MNIST dataset](http://www.deeplearning.net/tutorial/gettingstarted.html).
|
||||
|
||||
|
||||
Create an S3 bucket and use the following python script to copy `train_data`, `test_data`, and `valid_data.csv` to your buckets.
|
||||
(create the bucket in `us-west-2` region if you are gonna use default values of the pipeline)
|
||||
https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html
|
||||
|
||||
Create a new file named `s3_sample_data_creator.py` with following content :
|
||||
```python
|
||||
import pickle, gzip, numpy, urllib.request, json
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Load the dataset
|
||||
urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
|
||||
with gzip.open('mnist.pkl.gz', 'rb') as f:
|
||||
train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
|
||||
|
||||
|
||||
# Upload dataset to S3
|
||||
from sagemaker.amazon.common import write_numpy_to_dense_tensor
|
||||
import io
|
||||
import boto3
|
||||
|
||||
###################################################################
|
||||
# This is the only thing that you need to change to run this code
|
||||
# Give the name of your S3 bucket
|
||||
bucket = 'bucket-name'
|
||||
|
||||
# If you are gonna use the default values of the pipeline then
|
||||
# give a bucket name which is in us-west-2 region
|
||||
###################################################################
|
||||
|
||||
train_data_key = 'mnist_kmeans_example/train_data'
|
||||
test_data_key = 'mnist_kmeans_example/test_data'
|
||||
train_data_location = 's3://{}/{}'.format(bucket, train_data_key)
|
||||
test_data_location = 's3://{}/{}'.format(bucket, test_data_key)
|
||||
print('training data will be uploaded to: {}'.format(train_data_location))
|
||||
print('training data will be uploaded to: {}'.format(test_data_location))
|
||||
|
||||
# Convert the training data into the format required by the SageMaker KMeans algorithm
|
||||
buf = io.BytesIO()
|
||||
write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])
|
||||
buf.seek(0)
|
||||
|
||||
boto3.resource('s3').Bucket(bucket).Object(train_data_key).upload_fileobj(buf)
|
||||
|
||||
# Convert the test data into the format required by the SageMaker KMeans algorithm
|
||||
write_numpy_to_dense_tensor(buf, test_set[0], test_set[1])
|
||||
buf.seek(0)
|
||||
|
||||
boto3.resource('s3').Bucket(bucket).Object(test_data_key).upload_fileobj(buf)
|
||||
|
||||
# Convert the valid data into the format required by the SageMaker KMeans algorithm
|
||||
numpy.savetxt('valid-data.csv', valid_set[0], delimiter=',', fmt='%g')
|
||||
s3_client = boto3.client('s3')
|
||||
input_key = "{}/valid_data.csv".format("mnist_kmeans_example/input")
|
||||
s3_client.upload_file('valid-data.csv', bucket, input_key)
|
||||
|
||||
```
|
||||
|
||||
Run this file `python s3_sample_data_creator.py`
|
||||
## SageMaker permission
|
||||
|
||||
In order to run this pipeline, we need to prepare an IAM Role to run Sagemaker jobs. You need this `role_arn` to run a pipeline. Check [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) for details.
|
||||
|
||||
This pipeline also use aws-secret to get access to Sagemaker services, please also make sure you have a `aws-secret` in the kubeflow namespace.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: aws-secret
|
||||
namespace: kubeflow
|
||||
type: Opaque
|
||||
data:
|
||||
AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
|
||||
AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
|
||||
```
|
||||
|
||||
> Note: To get base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64`
|
||||
Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md)
|
||||
|
||||
|
||||
## Compiling the pipeline template
|
||||
|
|
@ -98,6 +15,7 @@ Follow the guide to [building a pipeline](https://www.kubeflow.org/docs/guides/p
|
|||
dsl-compile --py mnist-classification-pipeline.py --output mnist-classification-pipeline.tar.gz
|
||||
```
|
||||
|
||||
|
||||
## Deploying the pipeline
|
||||
|
||||
Open the Kubeflow pipelines UI. Create a new pipeline, and then upload the compiled specification (`.tar.gz` file) as a new pipeline template.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import kfp
|
||||
import json
|
||||
import copy
|
||||
|
|
@ -38,7 +39,7 @@ channelObjList.append(copy.deepcopy(channelObj))
|
|||
name='MNIST HPO test pipeline',
|
||||
description='SageMaker hyperparameter tuning job test'
|
||||
)
|
||||
def hpo_test(region='us-west-2',
|
||||
def hpo_test(region='us-east-1',
|
||||
hpo_job_name='HPO-kmeans-sample',
|
||||
image='',
|
||||
algorithm_name='K-Means',
|
||||
|
|
@ -56,7 +57,7 @@ def hpo_test(region='us-west-2',
|
|||
channels=channelObjList,
|
||||
output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
|
||||
output_encryption_key='',
|
||||
instance_type='ml.p2.16xlarge',
|
||||
instance_type='ml.m5.2xlarge',
|
||||
instance_count=1,
|
||||
volume_size=50,
|
||||
max_num_jobs=1,
|
||||
|
|
@ -114,7 +115,8 @@ def hpo_test(region='us-west-2',
|
|||
checkpoint_config=checkpoint_config,
|
||||
tags=tags,
|
||||
role=role_arn,
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
kfp.compiler.Compiler().compile(hpo_test, __file__ + '.zip')
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import kfp
|
||||
import json
|
||||
import copy
|
||||
|
|
@ -44,8 +45,8 @@ hpoChannels.append(copy.deepcopy(channelObj))
|
|||
name='MNIST Classification pipeline',
|
||||
description='MNIST Classification using KMEANS in SageMaker'
|
||||
)
|
||||
def mnist_classification(region='us-west-2',
|
||||
image='174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:1',
|
||||
def mnist_classification(region='us-east-1',
|
||||
image='382416733822.dkr.ecr.us-east-1.amazonaws.com/kmeans:1',
|
||||
training_input_mode='File',
|
||||
hpo_strategy='Bayesian',
|
||||
hpo_metric_name='test:msd',
|
||||
|
|
@ -61,7 +62,7 @@ def mnist_classification(region='us-west-2',
|
|||
hpo_checkpoint_config={},
|
||||
output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
|
||||
output_encryption_key='',
|
||||
instance_type='ml.p2.16xlarge',
|
||||
instance_type='ml.m5.2xlarge',
|
||||
instance_count=1,
|
||||
volume_size=50,
|
||||
hpo_max_num_jobs=9,
|
||||
|
|
@ -115,7 +116,7 @@ def mnist_classification(region='us-west-2',
|
|||
max_wait_time=hpo_max_wait_time,
|
||||
checkpoint_config=hpo_checkpoint_config,
|
||||
role=role_arn,
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
training = sagemaker_train_op(
|
||||
region=region,
|
||||
|
|
@ -136,7 +137,7 @@ def mnist_classification(region='us-west-2',
|
|||
max_wait_time=train_max_wait_time,
|
||||
checkpoint_config=train_checkpoint_config,
|
||||
role=role_arn,
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
create_model = sagemaker_model_op(
|
||||
region=region,
|
||||
|
|
@ -146,13 +147,13 @@ def mnist_classification(region='us-west-2',
|
|||
model_artifact_url=training.outputs['model_artifact_url'],
|
||||
network_isolation=network_isolation,
|
||||
role=role_arn
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
prediction = sagemaker_deploy_op(
|
||||
region=region,
|
||||
endpoint_url=endpoint_url,
|
||||
model_name_1=create_model.output,
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
batch_transform = sagemaker_batch_transform_op(
|
||||
region=region,
|
||||
|
|
@ -169,7 +170,7 @@ def mnist_classification(region='us-west-2',
|
|||
split_type=batch_transform_split_type,
|
||||
compression_type=batch_transform_compression_type,
|
||||
output_location=batch_transform_ouput
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)
|
||||
|
||||
if __name__ == '__main__':
|
||||
kfp.compiler.Compiler().compile(mnist_classification, __file__ + '.zip')
|
||||
|
|
|
|||
|
|
@ -2,100 +2,17 @@
|
|||
|
||||
An example pipeline with only [train component](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker/train).
|
||||
|
||||
# Prerequisites
|
||||
1. Install Kubeflow on an EKS cluster in AWS. https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/
|
||||
2. Get and store data in S3 buckets. You can get sample data using this code.
|
||||
Create a new file `s3_sample_data_creator.py` with following content :
|
||||
```buildoutcfg
|
||||
import io
|
||||
import boto3
|
||||
import pickle, gzip, numpy, urllib.request, json
|
||||
from urllib.parse import urlparse
|
||||
from sagemaker.amazon.common import write_numpy_to_dense_tensor
|
||||
|
||||
|
||||
###########################################################################################
|
||||
# This is the only thing that you need to change in this code
|
||||
# Give the name of your S3 bucket
|
||||
# To use the example input below give a bucket name which is in us-east-1 region
|
||||
bucket = '<bucket-name>'
|
||||
## Prerequisites
|
||||
|
||||
###########################################################################################
|
||||
|
||||
# Load the dataset
|
||||
urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
|
||||
with gzip.open('mnist.pkl.gz', 'rb') as f:
|
||||
train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
|
||||
Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md)
|
||||
|
||||
|
||||
# Upload dataset to S3
|
||||
data_key = 'mnist_kmeans_example/data'
|
||||
data_location = 's3://{}/{}'.format(bucket, data_key)
|
||||
print('Data will be uploaded to: {}'.format(data_location))
|
||||
|
||||
# Convert the training data into the format required by the SageMaker KMeans algorithm
|
||||
buf = io.BytesIO()
|
||||
write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])
|
||||
buf.seek(0)
|
||||
|
||||
boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf)
|
||||
```
|
||||
Run this file `python s3_sample_data_creator.py`
|
||||
3. Prepare an IAM role with permissions to run SageMaker jobs and access to S3 buckets.
|
||||
|
||||
create a new file "trust.json" with following content
|
||||
```buildoutcfg
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Sid": "",
|
||||
"Effect": "Allow",
|
||||
"Principal": {
|
||||
"Service": "sagemaker.amazonaws.com"
|
||||
},
|
||||
"Action": "sts:AssumeRole"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
```buildoutcfg
|
||||
|
||||
# run these commands to create a role named "SageMakerExecutorKFP" with SageMaker and S3 access
|
||||
aws iam create-role --role-name SageMakerExecutorKFP --assume-role-policy-document file://trust.json
|
||||
aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess --role-name SageMakerExecutorKFP
|
||||
aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --role-name SageMakerExecutorKFP
|
||||
|
||||
# Note down the role ARN
|
||||
aws iam get-role --role-name SageMakerExecutorKFP # | jq .Role.Arn
|
||||
```
|
||||
4. Add 'aws-secret' to your Kubeflow namespace.
|
||||
```
|
||||
# 1. get aws key and secret in base64 format:
|
||||
|
||||
echo -n "<AWS_ACCESS_KEY_ID>" | base64
|
||||
echo -n "<AWS_SECRET_ACCESS_KEY>" | base64
|
||||
|
||||
# 2. Create new file secret.yaml with following content
|
||||
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: aws-secret
|
||||
namespace: kubeflow
|
||||
type: Opaque
|
||||
data:
|
||||
AWS_ACCESS_KEY_ID: <base64_AWS_ACCESS_KEY_ID>
|
||||
AWS_SECRET_ACCESS_KEY: <base64_AWS_SECRET_ACCESS_KEY>
|
||||
|
||||
# 3. Now apply to the cluster's kubeflow namespace:
|
||||
|
||||
kubectl -n kubeflow apply -f secret.yaml
|
||||
```
|
||||
5. Compile the pipeline:
|
||||
## Steps
|
||||
1. Compile the pipeline:
|
||||
`dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz`
|
||||
6. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
|
||||
7. Once the pipeline completes, you can see the outputs under 'Output parameters' in the HPO component's Input/Output section.
|
||||
2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
|
||||
3. Once the pipeline completes, you can see the outputs under 'Output parameters' in the training component's Input/Output section.
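If you prefer to submit the run from Python instead of the UI, here is a rough sketch using the KFP SDK. The host address, run name, and the `role` argument name are assumptions to adapt to your setup (check the parameter names in `training-pipeline.py`).
```python
import kfp

# Assumes the KFP API is reachable, e.g. via `kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80`.
client = kfp.Client(host='http://localhost:8080')
client.create_run_from_pipeline_package(
    'training-pipeline.tar.gz',
    arguments={
        # Placeholder value; use the SageMaker execution role ARN you created earlier.
        'role': 'arn:aws:iam::<account_id>:role/SageMakerExecutorKFP',
    },
    run_name='simple-train-sample',
)
```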
|
||||
|
||||
Example inputs to this pipeline :
|
||||
```buildoutcfg
|
||||
|
|
@ -111,7 +28,7 @@ channels : In this JSON, along with other parameters you need to pass the S3 Uri
|
|||
"ChannelName": "train",
|
||||
"DataSource": {
|
||||
"S3DataSource": {
|
||||
"S3Uri": "s3://<your_bucket_name>/mnist_kmeans_example/data",
|
||||
"S3Uri": "s3://<your_bucket_name>/mnist_kmeans_example/train_data",
|
||||
"S3DataType": "S3Prefix",
|
||||
"S3DataDistributionType": "FullyReplicated"
|
||||
}
|
||||
|
|
@ -123,7 +40,7 @@ channels : In this JSON, along with other parameters you need to pass the S3 Uri
|
|||
}
|
||||
]
|
||||
|
||||
instance_type : ml.p2.xlarge
|
||||
instance_type : ml.m5.2xlarge
|
||||
instance_count : 1
|
||||
volume_size : 50
|
||||
max_run_time : 3600
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
|
||||
# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
|
||||
|
||||
import kfp
|
||||
import json
|
||||
import copy
|
||||
|
|
@ -41,7 +44,7 @@ def training(
|
|||
training_input_mode='File',
|
||||
hyperparameters={"k": "10", "feature_dim": "784"},
|
||||
channels=channelObjList,
|
||||
instance_type='ml.p2.xlarge',
|
||||
instance_type='ml.m5.2xlarge',
|
||||
instance_count=1,
|
||||
volume_size=50,
|
||||
max_run_time=3600,
|
||||
|
|
@ -73,7 +76,8 @@ def training(
|
|||
max_wait_time=max_wait_time,
|
||||
checkpoint_config=checkpoint_config,
|
||||
role=role,
|
||||
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
)#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
kfp.compiler.Compiler().compile(training, __file__ + '.zip')
|
||||
|
|
|
|||