From f2a860b84c11022a7eef5b73f7049d6d885c97db Mon Sep 17 00:00:00 2001 From: Nicholas Thomson Date: Wed, 20 May 2020 14:18:19 -0700 Subject: [PATCH] [AWS SageMaker] Integration tests automation (#3768) * # This is a combination of 5 commits. # This is the 1st commit message: Add initial scripts # This is the commit message #2: Add working pytest script # This is the commit message #3: Add initial scripts # This is the commit message #4: Add environment variable files # This is the commit message #5: Remove old cluster script * Add initial scripts Add working pytest script Add initial scripts Add environment variable files Remove old cluster script Update pipeline credentials to OIDC Add initial scripts Add working pytest script Add initial scripts Add working pytest script * Remove debugging mark * Update example EKS cluster name * Remove quiet from Docker build * Manually pass env * Update env list vars as string * Update use array directly * Update variable array to export * Update to using read for splitting * Move to helper script * Update export from CodeBuild * Add wait for minio * Update kubectl wait timeout * Update minor changes for PR * Update integration test buildspec to quiet build * Add region to delete EKS * Add wait for pods * Updated README * Add fixed interval wait * Fix CodeBuild step order * Add file lock for experiment ID * Fix missing pytest parameter * Update run create only once * Add filelock to conda env * Update experiment name ensuring creation each time * Add try/catch with create experiment * Remove caching from KFP deployment * Remove disable KFP caching * Move .gitignore changes to inside component * Add blank line to default .gitignore --- components/aws/sagemaker/.gitignore | 2 + .../codebuild/integration-test.buildspec.yml | 16 +- .../scripts/construct_environment_array.sh | 10 ++ .../codebuild/unit-test.buildspec.yml | 3 + .../tests/integration_tests/.env.example | 12 ++ .../tests/integration_tests/Dockerfile | 43 +++++ 
.../tests/integration_tests/README.md | 51 ++---- .../tests/integration_tests/conftest.py | 26 ++- .../tests/integration_tests/environment.yml | 1 + .../config/kmeans-mnist-endpoint/config.yaml | 1 + .../definition/create_endpoint_pipeline.py | 4 +- .../definition/create_model_pipeline.py | 2 +- .../resources/definition/hpo_pipeline.py | 2 +- .../resources/definition/training_pipeline.py | 2 +- .../definition/transform_job_pipeline.py | 4 +- .../scripts/generate_iam_role | 68 +++++++ .../scripts/generate_trust_policy | 39 ++++ .../scripts/run_integration_tests | 168 ++++++++++++++++++ 18 files changed, 404 insertions(+), 50 deletions(-) create mode 100644 components/aws/sagemaker/.gitignore create mode 100755 components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh create mode 100644 components/aws/sagemaker/tests/integration_tests/.env.example create mode 100644 components/aws/sagemaker/tests/integration_tests/Dockerfile create mode 100755 components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role create mode 100755 components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy create mode 100755 components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests diff --git a/components/aws/sagemaker/.gitignore b/components/aws/sagemaker/.gitignore new file mode 100644 index 0000000000..58c9068fc5 --- /dev/null +++ b/components/aws/sagemaker/.gitignore @@ -0,0 +1,2 @@ +# Any environment variable files +**/*/.env \ No newline at end of file diff --git a/components/aws/sagemaker/codebuild/integration-test.buildspec.yml b/components/aws/sagemaker/codebuild/integration-test.buildspec.yml index 0ca12b06c6..09dafe53cd 100644 --- a/components/aws/sagemaker/codebuild/integration-test.buildspec.yml +++ b/components/aws/sagemaker/codebuild/integration-test.buildspec.yml @@ -1,14 +1,24 @@ version: 0.2 + +env: + variables: + CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS 
EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION" + phases: build: commands: - cd components/aws - docker build . -f ./sagemaker/tests/integration_tests/Dockerfile -t amazon/integration-test-image --quiet + - cd sagemaker/codebuild/scripts && export CONTAINER_VARIABLE_ARGUMENTS="$(./construct_environment_array.sh)" + # Run the container and copy the results to /tmp - # Passes all host environment variables through to the container - - docker run --name integration-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/integration-test-image - - docker cp integration-test-container:/app/tests/integration_tests/integration_tests.log /tmp/results.xml + # Passes all listed host environment variables through to the container + - docker run --name integration-test-container $(echo $CONTAINER_VARIABLE_ARGUMENTS) amazon/integration-test-image + + post_build: + commands: + - docker cp integration-test-container:/tests/integration_tests/integration_tests.log /tmp/results.xml - docker rm -f integration-test-container reports: diff --git a/components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh b/components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh new file mode 100755 index 0000000000..249108d8bf --- /dev/null +++ b/components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# This script breaks up a string of environment variable names into a list of +# parameters that `docker run` accepts. This needs to be made into a script +# for CodeBuild because these commands do not run in dash - the default terminal +# on the CodeBuild standard images. 
+ +IFS=' ' read -a variable_array <<< $CONTAINER_VARIABLES +printf -v CONTAINER_VARIABLE_ARGUMENTS -- "--env %s " "${variable_array[@]}" +echo $CONTAINER_VARIABLE_ARGUMENTS \ No newline at end of file diff --git a/components/aws/sagemaker/codebuild/unit-test.buildspec.yml b/components/aws/sagemaker/codebuild/unit-test.buildspec.yml index a366094bfa..4d68849447 100644 --- a/components/aws/sagemaker/codebuild/unit-test.buildspec.yml +++ b/components/aws/sagemaker/codebuild/unit-test.buildspec.yml @@ -8,6 +8,9 @@ phases: # Run the container and copy the results to /tmp # Passes all host environment variables through to the container - docker run --name unit-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/unit-test-image + + post_build: + commands: - docker cp unit-test-container:/app/tests/unit_tests/unit_tests.log /tmp/results.xml - docker rm -f unit-test-container diff --git a/components/aws/sagemaker/tests/integration_tests/.env.example b/components/aws/sagemaker/tests/integration_tests/.env.example new file mode 100644 index 0000000000..33c04cd60f --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/.env.example @@ -0,0 +1,12 @@ +# If you would like to override the credentials for the container +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= +# AWS_SESSION_TOKEN= + +REGION=us-east-1 + +SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-Example +S3_DATA_BUCKET=my-data-bucket + +# If you hope to use an existing EKS cluster, rather than creating a new one. 
+# EKS_EXISTING_CLUSTER=my-eks-cluster \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/Dockerfile b/components/aws/sagemaker/tests/integration_tests/Dockerfile new file mode 100644 index 0000000000..75c66f8c1b --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/Dockerfile @@ -0,0 +1,43 @@ +FROM continuumio/miniconda:4.7.12 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + wget \ + git \ + jq + +# Install eksctl +RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.19.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp \ + && mv /tmp/eksctl /usr/local/bin + +# Install aws-iam-authenticator +RUN curl -S -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.16.8/2020-04-16/bin/linux/amd64/aws-iam-authenticator \ + && chmod +x /usr/local/bin/aws-iam-authenticator + +# Install Kubectl +RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.0/bin/linux/amd64/kubectl \ + && chmod +x ./kubectl \ + && mv ./kubectl /usr/local/bin/kubectl + +# Install Argo CLI +RUN curl -sSL -o /usr/local/bin/argo https://github.com/argoproj/argo/releases/download/v2.8.0/argo-linux-amd64 \ + && chmod +x /usr/local/bin/argo + +# Copy conda environment early to avoid cache busting +COPY ./sagemaker/tests/integration_tests/environment.yml environment.yml + +# Create conda environment for running tests and set as start-up environment +RUN conda env create -f environment.yml +RUN echo "source activate kfp_test_env" > ~/.bashrc +ENV PATH "/opt/conda/envs/kfp_test_env/bin":$PATH + +# Environment variables to be used by tests +ENV REGION="us-west-2" +ENV SAGEMAKER_EXECUTION_ROLE_ARN="arn:aws:iam::1234567890:role/sagemaker-role" +ENV S3_DATA_BUCKET="kfp-test-data" +ENV MINIO_LOCAL_PORT=9000 +ENV KFP_NAMESPACE="kubeflow" + +COPY ./sagemaker/ . 
+ +ENTRYPOINT [ "/bin/bash", "./tests/integration_tests/scripts/run_integration_tests" ] \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/README.md b/components/aws/sagemaker/tests/integration_tests/README.md index 898d666d61..e43fbe525b 100644 --- a/components/aws/sagemaker/tests/integration_tests/README.md +++ b/components/aws/sagemaker/tests/integration_tests/README.md @@ -1,42 +1,21 @@ ## Requirements -1. [Conda](https://docs.conda.io/en/latest/miniconda.html) -1. [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) -1. Argo CLI: [Mac](https://github.com/argoproj/homebrew-tap), [Linux](https://eksworkshop.com/advanced/410_batch/install/) -1. K8s cluster with Kubeflow pipelines > 0.4.0 installed -1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with a SageMakerFullAccess and S3FullAccess -1. IAM User credentials with SageMakerFullAccess permissions +1. [Docker](https://www.docker.com/) +1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with a SageMakerFullAccess and AmazonS3FullAccess +1. IAM User credentials with SageMakerFullAccess, AWSCloudFormationFullAccess, IAMFullAccess, AmazonEC2FullAccess, AmazonS3FullAccess permissions ## Creating S3 buckets with datasets -Change the bucket name and run the python script `[s3_sample_data_creator.py](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset)` to create S3 buckets with mnist dataset in the region where you want to run the tests +In the following Python script, change the bucket name and run the [`s3_sample_data_creator.py`](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset) to create an S3 bucket with the sample mnist dataset in the region where you want to run the tests. ## Step to run integration tests -1. 
Configure AWS credentials with access to EKS cluster -1. Fetch kubeconfig to `~/.kube/config` or set `KUBECONFIG` environment variable to point to kubeconfig of the cluster -1. Create a [secret](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/) named `aws-secret` in kubeflow namespace with credentials of IAM User for SageMakerFullAccess - ```yaml - apiVersion: v1 - kind: Secret - metadata: - name: aws-secret - namespace: kubeflow - type: Opaque - data: - AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY - AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS - ``` - - > Note: To get base64 string, run `echo -n $AWS_ACCESS_KEY_ID | base64` -1. Create conda environment using environment.yml for running tests. Run `conda env create -f environment.yml` -1. Activate the conda environment `conda activate kfp_test_env` -1. Run port-forward to minio service in background. Example: `kubectl port-forward svc/minio-service 9000:9000 -n kubeflow &` -1. Provide the following arguments to pytest: - 1. `region`: AWS region where test will run. Default - us-west-2 - 1. `role-arn`: SageMaker execution IAM role ARN - 1. `s3-data-bucket`: Regional S3 bucket in which test data is hosted - 1. `minio-service-port`: Localhost port to which minio service is mapped to. Default - 9000 - 1. `kfp-namespace`: Cluster namespace where kubeflow pipelines is installed. Default - Kubeflow -1. cd into this directory and run - ``` - pytest --region <> --role-arn <> --s3-data-bucket <> --minio-service-port <> --kfp-namespace <> - ``` +1. Copy the `.env.example` file to `.env` and in the following steps modify the fields of this new file: + 1. Configure the AWS credentials fields with those of your IAM User. + 1. Update the `SAGEMAKER_EXECUTION_ROLE_ARN` with that of your role created earlier. + 1. Update the `S3_DATA_BUCKET` parameter with the name of the bucket created earlier. + 1. 
(Optional) If you have already created an EKS cluster for testing, replace the `EKS_EXISTING_CLUSTER` field with its name. +1. Build the image by doing the following: + 1. Navigate to the `components/aws` directory. + 1. Run `docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test` +1. Run the image, injecting your environment variable files: + 1. Navigate to the `components/aws` directory. + 1. Run `docker run --env-file sagemaker/tests/integration_tests/.env amazon/integration_test` \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/conftest.py b/components/aws/sagemaker/tests/integration_tests/conftest.py index 47e6cb9ea4..52c29656cc 100644 --- a/components/aws/sagemaker/tests/integration_tests/conftest.py +++ b/components/aws/sagemaker/tests/integration_tests/conftest.py @@ -5,6 +5,7 @@ import os import utils from datetime import datetime +from filelock import FileLock def pytest_addoption(parser): @@ -86,12 +87,29 @@ def kfp_client(): kfp_installed_namespace = utils.get_kfp_namespace() return kfp.Client(namespace=kfp_installed_namespace) - -@pytest.fixture(scope="session") -def experiment_id(kfp_client): - exp_name = datetime.now().strftime("%Y-%m-%d") +def get_experiment_id(kfp_client): + exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M") try: experiment = kfp_client.get_experiment(experiment_name=exp_name) except ValueError: experiment = kfp_client.create_experiment(name=exp_name) return experiment.id + +@pytest.fixture(scope="session") +def experiment_id(kfp_client, tmp_path_factory, worker_id): + if not worker_id: + return get_experiment_id(kfp_client) + + # Locking taken as an example from + # https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once + # get the temp directory shared by all workers + root_tmp_dir = tmp_path_factory.getbasetemp().parent + + fn = root_tmp_dir / "experiment_id" + with FileLock(str(fn) + ".lock"): + if fn.is_file(): 
+ data = fn.read_text() + else: + data = get_experiment_id(kfp_client) + fn.write_text(data) + return data \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/environment.yml b/components/aws/sagemaker/tests/integration_tests/environment.yml index 565777dc8d..90c7645bc6 100644 --- a/components/aws/sagemaker/tests/integration_tests/environment.yml +++ b/components/aws/sagemaker/tests/integration_tests/environment.yml @@ -12,6 +12,7 @@ dependencies: - pyyaml=5.3.* - flake8=3.7.* - flake8-black=0.1.* + - filelock=3.0.* - pip: - kubernetes==11.0.* - kfp==0.5.* diff --git a/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml b/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml index e961a588b9..f4a413c828 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml +++ b/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml @@ -15,6 +15,7 @@ Arguments: variant_name_1: variant-1 instance_type_1: ml.m4.xlarge initial_instance_count_1: 1 + initial_variant_weight_1: 1.0 network_isolation: "True" role: ((ROLE_ARN)) \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py index 801b3458f4..8b28e52eac 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py @@ -34,7 +34,7 @@ def create_endpoint_pipeline( model_artifact_url=model_artifact_url, network_isolation=network_isolation, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) sagemaker_deploy_op( region=region, 
@@ -46,7 +46,7 @@ def create_endpoint_pipeline( instance_type_1=instance_type_1, initial_instance_count_1=initial_instance_count_1, initial_variant_weight_1=initial_variant_weight_1, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py index a7fa0afe05..75f4f6a26e 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py @@ -26,7 +26,7 @@ def create_model_pipeline( model_artifact_url=model_artifact_url, network_isolation=network_isolation, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py index 721658355e..cd1a50fb57 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py @@ -56,7 +56,7 @@ def hpo_pipeline( network_isolation=network_isolation, max_wait_time=max_wait_time, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py index e69d103e56..ad8eab23bf 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py +++ 
b/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py @@ -46,7 +46,7 @@ def training_pipeline( max_wait_time=max_wait_time, checkpoint_config=checkpoint_config, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py index 8ac879f81c..e8b38697f3 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py @@ -40,7 +40,7 @@ def batch_transform_pipeline( model_artifact_url=model_artifact_url, network_isolation=network_isolation, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) sagemaker_batch_transform_op( region=region, @@ -57,7 +57,7 @@ def batch_transform_pipeline( split_type=split_type, compression_type=compression_type, output_location=output_location, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role b/components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role new file mode 100755 index 0000000000..7e4d1e9b14 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# Helper script to generate an IAM Role needed to install role-based authentication to a KFP service account. 
+# +# Run as: +# $ ./generate_iam_role ${cluster_arn/cluster_name} ${role_name} ${cluster_region} [optional: ${service_namespace} ${service_account}] +# + +CLUSTER_ARN="${1}" +ROLE_NAME="${2}" +CLUSTER_REGION="${3:-us-east-1}" +SERVICE_NAMESPACE="${4:-kubeflow}" +SERVICE_ACCOUNT="${5:-pipeline-runner}" +aws_account=$(aws sts get-caller-identity --query Account --output text) +trustfile="trust.json" + +cwd=$(dirname $(realpath $0)) + +# if using an existing cluster, use the cluster arn to get the region and cluster name +# example, cluster_arn=arn:aws:eks:us-east-1:12345678910:cluster/test +cluster_name=$(echo ${CLUSTER_ARN} | cut -d'/' -f2) + +# A function to get the OIDC_ID associated with an EKS cluster +function get_oidc_id { + # TODO: Ideally this should be based on version compatibility instead of command failure + eksctl utils associate-iam-oidc-provider --cluster ${cluster_name} --region ${CLUSTER_REGION} --approve + if [[ $? -ge 1 ]]; then + eksctl utils associate-iam-oidc-provider --name ${cluster_name} --region ${CLUSTER_REGION} --approve + fi + + local oidc=$(aws eks describe-cluster --name ${cluster_name} --region ${CLUSTER_REGION} --query cluster.identity.oidc.issuer --output text) + oidc_id=$(echo ${oidc} | rev | cut -d'/' -f1 | rev) +} + +# A function that generates an IAM role for the given account, cluster, namespace, region +# Parameter: +# $1: Name of the trust file to generate. +function create_namespaced_iam_role { + local trustfile="${1}" + # Check if role already exists + aws iam get-role --role-name ${ROLE_NAME} + if [[ $? -eq 0 ]]; then + echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding." 
+ else + echo "IAM Role does not exist, creating a new Role for the cluster" + aws iam create-role --role-name ${ROLE_NAME} --assume-role-policy-document file://${trustfile} --output=text --query "Role.Arn" + aws iam attach-role-policy --role-name ${ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess + fi +} + +# Remove the generated trust file +# Parameter: +# $1: Name of the trust file to delete. +function delete_generated_file { + rm "${1}" +} + +echo "Get the OIDC ID for the cluster" +get_oidc_id +echo "Delete the trust json file if it already exists" +delete_generated_file "${trustfile}" +echo "Generate a trust json" +"$cwd"/generate_trust_policy ${CLUSTER_REGION} ${aws_account} ${oidc_id} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > "${trustfile}" +echo "Create the IAM Role using these values" +create_namespaced_iam_role "${trustfile}" +echo "Cleanup for the next run" +delete_generated_file "${trustfile}" + diff --git a/components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy b/components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy new file mode 100755 index 0000000000..1c10fa10fe --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# Helper script to generate the trust policy needed to assign role-based authentication to a KFP service account. +# +# Run as: +# $ ./generate_trust_policy ${EKS_CLUSTER_REGION} ${AWS_ACCOUNT_ID} ${OIDC_ID} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > trust.json +# +# For example: +# $ ./generate_trust_policy us-west-2 123456789012 D48675832CA65BD10A532F597OIDCID > trust.json +# This will create a file `trust.json` containing a role policy that enables the KFP service runner in an EKS cluster to assume AWS roles. +# +# The SERVICE_NAMESPACE parameter is for when you want to run Kubeflow in a custom namespace other than "kubeflow". 
+# The SERVICE_ACCOUNT parameter is for when you want to give permissions to a service account other than the default "pipeline-runner". + +cluster_region="$1" +account_number="$2" +oidc_id="$3" +service_namespace="${4}" +service_account="${5}" + +printf '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "arn:aws:iam::'"${account_number}"':oidc-provider/oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"'" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"':aud": "sts.amazonaws.com", + "oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"':sub": "system:serviceaccount:'"${service_namespace}"':'"${service_account}"'" + } + } + } + ] +} +' \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests b/components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests new file mode 100755 index 0000000000..6ad3fb9db2 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests @@ -0,0 +1,168 @@ +#!/usr/bin/env bash + +set -u +set -o pipefail + +usage(){ + echo "Usage: $0 -n [-r ]" + exit 1 +} + +cwd=$(dirname $(realpath $0)) + +### Input parameters +DEPLOY_NAME="sagemaker-kfp-"$(date '+%Y-%m-%d-%H-%M-%S')"" # The name given to the entire deployment (tagging all resources) +REGION=${REGION:-"$(aws configure get region)"} # Deployment region + +### Configuration parameters +EKS_EXISTING_CLUSTER=${EKS_EXISTING_CLUSTER:-""} # Use an existing EKS cluster +EKS_CLUSTER_VERSION=${EKS_CLUSTER_VERSION:-"1.15"} # EKS cluster K8s version +EKS_NODE_COUNT=${EKS_NODE_COUNT:-"1"} # The initial node count of the EKS cluster +EKS_PUBLIC_SUBNETS=${EKS_PUBLIC_SUBNETS:-""} +EKS_PRIVATE_SUBNETS=${EKS_PRIVATE_SUBNETS:-""} + +### Testing parameters +MINIO_LOCAL_PORT=${MINIO_LOCAL_PORT:-9000} 
+KFP_NAMESPACE=${KFP_NAMESPACE:-"kubeflow"} +KFP_SERVICE_ACCOUNT=${KFP_SERVICE_ACCOUNT:-"pipeline-runner"} + +PYTEST_MARKER=${PYTEST_MARKER:-""} +S3_DATA_BUCKET=${S3_DATA_BUCKET:-""} +SAGEMAKER_EXECUTION_ROLE_ARN=${SAGEMAKER_EXECUTION_ROLE_ARN:-""} + +while getopts ":n:r:s:" opt; do + case $opt in + n) + DEPLOY_NAME="$OPTARG" + ;; + s) + S3_DATA_BUCKET="$OPTARG" + ;; + r) + REGION="$OPTARG" + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + :) + echo "Option -$OPTARG requires an argument." >&2 + exit 1 + ;; + esac +done + +# Ensure a deployment name was specified +if [ "$DEPLOY_NAME" == "" ]; then + echo "Missing deployment name" + usage + exit 1 +fi + +if [ "$S3_DATA_BUCKET" == "" ]; then + echo "Missing S3 data bucket name" + usage + exit 1 +fi + +function cleanup() { + set +e + + cleanup_kfp + delete_generated_role + + if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then + delete_eks + fi +} + +# Set the trap to clean up resources in the case of an error +trap cleanup EXIT +set -e + +function launch_eks() { + EKS_CLUSTER_NAME="${DEPLOY_NAME}-eks-cluster" + + echo "[Creating EKS] Launching EKS cluster $EKS_CLUSTER_NAME" + + eksctl_args=( --managed --nodes "${EKS_NODE_COUNT}" --node-type=c5.xlarge --timeout=30m --region "${REGION}" --auto-kubeconfig --version "${EKS_CLUSTER_VERSION}" ) + [ ! -z "${EKS_PUBLIC_SUBNETS}" ] && eksctl_args+=( --vpc-public-subnets="${EKS_PUBLIC_SUBNETS}" ) + [ ! 
-z "${EKS_PRIVATE_SUBNETS}" ] && eksctl_args+=( --vpc-private-subnets="${EKS_PRIVATE_SUBNETS}" ) + + eksctl create cluster "${EKS_CLUSTER_NAME}" "${eksctl_args[@]}" + + aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$REGION" + + echo "[Creating EKS] $EKS_CLUSTER_NAME launched" +} + +function delete_eks() { + eksctl delete cluster --name "${EKS_CLUSTER_NAME}" --region "${REGION}" +} + +function install_kfp() { + echo "[Installing KFP] Applying KFP manifests" + + PIPELINE_VERSION=0.5.1 + kubectl apply -k github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION + kubectl wait --for condition=established --timeout=60s crd/applications.app.k8s.io + kubectl apply -k github.com/kubeflow/pipelines/manifests/kustomize/env/dev?ref=$PIPELINE_VERSION + + echo "[Installing KFP] Port-forwarding Minio" + + kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=minio --timeout=5m + kubectl port-forward -n kubeflow svc/minio-service $MINIO_LOCAL_PORT:9000 & + MINIO_PID=$! 
+ + echo "[Installing KFP] Minio port-forwarded to ${MINIO_LOCAL_PORT}" + + echo "[Installing KFP] Waiting for pods to stand up" + + kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=ml-pipeline --timeout=5m + + # TODO: Replace with calculated waits + # For the moment we don't know which pods will be slower, so we are just relying on a fixed interval + sleep 3m + + echo "[Installing KFP] Pipeline pods are ready" +} + +function generate_iam_role_name() { + OIDC_ROLE_NAME="$(echo "${DEPLOY_NAME}-kubeflow-role" | cut -c1-64)" + OIDC_ROLE_ARN="arn:aws:iam::$(aws sts get-caller-identity --query=Account --output=text):role/${OIDC_ROLE_NAME}" +} + +function install_generated_role() { + kubectl patch serviceaccount -n ${KFP_NAMESPACE} ${KFP_SERVICE_ACCOUNT} --patch '{"metadata": {"annotations": {"eks.amazonaws.com/role-arn": "'"${OIDC_ROLE_ARN}"'"}}}' +} + +function delete_generated_role() { + # Delete the role associated with the cluster thats being deleted + aws iam detach-role-policy --role-name "${OIDC_ROLE_NAME}" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess + aws iam delete-role --role-name "${OIDC_ROLE_NAME}" +} + +function cleanup_kfp() { + # Clean up Minio + if [ ! -z "${MINIO_PID}" ]; then + kill -9 $MINIO_PID || true + fi +} + +if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then + launch_eks +else + aws eks update-kubeconfig --name "${EKS_EXISTING_CLUSTER}" --region "$REGION" + EKS_CLUSTER_NAME="${EKS_EXISTING_CLUSTER}" + DEPLOY_NAME="${EKS_EXISTING_CLUSTER}" +fi + +generate_iam_role_name +"$cwd"/generate_iam_role ${EKS_CLUSTER_NAME} ${OIDC_ROLE_NAME} ${REGION} ${KFP_NAMESPACE} ${KFP_SERVICE_ACCOUNT} +install_kfp +install_generated_role + +pytest_args=( --region "${REGION}" --role-arn "${SAGEMAKER_EXECUTION_ROLE_ARN}" --s3-data-bucket "${S3_DATA_BUCKET}" --minio-service-port "${MINIO_LOCAL_PORT}" --kfp-namespace "${KFP_NAMESPACE}" ) +[ ! 
-z "${PYTEST_MARKER}" ] && pytest_args+=( -m "${PYTEST_MARKER}" ) + +cd tests/integration_tests && python -m pytest "${pytest_args[@]}" --junitxml ./integration_tests.log -n $(nproc) \ No newline at end of file