[AWS SageMaker] Integration tests automation (#3768)
* Add initial scripts
* Add working pytest script
* Add environment variable files
* Remove old cluster script
* Update pipeline credentials to OIDC
* Remove debugging mark
* Update example EKS cluster name
* Remove quiet from Docker build
* Manually pass env
* Update env list vars as string
* Update use array directly
* Update variable array to export
* Update to using read for splitting
* Move to helper script
* Update export from CodeBuild
* Add wait for minio
* Update kubectl wait timeout
* Update minor changes for PR
* Update integration test buildspec to quiet build
* Add region to delete EKS
* Add wait for pods
* Updated README
* Add fixed interval wait
* Fix CodeBuild step order
* Add file lock for experiment ID
* Fix missing pytest parameter
* Update run create only once
* Add filelock to conda env
* Update experiment name ensuring creation each time
* Add try/catch with create experiment
* Remove caching from KFP deployment
* Remove disable KFP caching
* Move .gitignore changes to inside component
* Add blank line to default .gitignore
This commit is contained in:
parent 4a961ce268
commit f2a860b84c
@@ -0,0 +1,2 @@
# Any environment variable files
**/*/.env
@@ -1,14 +1,24 @@
version: 0.2

env:
  variables:
    CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION"

phases:
  build:
    commands:
      - cd components/aws
      - docker build . -f ./sagemaker/tests/integration_tests/Dockerfile -t amazon/integration-test-image --quiet

      - cd sagemaker/codebuild/scripts && export CONTAINER_VARIABLE_ARGUMENTS="$(./construct_environment_array.sh)"

      # Run the container and copy the results to /tmp
      # Passes all host environment variables through to the container
      - docker run --name integration-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/integration-test-image
      - docker cp integration-test-container:/app/tests/integration_tests/integration_tests.log /tmp/results.xml
      # Passes all listed host environment variables through to the container
      - docker run --name integration-test-container $(echo $CONTAINER_VARIABLE_ARGUMENTS) amazon/integration-test-image

  post_build:
    commands:
      - docker cp integration-test-container:/tests/integration_tests/integration_tests.log /tmp/results.xml
      - docker rm -f integration-test-container

reports:
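For local debugging outside CodeBuild, the same build/run/copy sequence can be reproduced with plain Docker. A minimal sketch, assuming it is run from `components/aws`; the shortened variable list is illustrative only:

```bash
# Build the integration-test image (same tag as the buildspec above)
docker build . -f ./sagemaker/tests/integration_tests/Dockerfile -t amazon/integration-test-image

# Turn the listed variable names into `--env NAME` flags and start the container
export CONTAINER_VARIABLES="REGION S3_DATA_BUCKET SAGEMAKER_EXECUTION_ROLE_ARN"
CONTAINER_VARIABLE_ARGUMENTS="$(./sagemaker/codebuild/scripts/construct_environment_array.sh)"
docker run --name integration-test-container $CONTAINER_VARIABLE_ARGUMENTS amazon/integration-test-image

# Copy the JUnit results out of the stopped container, then clean up
docker cp integration-test-container:/tests/integration_tests/integration_tests.log /tmp/results.xml
docker rm -f integration-test-container
```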
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# This script breaks up a string of environment variable names into a list of
# parameters that `docker run` accepts. This needs to be made into a script
# for CodeBuild because these commands do not run in dash - the default shell
# on the CodeBuild standard images.

IFS=' ' read -a variable_array <<< $CONTAINER_VARIABLES
printf -v CONTAINER_VARIABLE_ARGUMENTS -- "--env %s " "${variable_array[@]}"
echo $CONTAINER_VARIABLE_ARGUMENTS
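For reference, the script just emits one `--env NAME` pair per listed variable. A hypothetical invocation (the variable names are chosen for illustration):

```bash
export CONTAINER_VARIABLES="REGION S3_DATA_BUCKET"
./construct_environment_array.sh
# prints: --env REGION --env S3_DATA_BUCKET
```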
@@ -8,6 +8,9 @@ phases:
      # Run the container and copy the results to /tmp
      # Passes all host environment variables through to the container
      - docker run --name unit-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/unit-test-image

  post_build:
    commands:
      - docker cp unit-test-container:/app/tests/unit_tests/unit_tests.log /tmp/results.xml
      - docker rm -f unit-test-container
@@ -0,0 +1,12 @@
# If you would like to override the credentials for the container
# AWS_ACCESS_KEY_ID=
# AWS_SECRET_ACCESS_KEY=
# AWS_SESSION_TOKEN=

REGION=us-east-1

SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-Example
S3_DATA_BUCKET=my-data-bucket

# If you would like to use an existing EKS cluster rather than creating a new one
# EKS_EXISTING_CLUSTER=my-eks-cluster
@@ -0,0 +1,43 @@
FROM continuumio/miniconda:4.7.12

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    wget \
    git \
    jq

# Install eksctl
RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.19.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp \
    && mv /tmp/eksctl /usr/local/bin

# Install aws-iam-authenticator
RUN curl -S -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.16.8/2020-04-16/bin/linux/amd64/aws-iam-authenticator \
    && chmod +x /usr/local/bin/aws-iam-authenticator

# Install Kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.0/bin/linux/amd64/kubectl \
    && chmod +x ./kubectl \
    && mv ./kubectl /usr/local/bin/kubectl

# Install Argo CLI
RUN curl -sSL -o /usr/local/bin/argo https://github.com/argoproj/argo/releases/download/v2.8.0/argo-linux-amd64 \
    && chmod +x /usr/local/bin/argo

# Copy conda environment early to avoid cache busting
COPY ./sagemaker/tests/integration_tests/environment.yml environment.yml

# Create conda environment for running tests and set as start-up environment
RUN conda env create -f environment.yml
RUN echo "source activate kfp_test_env" > ~/.bashrc
ENV PATH "/opt/conda/envs/kfp_test_env/bin":$PATH

# Environment variables to be used by tests
ENV REGION="us-west-2"
ENV SAGEMAKER_EXECUTION_ROLE_ARN="arn:aws:iam::1234567890:role/sagemaker-role"
ENV S3_DATA_BUCKET="kfp-test-data"
ENV MINIO_LOCAL_PORT=9000
ENV KFP_NAMESPACE="kubeflow"

COPY ./sagemaker/ .

ENTRYPOINT [ "/bin/bash", "./tests/integration_tests/scripts/run_integration_tests" ]
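The ENV defaults baked into this image are placeholders, so a real run normally overrides them at `docker run` time. A minimal sketch, run from `components/aws`; the role ARN and bucket name are illustrative:

```bash
# Build the test image with the same command the README below uses
docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test

# Override the placeholder environment variables for a real run
docker run \
  -e REGION=us-west-2 \
  -e SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::111122223333:role/sagemaker-execution-role \
  -e S3_DATA_BUCKET=my-kfp-test-data \
  amazon/integration_test
```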
@@ -1,42 +1,21 @@
## Requirements
1. [Conda](https://docs.conda.io/en/latest/miniconda.html)
1. [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
1. Argo CLI: [Mac](https://github.com/argoproj/homebrew-tap), [Linux](https://eksworkshop.com/advanced/410_batch/install/)
1. K8s cluster with Kubeflow Pipelines > 0.4.0 installed
1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with SageMakerFullAccess and S3FullAccess
1. IAM user credentials with SageMakerFullAccess permissions
1. [Docker](https://www.docker.com/)
1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with SageMakerFullAccess and AmazonS3FullAccess
1. IAM user credentials with SageMakerFullAccess, AWSCloudFormationFullAccess, IAMFullAccess, AmazonEC2FullAccess, and AmazonS3FullAccess permissions

## Creating S3 buckets with datasets

Change the bucket name and run the python script `[s3_sample_data_creator.py](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset)` to create S3 buckets with mnist dataset in the region where you want to run the tests
Change the bucket name in [`s3_sample_data_creator.py`](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset) and run the script to create an S3 bucket with the sample MNIST dataset in the region where you want to run the tests.

## Steps to run integration tests
1. Configure AWS credentials with access to the EKS cluster
1. Fetch kubeconfig to `~/.kube/config` or set the `KUBECONFIG` environment variable to point to the kubeconfig of the cluster
1. Create a [secret](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/) named `aws-secret` in the kubeflow namespace with the credentials of the IAM user for SageMakerFullAccess
   ```yaml
   apiVersion: v1
   kind: Secret
   metadata:
     name: aws-secret
     namespace: kubeflow
   type: Opaque
   data:
     AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
     AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
   ```

   > Note: To get the base64 string, run `echo -n $AWS_ACCESS_KEY_ID | base64`
1. Create a conda environment for running the tests: `conda env create -f environment.yml`
1. Activate the conda environment: `conda activate kfp_test_env`
1. Run port-forward to the minio service in the background. Example: `kubectl port-forward svc/minio-service 9000:9000 -n kubeflow &`
1. Provide the following arguments to pytest:
   1. `region`: AWS region where the test will run. Default - us-west-2
   1. `role-arn`: SageMaker execution IAM role ARN
   1. `s3-data-bucket`: Regional S3 bucket in which the test data is hosted
   1. `minio-service-port`: Localhost port to which the minio service is mapped. Default - 9000
   1. `kfp-namespace`: Cluster namespace where Kubeflow Pipelines is installed. Default - kubeflow
1. cd into this directory and run
   ```
   pytest --region <> --role-arn <> --s3-data-bucket <> --minio-service-port <> --kfp-namespace <>
   ```
1. Copy the `.env.example` file to `.env` and, in the following steps, modify the fields of this new file:
   1. Configure the AWS credentials fields with those of your IAM user.
   1. Update the `SAGEMAKER_EXECUTION_ROLE_ARN` with that of your role created earlier.
   1. Update the `S3_DATA_BUCKET` parameter with the name of the bucket created earlier.
   1. (Optional) If you have already created an EKS cluster for testing, replace the `EKS_EXISTING_CLUSTER` field with its name.
1. Build the image by doing the following:
   1. Navigate to the `components/aws` directory.
   1. Run `docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test`
1. Run the image, injecting your environment variable file:
   1. Navigate to the `components/aws` directory.
   1. Run `docker run --env-file sagemaker/tests/integration_tests/.env amazon/integration_test`
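For the secret-based flow shown in the earlier steps, the same `aws-secret` can be created directly with kubectl, which handles the base64 encoding itself. A sketch, assuming the credentials are already exported in the current shell:

```bash
kubectl create secret generic aws-secret -n kubeflow \
  --from-literal=AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID}" \
  --from-literal=AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY}"
```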
@@ -5,6 +5,7 @@ import os
import utils

from datetime import datetime
from filelock import FileLock


def pytest_addoption(parser):
@@ -86,12 +87,29 @@ def kfp_client():
    kfp_installed_namespace = utils.get_kfp_namespace()
    return kfp.Client(namespace=kfp_installed_namespace)


@pytest.fixture(scope="session")
def experiment_id(kfp_client):
    exp_name = datetime.now().strftime("%Y-%m-%d")
def get_experiment_id(kfp_client):
    exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M")
    try:
        experiment = kfp_client.get_experiment(experiment_name=exp_name)
    except ValueError:
        experiment = kfp_client.create_experiment(name=exp_name)
    return experiment.id


@pytest.fixture(scope="session")
def experiment_id(kfp_client, tmp_path_factory, worker_id):
    if not worker_id:
        return get_experiment_id(kfp_client)

    # Locking taken as an example from
    # https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once
    # Get the temp directory shared by all workers
    root_tmp_dir = tmp_path_factory.getbasetemp().parent

    fn = root_tmp_dir / "experiment_id"
    with FileLock(str(fn) + ".lock"):
        if fn.is_file():
            data = fn.read_text()
        else:
            data = get_experiment_id(kfp_client)
            fn.write_text(data)
    return data
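The file lock matters because pytest-xdist runs a session-scoped fixture once per worker process, so without it each worker could create its own experiment. The tests are launched under xdist roughly as in the run script further below; a sketch, with the role ARN and bucket taken from the environment:

```bash
# One worker per CPU; all workers share a single experiment ID through the file lock
python -m pytest --region us-west-2 \
  --role-arn "${SAGEMAKER_EXECUTION_ROLE_ARN}" \
  --s3-data-bucket "${S3_DATA_BUCKET}" \
  --kfp-namespace kubeflow \
  --junitxml ./integration_tests.log -n $(nproc)
```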
@@ -12,6 +12,7 @@ dependencies:
  - pyyaml=5.3.*
  - flake8=3.7.*
  - flake8-black=0.1.*
  - filelock=3.0.*
  - pip:
      - kubernetes==11.0.*
      - kfp==0.5.*
@@ -15,6 +15,7 @@ Arguments:
  variant_name_1: variant-1
  instance_type_1: ml.m4.xlarge
  initial_instance_count_1: 1
  initial_variant_weight_1: 1.0
  network_isolation: "True"
  role: ((ROLE_ARN))
@@ -34,7 +34,7 @@ def create_endpoint_pipeline(
        model_artifact_url=model_artifact_url,
        network_isolation=network_isolation,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )

    sagemaker_deploy_op(
        region=region,
@@ -46,7 +46,7 @@ def create_endpoint_pipeline(
        instance_type_1=instance_type_1,
        initial_instance_count_1=initial_instance_count_1,
        initial_variant_weight_1=initial_variant_weight_1,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )


if __name__ == "__main__":
@@ -26,7 +26,7 @@ def create_model_pipeline(
        model_artifact_url=model_artifact_url,
        network_isolation=network_isolation,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )


if __name__ == "__main__":
@@ -56,7 +56,7 @@ def hpo_pipeline(
        network_isolation=network_isolation,
        max_wait_time=max_wait_time,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )


if __name__ == "__main__":
@@ -46,7 +46,7 @@ def training_pipeline(
        max_wait_time=max_wait_time,
        checkpoint_config=checkpoint_config,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )


if __name__ == "__main__":
@@ -40,7 +40,7 @@ def batch_transform_pipeline(
        model_artifact_url=model_artifact_url,
        network_isolation=network_isolation,
        role=role,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )

    sagemaker_batch_transform_op(
        region=region,
@@ -57,7 +57,7 @@ def batch_transform_pipeline(
        split_type=split_type,
        compression_type=compression_type,
        output_location=output_location,
    ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
    )


if __name__ == "__main__":
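These pipeline definitions drop `.apply(use_aws_secret(...))` because, per the commit message, credentials now come from the IAM role bound to the `pipeline-runner` service account through OIDC (set up by the scripts below). One way to check that binding after deployment, as a sketch:

```bash
# The role ARN should appear under Annotations as eks.amazonaws.com/role-arn
kubectl describe serviceaccount pipeline-runner -n kubeflow
```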
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

# Helper script to generate an IAM Role needed to install role-based authentication to a KFP service account.
#
# Run as:
# $ ./generate_iam_role ${cluster_arn/cluster_name} ${role_name} ${cluster_region} [optional: ${service_namespace} ${service_account}]
#

CLUSTER_ARN="${1}"
ROLE_NAME="${2}"
CLUSTER_REGION="${3:-us-east-1}"
SERVICE_NAMESPACE="${4:-kubeflow}"
SERVICE_ACCOUNT="${5:-pipeline-runner}"
aws_account=$(aws sts get-caller-identity --query Account --output text)
trustfile="trust.json"

cwd=$(dirname $(realpath $0))

# If using an existing cluster, use the cluster arn to get the region and cluster name
# Example: cluster_arn=arn:aws:eks:us-east-1:12345678910:cluster/test
cluster_name=$(echo ${CLUSTER_ARN} | cut -d'/' -f2)

# A function to get the OIDC_ID associated with an EKS cluster
function get_oidc_id {
    # TODO: Ideally this should be based on version compatibility instead of command failure
    eksctl utils associate-iam-oidc-provider --cluster ${cluster_name} --region ${CLUSTER_REGION} --approve
    if [[ $? -ge 1 ]]; then
        eksctl utils associate-iam-oidc-provider --name ${cluster_name} --region ${CLUSTER_REGION} --approve
    fi

    local oidc=$(aws eks describe-cluster --name ${cluster_name} --region ${CLUSTER_REGION} --query cluster.identity.oidc.issuer --output text)
    oidc_id=$(echo ${oidc} | rev | cut -d'/' -f1 | rev)
}

# A function that generates an IAM role for the given account, cluster, namespace, region
# Parameter:
#    $1: Name of the trust file to generate.
function create_namespaced_iam_role {
    local trustfile="${1}"
    # Check if the role already exists
    aws iam get-role --role-name ${ROLE_NAME}
    if [[ $? -eq 0 ]]; then
        echo "A role for this cluster and namespace already exists in this account, assuming SageMaker access and proceeding."
    else
        echo "IAM Role does not exist, creating a new Role for the cluster"
        aws iam create-role --role-name ${ROLE_NAME} --assume-role-policy-document file://${trustfile} --output=text --query "Role.Arn"
        aws iam attach-role-policy --role-name ${ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
    fi
}

# Remove the generated trust file
# Parameter:
#    $1: Name of the trust file to delete.
function delete_generated_file {
    rm "${1}"
}

echo "Get the OIDC ID for the cluster"
get_oidc_id
echo "Delete the trust json file if it already exists"
delete_generated_file "${trustfile}"
echo "Generate a trust json"
"$cwd"/generate_trust_policy ${CLUSTER_REGION} ${aws_account} ${oidc_id} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > "${trustfile}"
echo "Create the IAM Role using these values"
create_namespaced_iam_role "${trustfile}"
echo "Cleanup for the next run"
delete_generated_file "${trustfile}"
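A hypothetical invocation against an existing cluster, following the usage comment above (cluster name, role name, and region are illustrative):

```bash
./generate_iam_role my-test-cluster my-test-cluster-kubeflow-role us-west-2 kubeflow pipeline-runner
```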
@@ -0,0 +1,39 @@
#!/usr/bin/env bash

# Helper script to generate the trust policy needed to assign role-based authentication to a KFP service account.
#
# Run as:
# $ ./generate_trust_policy ${EKS_CLUSTER_REGION} ${AWS_ACCOUNT_ID} ${OIDC_ID} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > trust.json
#
# For example:
# $ ./generate_trust_policy us-west-2 123456789012 D48675832CA65BD10A532F597OIDCID > trust.json
# This will create a file `trust.json` containing a role policy that enables the KFP service runner in an EKS cluster to assume AWS roles.
#
# The SERVICE_NAMESPACE parameter is for when you want to run Kubeflow in a custom namespace other than "kubeflow".
# The SERVICE_ACCOUNT parameter is for when you want to give permissions to a service account other than the default "pipeline-runner".

cluster_region="$1"
account_number="$2"
oidc_id="$3"
service_namespace="${4}"
service_account="${5}"

printf '{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Federated": "arn:aws:iam::'"${account_number}"':oidc-provider/oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"'"
            },
            "Action": "sts:AssumeRoleWithWebIdentity",
            "Condition": {
                "StringEquals": {
                    "oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"':aud": "sts.amazonaws.com",
                    "oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"':sub": "system:serviceaccount:'"${service_namespace}"':'"${service_account}"'"
                }
            }
        }
    ]
}
'
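A quick way to exercise the script is to generate a policy and confirm it is well-formed JSON; the OIDC ID and account number below are placeholders:

```bash
./generate_trust_policy us-west-2 123456789012 EXAMPLED539D4633E53DE1B716D3041E kubeflow pipeline-runner > trust.json
# Sanity-check that the generated document parses as JSON
python -m json.tool trust.json > /dev/null && echo "trust.json OK"
```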
@@ -0,0 +1,168 @@
#!/usr/bin/env bash

set -u
set -o pipefail

usage(){
    echo "Usage: $0 -n <deployment name> [-r <region>]"
    exit 1
}

cwd=$(dirname $(realpath $0))

### Input parameters
DEPLOY_NAME="sagemaker-kfp-"$(date '+%Y-%m-%d-%H-%M-%S')""   # The name given to the entire deployment (tagging all resources)
REGION=${REGION:-"$(aws configure get region)"}              # Deployment region

### Configuration parameters
EKS_EXISTING_CLUSTER=${EKS_EXISTING_CLUSTER:-""}             # Use an existing EKS cluster
EKS_CLUSTER_VERSION=${EKS_CLUSTER_VERSION:-"1.15"}           # EKS cluster K8s version
EKS_NODE_COUNT=${EKS_NODE_COUNT:-"1"}                        # The initial node count of the EKS cluster
EKS_PUBLIC_SUBNETS=${EKS_PUBLIC_SUBNETS:-""}
EKS_PRIVATE_SUBNETS=${EKS_PRIVATE_SUBNETS:-""}

### Testing parameters
MINIO_LOCAL_PORT=${MINIO_LOCAL_PORT:-9000}
KFP_NAMESPACE=${KFP_NAMESPACE:-"kubeflow"}
KFP_SERVICE_ACCOUNT=${KFP_SERVICE_ACCOUNT:-"pipeline-runner"}

PYTEST_MARKER=${PYTEST_MARKER:-""}
S3_DATA_BUCKET=${S3_DATA_BUCKET:-""}
SAGEMAKER_EXECUTION_ROLE_ARN=${SAGEMAKER_EXECUTION_ROLE_ARN:-""}

while getopts ":n:r:s:" opt; do
  case $opt in
    n)
      DEPLOY_NAME="$OPTARG"
      ;;
    s)
      S3_DATA_BUCKET="$OPTARG"
      ;;
    r)
      REGION="$OPTARG"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done

# Ensure a deployment name was specified
if [ "$DEPLOY_NAME" == "" ]; then
  echo "Missing deployment name"
  usage
  exit 1
fi

if [ "$S3_DATA_BUCKET" == "" ]; then
  echo "Missing S3 data bucket name"
  usage
  exit 1
fi

function cleanup() {
  set +e

  cleanup_kfp
  delete_generated_role

  if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then
    delete_eks
  fi
}

# Set the trap to clean up resources in the case of an error
trap cleanup EXIT
set -e

function launch_eks() {
  EKS_CLUSTER_NAME="${DEPLOY_NAME}-eks-cluster"

  echo "[Creating EKS] Launching EKS cluster $EKS_CLUSTER_NAME"

  eksctl_args=( --managed --nodes "${EKS_NODE_COUNT}" --node-type=c5.xlarge --timeout=30m --region "${REGION}" --auto-kubeconfig --version "${EKS_CLUSTER_VERSION}" )
  [ ! -z "${EKS_PUBLIC_SUBNETS}" ] && eksctl_args+=( --vpc-public-subnets="${EKS_PUBLIC_SUBNETS}" )
  [ ! -z "${EKS_PRIVATE_SUBNETS}" ] && eksctl_args+=( --vpc-private-subnets="${EKS_PRIVATE_SUBNETS}" )

  eksctl create cluster "${EKS_CLUSTER_NAME}" "${eksctl_args[@]}"

  aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$REGION"

  echo "[Creating EKS] $EKS_CLUSTER_NAME launched"
}

function delete_eks() {
  eksctl delete cluster --name "${EKS_CLUSTER_NAME}" --region "${REGION}"
}

function install_kfp() {
  echo "[Installing KFP] Applying KFP manifests"

  PIPELINE_VERSION=0.5.1
  kubectl apply -k github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION
  kubectl wait --for condition=established --timeout=60s crd/applications.app.k8s.io
  kubectl apply -k github.com/kubeflow/pipelines/manifests/kustomize/env/dev?ref=$PIPELINE_VERSION

  echo "[Installing KFP] Port-forwarding Minio"

  kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=minio --timeout=5m
  kubectl port-forward -n kubeflow svc/minio-service $MINIO_LOCAL_PORT:9000 &
  MINIO_PID=$!

  echo "[Installing KFP] Minio port-forwarded to ${MINIO_LOCAL_PORT}"

  echo "[Installing KFP] Waiting for pods to stand up"

  kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=ml-pipeline --timeout=5m

  # TODO: Replace with calculated waits
  # For the moment we don't know which pods will be slower, so we are just relying on a fixed interval
  sleep 3m

  echo "[Installing KFP] Pipeline pods are ready"
}

function generate_iam_role_name() {
  OIDC_ROLE_NAME="$(echo "${DEPLOY_NAME}-kubeflow-role" | cut -c1-64)"
  OIDC_ROLE_ARN="arn:aws:iam::$(aws sts get-caller-identity --query=Account --output=text):role/${OIDC_ROLE_NAME}"
}

function install_generated_role() {
  kubectl patch serviceaccount -n ${KFP_NAMESPACE} ${KFP_SERVICE_ACCOUNT} --patch '{"metadata": {"annotations": {"eks.amazonaws.com/role-arn": "'"${OIDC_ROLE_ARN}"'"}}}'
}

function delete_generated_role() {
  # Delete the role associated with the cluster that's being deleted
  aws iam detach-role-policy --role-name "${OIDC_ROLE_NAME}" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
  aws iam delete-role --role-name "${OIDC_ROLE_NAME}"
}

function cleanup_kfp() {
  # Clean up Minio
  if [ ! -z "${MINIO_PID}" ]; then
    kill -9 $MINIO_PID || true
  fi
}

if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then
  launch_eks
else
  aws eks update-kubeconfig --name "${EKS_EXISTING_CLUSTER}" --region "$REGION"
  EKS_CLUSTER_NAME="${EKS_EXISTING_CLUSTER}"
  DEPLOY_NAME="${EKS_EXISTING_CLUSTER}"
fi

generate_iam_role_name
"$cwd"/generate_iam_role ${EKS_CLUSTER_NAME} ${OIDC_ROLE_NAME} ${REGION} ${KFP_NAMESPACE} ${KFP_SERVICE_ACCOUNT}
install_kfp
install_generated_role

pytest_args=( --region "${REGION}" --role-arn "${SAGEMAKER_EXECUTION_ROLE_ARN}" --s3-data-bucket "${S3_DATA_BUCKET}" --minio-service-port "${MINIO_LOCAL_PORT}" --kfp-namespace "${KFP_NAMESPACE}" )
[ ! -z "${PYTEST_MARKER}" ] && pytest_args+=( -m "${PYTEST_MARKER}" )

cd tests/integration_tests && python -m pytest "${pytest_args[@]}" --junitxml ./integration_tests.log -n $(nproc)
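This script is normally launched by the image ENTRYPOINT with its inputs supplied as environment variables. Invoking it by hand from the copied sagemaker component root looks roughly like this; the bucket, role ARN, and cluster name are placeholders:

```bash
export S3_DATA_BUCKET=my-data-bucket
export SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::111122223333:role/sagemaker-execution-role
export EKS_EXISTING_CLUSTER=my-eks-cluster   # omit to have the script create a fresh cluster
./tests/integration_tests/scripts/run_integration_tests -r us-west-2
```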