[AWS SageMaker] Print SageMaker job logs in kfp UI (#3954)

* Print logs for AWS SM Components on KFP UI

* address comments

* update version number to 0.5.0

* update yaml to version 0.5.0

* update changelog
This commit is contained in:
Kartik Kalamadi 2020-06-19 00:33:58 -07:00 committed by GitHub
parent 6698fe72d1
commit b3d8e04e1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 66 additions and 12 deletions

View File

@ -4,6 +4,11 @@ The version of the AWS SageMaker Components is determined by the docker image ta
Repository: https://hub.docker.com/repository/docker/amazon/aws-sagemaker-kfp-components
---------------------------------------------
**Change log for version 0.5.0**
- Print SageMaker logs in KFP UI for Train, Transform and Process components
> Pull requests : [#3954](https://github.com/kubeflow/pipelines/pull/3954)
**Change log for version 0.4.1**
- Fix breaking bug in HPO component

View File

@ -1,4 +1,4 @@
** Amazon SageMaker Components for Kubeflow Pipelines; version 0.4.1 --
** Amazon SageMaker Components for Kubeflow Pipelines; version 0.5.0 --
https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker
Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
** boto3; version 1.12.33 -- https://github.com/boto/boto3/

View File

@ -98,7 +98,7 @@ outputs:
- {name: output_location, description: 'S3 URI of the transform job results.'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
batch_transform.py,

View File

@ -62,7 +62,14 @@ def main(argv=None):
logging.info('Submitting Batch Transformation request to SageMaker...')
batch_job_name = _utils.create_transform_job(client, vars(args))
logging.info('Batch Job request submitted. Waiting for completion...')
_utils.wait_for_transform_job(client, batch_job_name)
try:
_utils.wait_for_transform_job(client, batch_job_name)
except:
raise
finally:
cw_client = _utils.get_cloudwatch_client(args.region)
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/TransformJobs', batch_job_name)
Path(args.output_location_file).parent.mkdir(parents=True, exist_ok=True)
with open(args.output_location_file, 'w') as f:

View File

@ -83,6 +83,30 @@ def get_component_version():
return component_version
def print_logs_for_job(cw_client, log_grp, job_name):
    """Log every CloudWatch log event recorded for a SageMaker job.

    Looks up all log streams in ``log_grp`` whose name starts with
    ``job_name + '/'`` and writes each event message through
    ``logging.info``, framed by start/end banner lines.

    Both CloudWatch calls are paginated so that jobs with many streams or
    long logs are printed in full rather than cut off after the first page.

    This is best-effort: any exception raised while fetching or printing
    is logged with ``logging.error`` and swallowed, so a logging failure
    never masks the outcome of the job itself.

    Args:
        cw_client: Client object exposing ``describe_log_streams`` and
            ``get_log_events`` (e.g. a boto3 ``logs`` client).
        log_grp: Name of the CloudWatch log group to read from.
        job_name: Job name; used as the log stream name prefix.
    """
    try:
        logging.info('\n******************** CloudWatch logs for {} {} ********************\n'.format(log_grp, job_name))

        # Collect every matching stream, following nextToken across pages.
        stream_kwargs = {
            'logGroupName': log_grp,
            'logStreamNamePrefix': job_name + '/',
        }
        log_streams = []
        while True:
            page = cw_client.describe_log_streams(**stream_kwargs)
            log_streams.extend(page['logStreams'])
            next_token = page.get('nextToken')
            if not next_token:
                break
            stream_kwargs['nextToken'] = next_token

        for log_stream in log_streams:
            logging.info('\n***** {} *****\n'.format(log_stream['logStreamName']))

            # Read from the oldest event forward so output is chronological
            # and complete.
            event_kwargs = {
                'logGroupName': log_grp,
                'logStreamName': log_stream['logStreamName'],
                'startFromHead': True,
            }
            while True:
                response = cw_client.get_log_events(**event_kwargs)
                for event in response['events']:
                    logging.info(event['message'])

                # The end of the stream is signalled by getting back the
                # same forward token that was sent; an empty page alone is
                # not a reliable end marker.
                forward_token = response.get('nextForwardToken')
                if not forward_token or forward_token == event_kwargs.get('nextToken'):
                    break
                event_kwargs['nextToken'] = forward_token

        logging.info('\n******************** End of CloudWatch logs for {} {} ********************\n'.format(log_grp, job_name))
    except Exception as e:
        # Deliberately swallow: printing logs must not fail the component.
        logging.error(e)
def get_sagemaker_client(region, endpoint_url=None):
"""Builds a client to the AWS SageMaker API."""
session_config = botocore.config.Config(
@ -92,6 +116,11 @@ def get_sagemaker_client(region, endpoint_url=None):
return client
def get_cloudwatch_client(region):
    """Builds a boto3 'logs' client bound to the given region."""
    return boto3.client('logs', region_name=region)
def create_training_job_request(args):
### Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_training_job
with open(os.path.join(__cwd__, 'train.template.yaml'), 'r') as f:

View File

@ -104,7 +104,7 @@ outputs:
- {name: endpoint_name, description: 'Endpoint name'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
deploy.py,

View File

@ -119,7 +119,7 @@ outputs:
- {name: active_learning_model_arn, description: 'The ARN for the most recent Amazon SageMaker model trained as part of automated data labeling.'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
ground_truth.py,

View File

@ -150,7 +150,7 @@ outputs:
description: 'The registry path of the Docker image that contains the training algorithm'
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
hyperparameter_tuning.py,

View File

@ -59,7 +59,7 @@ outputs:
- {name: model_name, description: 'The model name Sagemaker created'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
create_model.py,

View File

@ -89,7 +89,7 @@ outputs:
- {name: output_artifacts, description: 'A dictionary containing the output S3 artifacts'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
process.py,

View File

@ -53,7 +53,14 @@ def main(argv=None):
logging.info('Submitting Processing Job to SageMaker...')
job_name = _utils.create_processing_job(client, vars(args))
logging.info('Job request submitted. Waiting for completion...')
_utils.wait_for_processing_job(client, job_name)
try:
_utils.wait_for_processing_job(client, job_name)
except:
raise
finally:
cw_client = _utils.get_cloudwatch_client(args.region)
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/ProcessingJobs', job_name)
outputs = _utils.get_processing_job_outputs(client, job_name)

View File

@ -104,7 +104,7 @@ outputs:
- {name: training_image, description: 'The registry path of the Docker image that contains the training algorithm'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
train.py,

View File

@ -60,7 +60,13 @@ def main(argv=None):
logging.info('Submitting Training Job to SageMaker...')
job_name = _utils.create_training_job(client, vars(args))
logging.info('Job request submitted. Waiting for completion...')
_utils.wait_for_training_job(client, job_name)
try:
_utils.wait_for_training_job(client, job_name)
except:
raise
finally:
cw_client = _utils.get_cloudwatch_client(args.region)
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/TrainingJobs', job_name)
image = _utils.get_image_from_job(client, job_name)
model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)

View File

@ -36,7 +36,7 @@ outputs:
- {name: workteam_arn, description: 'The ARN of the workteam.'}
implementation:
container:
image: amazon/aws-sagemaker-kfp-components:0.4.1
image: amazon/aws-sagemaker-kfp-components:0.5.0
command: ['python3']
args: [
workteam.py,