[AWS SageMaker] Print SageMaker job logs in kfp UI (#3954)
* Print logs for AWS SM Components on KFP UI * address comments * update version number to 0.5.0 * update yaml to version 0.5.0 * update changelog
This commit is contained in:
parent
6698fe72d1
commit
b3d8e04e1e
|
|
@ -4,6 +4,11 @@ The version of the AWS SageMaker Components is determined by the docker image ta
|
|||
Repository: https://hub.docker.com/repository/docker/amazon/aws-sagemaker-kfp-components
|
||||
|
||||
---------------------------------------------
|
||||
**Change log for version 0.5.0**
|
||||
- Print SageMaker logs in KFP UI for Train, Transform and Process components
|
||||
|
||||
> Pull requests : [#3954](https://github.com/kubeflow/pipelines/pull/3954)
|
||||
|
||||
**Change log for version 0.4.1**
|
||||
- Fix breaking bug in HPO component
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
** Amazon SageMaker Components for Kubeflow Pipelines; version 0.4.1 --
|
||||
** Amazon SageMaker Components for Kubeflow Pipelines; version 0.5.0 --
|
||||
https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker
|
||||
Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
** boto3; version 1.12.33 -- https://github.com/boto/boto3/
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ outputs:
|
|||
- {name: output_location, description: 'S3 URI of the transform job results.'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
batch_transform.py,
|
||||
|
|
|
|||
|
|
@ -62,7 +62,14 @@ def main(argv=None):
|
|||
logging.info('Submitting Batch Transformation request to SageMaker...')
|
||||
batch_job_name = _utils.create_transform_job(client, vars(args))
|
||||
logging.info('Batch Job request submitted. Waiting for completion...')
|
||||
_utils.wait_for_transform_job(client, batch_job_name)
|
||||
|
||||
try:
|
||||
_utils.wait_for_transform_job(client, batch_job_name)
|
||||
except:
|
||||
raise
|
||||
finally:
|
||||
cw_client = _utils.get_cloudwatch_client(args.region)
|
||||
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/TransformJobs', batch_job_name)
|
||||
|
||||
Path(args.output_location_file).parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(args.output_location_file, 'w') as f:
|
||||
|
|
|
|||
|
|
@ -83,6 +83,30 @@ def get_component_version():
|
|||
return component_version
|
||||
|
||||
|
||||
def print_logs_for_job(cw_client, log_grp, job_name):
    """Print the CloudWatch logs of a SageMaker job through ``logging.info``.

    Every log stream in ``log_grp`` whose name starts with ``job_name + '/'``
    is listed, and all of its events are emitted oldest-first between a
    start banner and an end banner.

    Both CloudWatch Logs calls are paginated: the ``nextToken`` of
    ``describe_log_streams`` and the ``nextForwardToken`` of
    ``get_log_events`` are followed until exhausted, so jobs with many
    streams or long logs are not truncated to a single response page.

    This is best-effort: any exception raised while fetching logs is logged
    with ``logging.error`` and swallowed, so a log-retrieval problem never
    changes the outcome of the calling component.

    Args:
        cw_client: CloudWatch Logs client exposing ``describe_log_streams``
            and ``get_log_events`` (e.g. ``boto3.client('logs')``).
        log_grp: Name of the log group, e.g. '/aws/sagemaker/TrainingJobs'.
        job_name: Name of the SageMaker job whose streams should be printed.

    Returns:
        None.
    """
    try:
        logging.info('\n******************** CloudWatch logs for {} {} ********************\n'.format(log_grp, job_name))

        # Collect every matching stream name, following nextToken across pages.
        stream_names = []
        stream_request = {
            'logGroupName': log_grp,
            'logStreamNamePrefix': job_name + '/',
        }
        while True:
            page = cw_client.describe_log_streams(**stream_request)
            stream_names.extend(
                stream['logStreamName'] for stream in page['logStreams']
            )
            token = page.get('nextToken')
            # A repeated token would loop forever; treat it as end of listing.
            if not token or token == stream_request.get('nextToken'):
                break
            stream_request['nextToken'] = token

        for stream_name in stream_names:
            logging.info('\n***** {} *****\n'.format(stream_name))

            # startFromHead=True reads from the oldest event forward.
            event_request = {
                'logGroupName': log_grp,
                'logStreamName': stream_name,
                'startFromHead': True,
            }
            while True:
                response = cw_client.get_log_events(**event_request)
                for event in response['events']:
                    logging.info(event['message'])
                token = response.get('nextForwardToken')
                # The service hands back the same token once the end of the
                # stream is reached; an empty page alone is not a reliable
                # end-of-stream signal.
                if not token or token == event_request.get('nextToken'):
                    break
                event_request['nextToken'] = token

        logging.info('\n******************** End of CloudWatch logs for {} {} ********************\n'.format(log_grp, job_name))
    except Exception as e:
        # Deliberate best-effort: report the failure but never propagate it.
        logging.error(e)
|
||||
|
||||
|
||||
def get_sagemaker_client(region, endpoint_url=None):
|
||||
"""Builds a client to the AWS SageMaker API."""
|
||||
session_config = botocore.config.Config(
|
||||
|
|
@ -92,6 +116,11 @@ def get_sagemaker_client(region, endpoint_url=None):
|
|||
return client
|
||||
|
||||
|
||||
def get_cloudwatch_client(region):
    """Builds a client to the AWS CloudWatch Logs API for the given region."""
    return boto3.client('logs', region_name=region)
|
||||
|
||||
|
||||
def create_training_job_request(args):
|
||||
### Documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_training_job
|
||||
with open(os.path.join(__cwd__, 'train.template.yaml'), 'r') as f:
|
||||
|
|
|
|||
|
|
@ -104,7 +104,7 @@ outputs:
|
|||
- {name: endpoint_name, description: 'Endpoint name'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
deploy.py,
|
||||
|
|
|
|||
|
|
@ -119,7 +119,7 @@ outputs:
|
|||
- {name: active_learning_model_arn, description: 'The ARN for the most recent Amazon SageMaker model trained as part of automated data labeling.'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
ground_truth.py,
|
||||
|
|
|
|||
|
|
@ -150,7 +150,7 @@ outputs:
|
|||
description: 'The registry path of the Docker image that contains the training algorithm'
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
hyperparameter_tuning.py,
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ outputs:
|
|||
- {name: model_name, description: 'The model name Sagemaker created'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
create_model.py,
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ outputs:
|
|||
- {name: output_artifacts, description: 'A dictionary containing the output S3 artifacts'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
process.py,
|
||||
|
|
|
|||
|
|
@ -53,7 +53,14 @@ def main(argv=None):
|
|||
logging.info('Submitting Processing Job to SageMaker...')
|
||||
job_name = _utils.create_processing_job(client, vars(args))
|
||||
logging.info('Job request submitted. Waiting for completion...')
|
||||
_utils.wait_for_processing_job(client, job_name)
|
||||
|
||||
try:
|
||||
_utils.wait_for_processing_job(client, job_name)
|
||||
except:
|
||||
raise
|
||||
finally:
|
||||
cw_client = _utils.get_cloudwatch_client(args.region)
|
||||
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/ProcessingJobs', job_name)
|
||||
|
||||
outputs = _utils.get_processing_job_outputs(client, job_name)
|
||||
|
||||
|
|
|
|||
|
|
@ -104,7 +104,7 @@ outputs:
|
|||
- {name: training_image, description: 'The registry path of the Docker image that contains the training algorithm'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
train.py,
|
||||
|
|
|
|||
|
|
@ -60,7 +60,13 @@ def main(argv=None):
|
|||
logging.info('Submitting Training Job to SageMaker...')
|
||||
job_name = _utils.create_training_job(client, vars(args))
|
||||
logging.info('Job request submitted. Waiting for completion...')
|
||||
_utils.wait_for_training_job(client, job_name)
|
||||
try:
|
||||
_utils.wait_for_training_job(client, job_name)
|
||||
except:
|
||||
raise
|
||||
finally:
|
||||
cw_client = _utils.get_cloudwatch_client(args.region)
|
||||
_utils.print_logs_for_job(cw_client, '/aws/sagemaker/TrainingJobs', job_name)
|
||||
|
||||
image = _utils.get_image_from_job(client, job_name)
|
||||
model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ outputs:
|
|||
- {name: workteam_arn, description: 'The ARN of the workteam.'}
|
||||
implementation:
|
||||
container:
|
||||
image: amazon/aws-sagemaker-kfp-components:0.4.1
|
||||
image: amazon/aws-sagemaker-kfp-components:0.5.0
|
||||
command: ['python3']
|
||||
args: [
|
||||
workteam.py,
|
||||
|
|
|
|||
Loading…
Reference in New Issue