feat(components): Copy the GCP components to contrib folder (#6421)
* move a few components to contrib
parent d864db16f7
commit c783705c0e
@@ -0,0 +1,122 @@
from typing import NamedTuple

from kfp.components import create_component_from_func


def add_measurement_for_trial_in_gcp_ai_platform_optimizer(
    trial_name: str,
    metric_value: float,
    complete_trial: bool = True,
    step_count: float = None,
    gcp_project_id: str = None,
    gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
    ("trial_name", list),
    ("trial", dict),
    ("stop_trial", bool),
]):
    """Add measurement for a trial and check whether to continue.
    See https://cloud.google.com/ai-platform/optimizer/docs

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>

    Args:
        trial_name: Full trial resource name.
        metric_value: Result of the trial evaluation.
        step_count: Optional. The number of training steps performed with the model. Can be used when checking early stopping.
        complete_trial: Whether the trial should be completed. Only completed trials are used to suggest new trials. Default is True.
    """

    import logging
    import time

    import google.auth
    from googleapiclient import discovery

    logging.getLogger().setLevel(logging.INFO)

    client_id = 'client1'
    metric_name = 'metric'

    credentials, default_project_id = google.auth.default()

    # Validating and inferring the arguments
    if not gcp_project_id:
        gcp_project_id = default_project_id

    # Building the API client.
    # The main API does not work, so we need to build from the published discovery document.
    def create_caip_optimizer_client(project_id):
        from google.cloud import storage
        _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
        _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
        client = storage.Client(project_id)
        bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
        blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
        discovery_document = blob.download_as_bytes()
        return discovery.build_from_document(service=discovery_document)

    # Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
    def get_project_number(project_id):
        service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
        response = service.projects().get(projectId=project_id).execute()
        return response['projectNumber']

    gcp_project_number = get_project_number(gcp_project_id)

    def fix_resource_name(name):
        return name.replace(gcp_project_number, gcp_project_id)

    ml_api = create_caip_optimizer_client(gcp_project_id)
    trials_api = ml_api.projects().locations().studies().trials()
    operations_api = ml_api.projects().locations().operations()

    measurement = {
        'measurement': {
            'stepCount': step_count,
            'metrics': [{
                'metric': metric_name,
                'value': metric_value,
            }],
        },
    }
    add_measurement_response = trials_api.addMeasurement(
        name=fix_resource_name(trial_name),
        body=measurement,
    ).execute()

    if complete_trial:
        should_stop_trial = True
        complete_response = trials_api.complete(
            name=fix_resource_name(trial_name),
        ).execute()
        return (trial_name, complete_response, should_stop_trial)
    else:
        check_early_stopping_response = trials_api.checkEarlyStoppingState(
            name=fix_resource_name(trial_name),
        ).execute()
        operation_name = check_early_stopping_response['name']
        while True:
            get_operation_response = operations_api.get(
                name=fix_resource_name(operation_name),
            ).execute()
            if get_operation_response.get('done'):
                break
            logging.info('Not finished yet: ' + str(get_operation_response))
            time.sleep(10)
        operation_response = get_operation_response['response']
        should_stop_trial = operation_response['shouldStop']
        return (trial_name, add_measurement_response, should_stop_trial)


if __name__ == '__main__':
    add_measurement_for_trial_in_gcp_ai_platform_optimizer_op = create_component_from_func(
        add_measurement_for_trial_in_gcp_ai_platform_optimizer,
        base_image='python:3.8',
        packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
        output_component_file='component.yaml',
        annotations={
            "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Add_measurement_for_trial/component.yaml",
        },
    )
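Usage sketch (not part of the committed files): the __main__ block above writes component.yaml next to the script; a pipeline could then load that file and report a metric back to an existing Optimizer trial. The pipeline function, file path and argument values below are illustrative assumptions.

import kfp
from kfp import components

# Assumption: component.yaml was generated by running the script above.
add_measurement_op = components.load_component_from_file('component.yaml')

def report_trial_metric_pipeline(trial_name: str, metric_value: float):
    # Completes the trial so it can be used for future suggestions.
    add_measurement_op(
        trial_name=trial_name,
        metric_value=metric_value,
        complete_trial=True,
    )

if __name__ == '__main__':
    kfp.Client().create_run_from_pipeline_func(
        report_trial_metric_pipeline,
        arguments=dict(
            trial_name='projects/<project-id>/locations/us-central1/studies/<study-id>/trials/1',
            metric_value=0.95,
        ),
    )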
@@ -0,0 +1,220 @@
name: Add measurement for trial in gcp ai platform optimizer
description: Add measurement for a trial and check whether to continue.
inputs:
- {name: trial_name, type: String, description: Full trial resource name.}
- {name: metric_value, type: Float, description: Result of the trial evaluation.}
- name: complete_trial
  type: Boolean
  description: Whether the trial should be completed. Only completed trials are used
    to suggest new trials. Default is True.
  default: "True"
  optional: true
- {name: step_count, type: Float, description: Optional. The number of training steps
    performed with the model. Can be used when checking early stopping., optional: true}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: trial_name, type: JsonArray}
- {name: trial, type: JsonObject}
- {name: stop_trial, type: Boolean}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Add_measurement_for_trial/component.yaml'
implementation:
  container:
    image: python:3.8
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def add_measurement_for_trial_in_gcp_ai_platform_optimizer(
          trial_name,
          metric_value,
          complete_trial = True,
          step_count = None,
          gcp_project_id = None,
          gcp_region = "us-central1",
      ):
          """Add measurement for a trial and check whether to continue.
          See https://cloud.google.com/ai-platform/optimizer/docs

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>

          Args:
              trial_name: Full trial resource name.
              metric_value: Result of the trial evaluation.
              step_count: Optional. The number of training steps performed with the model. Can be used when checking early stopping.
              complete_trial: Whether the trial should be completed. Only completed trials are used to suggest new trials. Default is True.
          """

          import logging
          import time

          import google.auth
          from googleapiclient import discovery

          logging.getLogger().setLevel(logging.INFO)

          client_id = 'client1'
          metric_name = 'metric'

          credentials, default_project_id = google.auth.default()

          # Validating and inferring the arguments
          if not gcp_project_id:
              gcp_project_id = default_project_id

          # Building the API client.
          # The main API does not work, so we need to build from the published discovery document.
          def create_caip_optimizer_client(project_id):
              from google.cloud import storage
              _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
              _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
              client = storage.Client(project_id)
              bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
              blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
              discovery_document = blob.download_as_bytes()
              return discovery.build_from_document(service=discovery_document)

          # Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
          def get_project_number(project_id):
              service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
              response = service.projects().get(projectId=project_id).execute()
              return response['projectNumber']

          gcp_project_number = get_project_number(gcp_project_id)

          def fix_resource_name(name):
              return name.replace(gcp_project_number, gcp_project_id)

          ml_api = create_caip_optimizer_client(gcp_project_id)
          trials_api = ml_api.projects().locations().studies().trials()
          operations_api = ml_api.projects().locations().operations()

          measurement = {
              'measurement': {
                  'stepCount': step_count,
                  'metrics': [{
                      'metric': metric_name,
                      'value': metric_value,
                  }],
              },
          }
          add_measurement_response = trials_api.addMeasurement(
              name=fix_resource_name(trial_name),
              body=measurement,
          ).execute()

          if complete_trial:
              should_stop_trial = True
              complete_response = trials_api.complete(
                  name=fix_resource_name(trial_name),
              ).execute()
              return (trial_name, complete_response, should_stop_trial)
          else:
              check_early_stopping_response = trials_api.checkEarlyStoppingState(
                  name=fix_resource_name(trial_name),
              ).execute()
              operation_name = check_early_stopping_response['name']
              while True:
                  get_operation_response = operations_api.get(
                      name=fix_resource_name(operation_name),
                  ).execute()
                  if get_operation_response.get('done'):
                      break
                  logging.info('Not finished yet: ' + str(get_operation_response))
                  time.sleep(10)
              operation_response = get_operation_response['response']
              should_stop_trial = operation_response['shouldStop']
              return (trial_name, add_measurement_response, should_stop_trial)

      def _serialize_bool(bool_value: bool) -> str:
          if isinstance(bool_value, str):
              return bool_value
          if not isinstance(bool_value, bool):
              raise TypeError('Value "{}" has type "{}" instead of bool.'.format(str(bool_value), str(type(bool_value))))
          return str(bool_value)

      def _serialize_json(obj) -> str:
          if isinstance(obj, str):
              return obj
          import json
          def default_serializer(obj):
              if hasattr(obj, 'to_struct'):
                  return obj.to_struct()
              else:
                  raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
          return json.dumps(obj, default=default_serializer, sort_keys=True)

      def _deserialize_bool(s) -> bool:
          from distutils.util import strtobool
          return strtobool(s) == 1

      import argparse
      _parser = argparse.ArgumentParser(prog='Add measurement for trial in gcp ai platform optimizer', description='Add measurement for a trial and check whether to continue.')
      _parser.add_argument("--trial-name", dest="trial_name", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--metric-value", dest="metric_value", type=float, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--complete-trial", dest="complete_trial", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--step-count", dest="step_count", type=float, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
      _parsed_args = vars(_parser.parse_args())
      _output_files = _parsed_args.pop("_output_paths", [])

      _outputs = add_measurement_for_trial_in_gcp_ai_platform_optimizer(**_parsed_args)

      _output_serializers = [
          _serialize_json,
          _serialize_json,
          _serialize_bool,

      ]

      import os
      for idx, output_file in enumerate(_output_files):
          try:
              os.makedirs(os.path.dirname(output_file))
          except OSError:
              pass
          with open(output_file, 'w') as f:
              f.write(_output_serializers[idx](_outputs[idx]))
    args:
    - --trial-name
    - {inputValue: trial_name}
    - --metric-value
    - {inputValue: metric_value}
    - if:
        cond: {isPresent: complete_trial}
        then:
        - --complete-trial
        - {inputValue: complete_trial}
    - if:
        cond: {isPresent: step_count}
        then:
        - --step-count
        - {inputValue: step_count}
    - if:
        cond: {isPresent: gcp_project_id}
        then:
        - --gcp-project-id
        - {inputValue: gcp_project_id}
    - if:
        cond: {isPresent: gcp_region}
        then:
        - --gcp-region
        - {inputValue: gcp_region}
    - '----output-paths'
    - {outputPath: trial_name}
    - {outputPath: trial}
    - {outputPath: stop_trial}
@@ -0,0 +1,84 @@
from typing import NamedTuple

from kfp.components import create_component_from_func


def create_study_in_gcp_ai_platform_optimizer(
    study_id: str,
    parameter_specs: list,
    optimization_goal: str = 'MAXIMIZE',
    metric_specs: list = None,
    gcp_project_id: str = None,
    gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
    ("study_name", str),
]):
    """Creates a Google Cloud AI Platform Optimizer study.
    See https://cloud.google.com/ai-platform/optimizer/docs

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>

    Args:
        study_id: Name of the study.
        parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
        optimization_goal: Optimization goal when optimizing a single metric. Can be MAXIMIZE (default) or MINIMIZE. Ignored if metric_specs list is provided.
        metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
    """

    import logging
    import google.auth

    logging.getLogger().setLevel(logging.INFO)

    # Validating and inferring the arguments
    if not gcp_project_id:
        _, gcp_project_id = google.auth.default()

    # Building the API client.
    # The main API does not work, so we need to build from the published discovery document.
    def create_caip_optimizer_client(project_id):
        from google.cloud import storage
        from googleapiclient import discovery
        _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
        _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
        client = storage.Client(project_id)
        bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
        blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
        discovery_document = blob.download_as_bytes()
        return discovery.build_from_document(service=discovery_document)

    ml_api = create_caip_optimizer_client(gcp_project_id)

    if not metric_specs:
        metric_specs=[{
            'metric': 'metric',
            'goal': optimization_goal,
        }]
    study_config = {
        'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
        'parameters': parameter_specs,
        'metrics': metric_specs,
    }
    study = {'study_config': study_config}

    create_study_request = ml_api.projects().locations().studies().create(
        parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
        studyId=study_id,
        body=study,
    )
    create_study_response = create_study_request.execute()
    study_name = create_study_response['name']
    return (study_name,)


if __name__ == '__main__':
    create_study_in_gcp_ai_platform_optimizer_op = create_component_from_func(
        create_study_in_gcp_ai_platform_optimizer,
        base_image='python:3.8',
        packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
        output_component_file='component.yaml',
        annotations={
            "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Create_study/component.yaml",
        },
    )
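Illustrative inputs for the component above (not part of the committed files). The spec layout follows the ParameterSpec REST reference linked in the docstring; the camelCase field names, parameter names and ranges are assumptions to be checked against that reference.

# Hypothetical search space: one continuous, one discrete and one categorical parameter.
example_parameter_specs = [
    {'parameter': 'learning_rate', 'type': 'DOUBLE',
     'doubleValueSpec': {'minValue': 0.0001, 'maxValue': 0.1}},
    {'parameter': 'batch_size', 'type': 'DISCRETE',
     'discreteValueSpec': {'values': [16, 32, 64, 128]}},
    {'parameter': 'optimizer', 'type': 'CATEGORICAL',
     'categoricalValueSpec': {'values': ['adam', 'sgd']}},
]

# The list is passed as the parameter_specs argument of the generated op, e.g.:
# create_study_in_gcp_ai_platform_optimizer_op(
#     study_id='example-study',
#     parameter_specs=example_parameter_specs,
#     optimization_goal='MINIMIZE',
# )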
@@ -0,0 +1,160 @@
name: Create study in gcp ai platform optimizer
description: Creates a Google Cloud AI Platform Optimizer study.
inputs:
- {name: study_id, type: String, description: Name of the study.}
- {name: parameter_specs, type: JsonArray, description: 'List of parameter specs.
    See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec'}
- {name: optimization_goal, type: String, description: Optimization goal when optimizing
    a single metric. Can be MAXIMIZE (default) or MINIMIZE. Ignored if metric_specs
    list is provided., default: MAXIMIZE, optional: true}
- {name: metric_specs, type: JsonArray, description: 'List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec',
    optional: true}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: study_name, type: String}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Create_study/component.yaml'
implementation:
  container:
    image: python:3.8
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def create_study_in_gcp_ai_platform_optimizer(
          study_id,
          parameter_specs,
          optimization_goal = 'MAXIMIZE',
          metric_specs = None,
          gcp_project_id = None,
          gcp_region = "us-central1",
      ):
          """Creates a Google Cloud AI Platform Optimizer study.
          See https://cloud.google.com/ai-platform/optimizer/docs

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>

          Args:
              study_id: Name of the study.
              parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
              optimization_goal: Optimization goal when optimizing a single metric. Can be MAXIMIZE (default) or MINIMIZE. Ignored if metric_specs list is provided.
              metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
          """

          import logging
          import google.auth

          logging.getLogger().setLevel(logging.INFO)

          # Validating and inferring the arguments
          if not gcp_project_id:
              _, gcp_project_id = google.auth.default()

          # Building the API client.
          # The main API does not work, so we need to build from the published discovery document.
          def create_caip_optimizer_client(project_id):
              from google.cloud import storage
              from googleapiclient import discovery
              _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
              _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
              client = storage.Client(project_id)
              bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
              blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
              discovery_document = blob.download_as_bytes()
              return discovery.build_from_document(service=discovery_document)

          ml_api = create_caip_optimizer_client(gcp_project_id)

          if not metric_specs:
              metric_specs=[{
                  'metric': 'metric',
                  'goal': optimization_goal,
              }]
          study_config = {
              'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
              'parameters': parameter_specs,
              'metrics': metric_specs,
          }
          study = {'study_config': study_config}

          create_study_request = ml_api.projects().locations().studies().create(
              parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
              studyId=study_id,
              body=study,
          )
          create_study_response = create_study_request.execute()
          study_name = create_study_response['name']
          return (study_name,)

      def _serialize_str(str_value: str) -> str:
          if not isinstance(str_value, str):
              raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
          return str_value

      import json
      import argparse
      _parser = argparse.ArgumentParser(prog='Create study in gcp ai platform optimizer', description='Creates a Google Cloud AI Platform Optimizer study.')
      _parser.add_argument("--study-id", dest="study_id", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--parameter-specs", dest="parameter_specs", type=json.loads, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--optimization-goal", dest="optimization_goal", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--metric-specs", dest="metric_specs", type=json.loads, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
      _parsed_args = vars(_parser.parse_args())
      _output_files = _parsed_args.pop("_output_paths", [])

      _outputs = create_study_in_gcp_ai_platform_optimizer(**_parsed_args)

      _output_serializers = [
          _serialize_str,

      ]

      import os
      for idx, output_file in enumerate(_output_files):
          try:
              os.makedirs(os.path.dirname(output_file))
          except OSError:
              pass
          with open(output_file, 'w') as f:
              f.write(_output_serializers[idx](_outputs[idx]))
    args:
    - --study-id
    - {inputValue: study_id}
    - --parameter-specs
    - {inputValue: parameter_specs}
    - if:
        cond: {isPresent: optimization_goal}
        then:
        - --optimization-goal
        - {inputValue: optimization_goal}
    - if:
        cond: {isPresent: metric_specs}
        then:
        - --metric-specs
        - {inputValue: metric_specs}
    - if:
        cond: {isPresent: gcp_project_id}
        then:
        - --gcp-project-id
        - {inputValue: gcp_project_id}
    - if:
        cond: {isPresent: gcp_region}
        then:
        - --gcp-region
        - {inputValue: gcp_region}
    - '----output-paths'
    - {outputPath: study_name}
@@ -0,0 +1,188 @@
from typing import NamedTuple

from kfp.components import create_component_from_func


def suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer(
    parameter_specs: list,
    metrics_for_parameter_sets: list,
    suggestion_count: int,
    maximize: bool = False,
    metric_specs: list = None,
    gcp_project_id: str = None,
    gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
    ("suggested_parameter_sets", list),
]):
    """Suggests trials (parameter sets) to evaluate.
    See https://cloud.google.com/ai-platform/optimizer/docs

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>

    Args:
        parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
        metrics_for_parameter_sets: List of parameter sets and evaluation metrics for them. Each list item contains "parameters" dict and "metrics" dict. Example: {"parameters": {"p1": 1.1, "p2": 2.2}, "metrics": {"metric1": 101, "metric2": 102} }
        maximize: Whether to maximize or minimize when optimizing a single metric. Default is to minimize. Ignored if metric_specs list is provided.
        metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
        suggestion_count: Number of suggestions to request.

        suggested_parameter_sets: List of parameter set dictionaries.
    """

    import logging
    import random
    import time

    import google.auth
    from googleapiclient import discovery

    logging.getLogger().setLevel(logging.INFO)

    client_id = 'client1'

    credentials, default_project_id = google.auth.default()

    # Validating and inferring the arguments
    if not gcp_project_id:
        gcp_project_id = default_project_id

    # Building the API client.
    # The main API does not work, so we need to build from the published discovery document.
    def create_caip_optimizer_client(project_id):
        from google.cloud import storage
        _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
        _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
        client = storage.Client(project_id)
        bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
        blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
        discovery_document = blob.download_as_bytes()
        return discovery.build_from_document(service=discovery_document)

    # Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
    def get_project_number(project_id):
        service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
        response = service.projects().get(projectId=project_id).execute()
        return response['projectNumber']

    gcp_project_number = get_project_number(gcp_project_id)

    def fix_resource_name(name):
        return name.replace(gcp_project_number, gcp_project_id)

    ml_api = create_caip_optimizer_client(gcp_project_id)
    studies_api = ml_api.projects().locations().studies()
    trials_api = ml_api.projects().locations().studies().trials()
    operations_api = ml_api.projects().locations().operations()

    random_integer = random.SystemRandom().getrandbits(256)
    study_id = '{:064x}'.format(random_integer)

    if not metric_specs:
        metric_specs=[{
            'metric': 'metric',
            'goal': 'MAXIMIZE' if maximize else 'MINIMIZE',
        }]
    study_config = {
        'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
        'parameters': parameter_specs,
        'metrics': metric_specs,
    }
    study = {'study_config': study_config}

    logging.info(f'Creating temporary study {study_id}')
    create_study_request = studies_api.create(
        parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
        studyId=study_id,
        body=study,
    )
    create_study_response = create_study_request.execute()
    study_name = create_study_response['name']

    paremeter_type_names = {parameter_spec['parameter']: parameter_spec['type'] for parameter_spec in parameter_specs}
    def parameter_name_and_value_to_dict(parameter_name: str, parameter_value) -> dict:
        result = {'parameter': parameter_name}
        paremeter_type_name = paremeter_type_names[parameter_name]
        if paremeter_type_name in ['DOUBLE', 'DISCRETE']:
            result['floatValue'] = parameter_value
        elif paremeter_type_name == 'INTEGER':
            result['intValue'] = parameter_value
        elif paremeter_type_name == 'CATEGORICAL':
            result['stringValue'] = parameter_value
        else:
            raise TypeError(f'Unsupported parameter type "{paremeter_type_name}"')
        return result

    try:
        logging.info(f'Adding {len(metrics_for_parameter_sets)} measurements to the study.')
        for parameters_and_metrics in metrics_for_parameter_sets:
            parameter_set = parameters_and_metrics['parameters']
            metrics_set = parameters_and_metrics['metrics']
            trial = {
                'parameters': [
                    parameter_name_and_value_to_dict(parameter_name, parameter_value)
                    for parameter_name, parameter_value in parameter_set.items()
                ],
                'finalMeasurement': {
                    'metrics': [
                        {
                            'metric': metric_name,
                            'value': metric_value,
                        }
                        for metric_name, metric_value in metrics_set.items()
                    ],
                },
                'state': 'COMPLETED',
            }
            create_trial_response = trials_api.create(
                parent=fix_resource_name(study_name),
                body=trial,
            ).execute()
            trial_name = create_trial_response["name"]
            logging.info(f'Added trial "{trial_name}" to the study.')

        logging.info(f'Requesting suggestions.')
        suggest_trials_request = trials_api.suggest(
            parent=fix_resource_name(study_name),
            body=dict(
                suggestionCount=suggestion_count,
                clientId=client_id,
            ),
        )
        suggest_trials_response = suggest_trials_request.execute()
        operation_name = suggest_trials_response['name']
        while True:
            get_operation_response = operations_api.get(
                name=fix_resource_name(operation_name),
            ).execute()
            # Knowledge: The "done" key is just missing until the result is available
            if get_operation_response.get('done'):
                break
            logging.info('Operation not finished yet: ' + str(get_operation_response))
            time.sleep(10)
        operation_response = get_operation_response['response']
        suggested_trials = operation_response['trials']

        suggested_parameter_sets = [
            {
                parameter['parameter']: parameter.get('floatValue') or parameter.get('intValue') or parameter.get('stringValue') or 0.0
                for parameter in trial['parameters']
            }
            for trial in suggested_trials
        ]
        return (suggested_parameter_sets,)
    finally:
        logging.info(f'Deleting study: "{study_name}"')
        studies_api.delete(name=fix_resource_name(study_name))


if __name__ == '__main__':
    suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer_op = create_component_from_func(
        suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer,
        base_image='python:3.8',
        packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
        output_component_file='component.yaml',
        annotations={
            "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_parameter_sets_based_on_measurements/component.yaml",
        },
    )
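A hedged sketch of the inputs the component above expects, matching the docstring example; the parameter names, values and camelCase spec field names are illustrative assumptions (not part of the committed files).

# Each entry pairs a previously evaluated parameter set with its metrics.
# With the default metric_specs, the single metric is named 'metric'.
example_metrics_for_parameter_sets = [
    {'parameters': {'p1': 1.1, 'p2': 2.2}, 'metrics': {'metric': 101.0}},
    {'parameters': {'p1': 3.3, 'p2': 4.4}, 'metrics': {'metric': 95.0}},
]

# Matching specs: both parameters are DOUBLEs, so suggested values come back as floatValue.
example_parameter_specs = [
    {'parameter': 'p1', 'type': 'DOUBLE', 'doubleValueSpec': {'minValue': 0.0, 'maxValue': 10.0}},
    {'parameter': 'p2', 'type': 'DOUBLE', 'doubleValueSpec': {'minValue': 0.0, 'maxValue': 10.0}},
]

# The op can then be called in a pipeline, e.g.:
# suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer_op(
#     parameter_specs=example_parameter_specs,
#     metrics_for_parameter_sets=example_metrics_for_parameter_sets,
#     suggestion_count=3,
# )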
@@ -0,0 +1,284 @@
name: Suggest parameter sets from measurements using gcp ai platform optimizer
description: Suggests trials (parameter sets) to evaluate.
inputs:
- {name: parameter_specs, type: JsonArray, description: 'List of parameter specs.
    See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec'}
- {name: metrics_for_parameter_sets, type: JsonArray, description: 'List of parameter
    sets and evaluation metrics for them. Each list item contains "parameters" dict
    and "metrics" dict. Example: {"parameters": {"p1": 1.1, "p2": 2.2}, "metrics":
    {"metric1": 101, "metric2": 102} }'}
- {name: suggestion_count, type: Integer, description: Number of suggestions to request.}
- name: maximize
  type: Boolean
  description: Whether to maximize or minimize when optimizing a single metric. Default
    is to minimize. Ignored if metric_specs list is provided.
  default: "False"
  optional: true
- {name: metric_specs, type: JsonArray, description: 'List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec',
    optional: true}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: suggested_parameter_sets, type: JsonArray}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_parameter_sets_based_on_measurements/component.yaml'
implementation:
  container:
    image: python:3.8
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer(
          parameter_specs,
          metrics_for_parameter_sets,
          suggestion_count,
          maximize = False,
          metric_specs = None,
          gcp_project_id = None,
          gcp_region = "us-central1",
      ):
          """Suggests trials (parameter sets) to evaluate.
          See https://cloud.google.com/ai-platform/optimizer/docs

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>

          Args:
              parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
              metrics_for_parameter_sets: List of parameter sets and evaluation metrics for them. Each list item contains "parameters" dict and "metrics" dict. Example: {"parameters": {"p1": 1.1, "p2": 2.2}, "metrics": {"metric1": 101, "metric2": 102} }
              maximize: Whether to maximize or minimize when optimizing a single metric. Default is to minimize. Ignored if metric_specs list is provided.
              metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
              suggestion_count: Number of suggestions to request.

              suggested_parameter_sets: List of parameter set dictionaries.
          """

          import logging
          import random
          import time

          import google.auth
          from googleapiclient import discovery

          logging.getLogger().setLevel(logging.INFO)

          client_id = 'client1'

          credentials, default_project_id = google.auth.default()

          # Validating and inferring the arguments
          if not gcp_project_id:
              gcp_project_id = default_project_id

          # Building the API client.
          # The main API does not work, so we need to build from the published discovery document.
          def create_caip_optimizer_client(project_id):
              from google.cloud import storage
              _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
              _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
              client = storage.Client(project_id)
              bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
              blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
              discovery_document = blob.download_as_bytes()
              return discovery.build_from_document(service=discovery_document)

          # Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
          def get_project_number(project_id):
              service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
              response = service.projects().get(projectId=project_id).execute()
              return response['projectNumber']

          gcp_project_number = get_project_number(gcp_project_id)

          def fix_resource_name(name):
              return name.replace(gcp_project_number, gcp_project_id)

          ml_api = create_caip_optimizer_client(gcp_project_id)
          studies_api = ml_api.projects().locations().studies()
          trials_api = ml_api.projects().locations().studies().trials()
          operations_api = ml_api.projects().locations().operations()

          random_integer = random.SystemRandom().getrandbits(256)
          study_id = '{:064x}'.format(random_integer)

          if not metric_specs:
              metric_specs=[{
                  'metric': 'metric',
                  'goal': 'MAXIMIZE' if maximize else 'MINIMIZE',
              }]
          study_config = {
              'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
              'parameters': parameter_specs,
              'metrics': metric_specs,
          }
          study = {'study_config': study_config}

          logging.info(f'Creating temporary study {study_id}')
          create_study_request = studies_api.create(
              parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
              studyId=study_id,
              body=study,
          )
          create_study_response = create_study_request.execute()
          study_name = create_study_response['name']

          paremeter_type_names = {parameter_spec['parameter']: parameter_spec['type'] for parameter_spec in parameter_specs}
          def parameter_name_and_value_to_dict(parameter_name, parameter_value):
              result = {'parameter': parameter_name}
              paremeter_type_name = paremeter_type_names[parameter_name]
              if paremeter_type_name in ['DOUBLE', 'DISCRETE']:
                  result['floatValue'] = parameter_value
              elif paremeter_type_name == 'INTEGER':
                  result['intValue'] = parameter_value
              elif paremeter_type_name == 'CATEGORICAL':
                  result['stringValue'] = parameter_value
              else:
                  raise TypeError(f'Unsupported parameter type "{paremeter_type_name}"')
              return result

          try:
              logging.info(f'Adding {len(metrics_for_parameter_sets)} measurements to the study.')
              for parameters_and_metrics in metrics_for_parameter_sets:
                  parameter_set = parameters_and_metrics['parameters']
                  metrics_set = parameters_and_metrics['metrics']
                  trial = {
                      'parameters': [
                          parameter_name_and_value_to_dict(parameter_name, parameter_value)
                          for parameter_name, parameter_value in parameter_set.items()
                      ],
                      'finalMeasurement': {
                          'metrics': [
                              {
                                  'metric': metric_name,
                                  'value': metric_value,
                              }
                              for metric_name, metric_value in metrics_set.items()
                          ],
                      },
                      'state': 'COMPLETED',
                  }
                  create_trial_response = trials_api.create(
                      parent=fix_resource_name(study_name),
                      body=trial,
                  ).execute()
                  trial_name = create_trial_response["name"]
                  logging.info(f'Added trial "{trial_name}" to the study.')

              logging.info(f'Requesting suggestions.')
              suggest_trials_request = trials_api.suggest(
                  parent=fix_resource_name(study_name),
                  body=dict(
                      suggestionCount=suggestion_count,
                      clientId=client_id,
                  ),
              )
              suggest_trials_response = suggest_trials_request.execute()
              operation_name = suggest_trials_response['name']
              while True:
                  get_operation_response = operations_api.get(
                      name=fix_resource_name(operation_name),
                  ).execute()
                  # Knowledge: The "done" key is just missing until the result is available
                  if get_operation_response.get('done'):
                      break
                  logging.info('Operation not finished yet: ' + str(get_operation_response))
                  time.sleep(10)
              operation_response = get_operation_response['response']
              suggested_trials = operation_response['trials']

              suggested_parameter_sets = [
                  {
                      parameter['parameter']: parameter.get('floatValue') or parameter.get('intValue') or parameter.get('stringValue') or 0.0
                      for parameter in trial['parameters']
                  }
                  for trial in suggested_trials
              ]
              return (suggested_parameter_sets,)
          finally:
              logging.info(f'Deleting study: "{study_name}"')
              studies_api.delete(name=fix_resource_name(study_name))

      import json
      def _serialize_json(obj) -> str:
          if isinstance(obj, str):
              return obj
          import json
          def default_serializer(obj):
              if hasattr(obj, 'to_struct'):
                  return obj.to_struct()
              else:
                  raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
          return json.dumps(obj, default=default_serializer, sort_keys=True)

      def _deserialize_bool(s) -> bool:
          from distutils.util import strtobool
          return strtobool(s) == 1

      import argparse
      _parser = argparse.ArgumentParser(prog='Suggest parameter sets from measurements using gcp ai platform optimizer', description='Suggests trials (parameter sets) to evaluate.')
      _parser.add_argument("--parameter-specs", dest="parameter_specs", type=json.loads, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--metrics-for-parameter-sets", dest="metrics_for_parameter_sets", type=json.loads, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--suggestion-count", dest="suggestion_count", type=int, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--maximize", dest="maximize", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--metric-specs", dest="metric_specs", type=json.loads, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
      _parsed_args = vars(_parser.parse_args())
      _output_files = _parsed_args.pop("_output_paths", [])

      _outputs = suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer(**_parsed_args)

      _output_serializers = [
          _serialize_json,

      ]

      import os
      for idx, output_file in enumerate(_output_files):
          try:
              os.makedirs(os.path.dirname(output_file))
          except OSError:
              pass
          with open(output_file, 'w') as f:
              f.write(_output_serializers[idx](_outputs[idx]))
    args:
    - --parameter-specs
    - {inputValue: parameter_specs}
    - --metrics-for-parameter-sets
    - {inputValue: metrics_for_parameter_sets}
    - --suggestion-count
    - {inputValue: suggestion_count}
    - if:
        cond: {isPresent: maximize}
        then:
        - --maximize
        - {inputValue: maximize}
    - if:
        cond: {isPresent: metric_specs}
        then:
        - --metric-specs
        - {inputValue: metric_specs}
    - if:
        cond: {isPresent: gcp_project_id}
        then:
        - --gcp-project-id
        - {inputValue: gcp_project_id}
    - if:
        cond: {isPresent: gcp_region}
        then:
        - --gcp-region
        - {inputValue: gcp_region}
    - '----output-paths'
    - {outputPath: suggested_parameter_sets}
@@ -0,0 +1,100 @@
from typing import NamedTuple

from kfp.components import create_component_from_func


def suggest_trials_in_gcp_ai_platform_optimizer(
    study_name: str,
    suggestion_count: int,
    gcp_project_id: str = None,
    gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
    ("suggested_trials", list),
]):
    """Suggests trials (parameter sets) to evaluate.
    See https://cloud.google.com/ai-platform/optimizer/docs

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>

    Args:
        study_name: Full resource name of the study.
        suggestion_count: Number of suggestions to request.
    """

    import logging
    import time

    import google.auth
    from googleapiclient import discovery

    logging.getLogger().setLevel(logging.INFO)

    client_id = 'client1'

    credentials, default_project_id = google.auth.default()

    # Validating and inferring the arguments
    if not gcp_project_id:
        gcp_project_id = default_project_id

    # Building the API client.
    # The main API does not work, so we need to build from the published discovery document.
    def create_caip_optimizer_client(project_id):
        from google.cloud import storage
        _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
        _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
        client = storage.Client(project_id)
        bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
        blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
        discovery_document = blob.download_as_bytes()
        return discovery.build_from_document(service=discovery_document)

    # Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
    def get_project_number(project_id):
        service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
        response = service.projects().get(projectId=project_id).execute()
        return response['projectNumber']

    gcp_project_number = get_project_number(gcp_project_id)

    def fix_resource_name(name):
        return name.replace(gcp_project_number, gcp_project_id)

    ml_api = create_caip_optimizer_client(gcp_project_id)
    trials_api = ml_api.projects().locations().studies().trials()
    operations_api = ml_api.projects().locations().operations()

    suggest_trials_request = trials_api.suggest(
        parent=fix_resource_name(study_name),
        body=dict(
            suggestionCount=suggestion_count,
            clientId=client_id,
        ),
    )
    suggest_trials_response = suggest_trials_request.execute()
    operation_name = suggest_trials_response['name']
    while True:
        get_operation_response = operations_api.get(
            name=fix_resource_name(operation_name),
        ).execute()
        # Knowledge: The "done" key is just missing until the result is available
        if get_operation_response.get('done'):
            break
        logging.info('Not finished yet: ' + str(get_operation_response))
        time.sleep(10)
    operation_response = get_operation_response['response']
    suggested_trials = operation_response['trials']
    return (suggested_trials,)


if __name__ == '__main__':
    suggest_trials_in_gcp_ai_platform_optimizer_op = create_component_from_func(
        suggest_trials_in_gcp_ai_platform_optimizer,
        base_image='python:3.8',
        packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
        output_component_file='component.yaml',
        annotations={
            "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_trials/component.yaml",
        },
    )
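A minimal sketch (not part of the committed files) of how the Suggest_trials and Add_measurement components could be wired into the usual Optimizer loop inside a pipeline. The component file paths, the loop-item field access, and the placeholder metric are assumptions; a real pipeline would insert a training/evaluation step between suggestion and measurement.

from kfp import components, dsl

# Assumption: the component.yaml files generated by the scripts in this commit are available locally.
suggest_trials_op = components.load_component_from_file('Suggest_trials/component.yaml')
add_measurement_op = components.load_component_from_file('Add_measurement_for_trial/component.yaml')

def optimizer_loop_pipeline(study_name: str):
    # Ask the Optimizer for a batch of trials to evaluate.
    suggest_task = suggest_trials_op(study_name=study_name, suggestion_count=3)
    # Fan out over the suggested trials; each item is a trial JSON object.
    with dsl.ParallelFor(suggest_task.outputs['suggested_trials']) as trial:
        add_measurement_op(
            trial_name=trial.name,  # assumption: the "name" field of the trial JSON
            metric_value=0.0,       # placeholder; produced by a training/evaluation step in practice
            complete_trial=True,
        )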
@@ -0,0 +1,163 @@
name: Suggest trials in gcp ai platform optimizer
description: Suggests trials (parameter sets) to evaluate.
inputs:
- {name: study_name, type: String, description: Full resource name of the study.}
- {name: suggestion_count, type: Integer, description: Number of suggestions to request.}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: suggested_trials, type: JsonArray}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_trials/component.yaml'
implementation:
  container:
    image: python:3.8
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
      --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def suggest_trials_in_gcp_ai_platform_optimizer(
          study_name,
          suggestion_count,
          gcp_project_id = None,
          gcp_region = "us-central1",
      ):
          """Suggests trials (parameter sets) to evaluate.
          See https://cloud.google.com/ai-platform/optimizer/docs

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>

          Args:
              study_name: Full resource name of the study.
              suggestion_count: Number of suggestions to request.
          """

          import logging
          import time

          import google.auth
          from googleapiclient import discovery

          logging.getLogger().setLevel(logging.INFO)

          client_id = 'client1'

          credentials, default_project_id = google.auth.default()

          # Validating and inferring the arguments
          if not gcp_project_id:
              gcp_project_id = default_project_id

          # Building the API client.
          # The main API does not work, so we need to build from the published discovery document.
          def create_caip_optimizer_client(project_id):
              from google.cloud import storage
              _OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
              _OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
              client = storage.Client(project_id)
              bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
              blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
              discovery_document = blob.download_as_bytes()
              return discovery.build_from_document(service=discovery_document)

          # Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
          def get_project_number(project_id):
              service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
              response = service.projects().get(projectId=project_id).execute()
              return response['projectNumber']

          gcp_project_number = get_project_number(gcp_project_id)

          def fix_resource_name(name):
              return name.replace(gcp_project_number, gcp_project_id)

          ml_api = create_caip_optimizer_client(gcp_project_id)
          trials_api = ml_api.projects().locations().studies().trials()
          operations_api = ml_api.projects().locations().operations()

          suggest_trials_request = trials_api.suggest(
              parent=fix_resource_name(study_name),
              body=dict(
                  suggestionCount=suggestion_count,
                  clientId=client_id,
              ),
          )
          suggest_trials_response = suggest_trials_request.execute()
          operation_name = suggest_trials_response['name']
          while True:
              get_operation_response = operations_api.get(
                  name=fix_resource_name(operation_name),
              ).execute()
              # Knowledge: The "done" key is just missing until the result is available
              if get_operation_response.get('done'):
                  break
              logging.info('Not finished yet: ' + str(get_operation_response))
              time.sleep(10)
          operation_response = get_operation_response['response']
          suggested_trials = operation_response['trials']
          return (suggested_trials,)

      def _serialize_json(obj) -> str:
          if isinstance(obj, str):
              return obj
          import json
          def default_serializer(obj):
              if hasattr(obj, 'to_struct'):
                  return obj.to_struct()
              else:
                  raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
          return json.dumps(obj, default=default_serializer, sort_keys=True)

      import argparse
      _parser = argparse.ArgumentParser(prog='Suggest trials in gcp ai platform optimizer', description='Suggests trials (parameter sets) to evaluate.')
      _parser.add_argument("--study-name", dest="study_name", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--suggestion-count", dest="suggestion_count", type=int, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
      _parsed_args = vars(_parser.parse_args())
      _output_files = _parsed_args.pop("_output_paths", [])

      _outputs = suggest_trials_in_gcp_ai_platform_optimizer(**_parsed_args)

      _output_serializers = [
          _serialize_json,

      ]

      import os
      for idx, output_file in enumerate(_output_files):
          try:
              os.makedirs(os.path.dirname(output_file))
          except OSError:
              pass
          with open(output_file, 'w') as f:
              f.write(_output_serializers[idx](_outputs[idx]))
    args:
    - --study-name
    - {inputValue: study_name}
    - --suggestion-count
    - {inputValue: suggestion_count}
    - if:
        cond: {isPresent: gcp_project_id}
        then:
        - --gcp-project-id
        - {inputValue: gcp_project_id}
    - if:
        cond: {isPresent: gcp_region}
        then:
        - --gcp-region
        - {inputValue: gcp_region}
    - '----output-paths'
    - {outputPath: suggested_trials}
@ -0,0 +1,153 @@
|
|||
# This pipeline demonstrates hyper-parameter optimization.
|
||||
# The goal is to find a set of hyper-parameter values that helps train the best model.
|
||||
# We launch several optimization stages sequentially.
|
||||
# At each stage the optimizer suggests several parameter sets to explore based on the available measurements.
|
||||
# For each suggested parameter set we train a model (semi-dummy) and measure its quality metrics.
|
||||
# We then collect the metrics for all suggested parameter sets and update our measurement set.
|
||||
# With the expanded set of measurements, each new optimization stage should result in better parameter set suggestions.
|
||||
#
|
||||
# One aspect of this pipeline is the atomicity of the parameter set suggestion.
|
||||
# Some optimizers have a persistent mutable global state that is changed when parameter set metrics are submitted.
|
||||
# The presence of mutable global state may cause reproducibility issues where suggestions for a new model might be based on measurements from a different model.
|
||||
# The "suggest_parameter_sets_from_measurements_op" in this pipeline is a single operation, which behaves like a pure function and does not rely on external global state.
|
||||
|
||||
kfp_endpoint = None
|
||||
|
||||
|
||||
import kfp
|
||||
from kfp import components
|
||||
|
||||
|
||||
suggest_parameter_sets_from_measurements_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/382c4d109fbd489bd85de54dd9171150e326b401/components/google-cloud/Optimizer/Suggest_parameter_sets_based_on_measurements/component.yaml')
|
||||
|
||||
get_element_by_index_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Get_element_by_index/component.yaml')
|
||||
build_dict_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/4a4be6b748b0d1284d65a417ce4ab5bec596e9fe/components/json/Build_dict/component.yaml')
|
||||
build_list_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/4a4be6b748b0d1284d65a417ce4ab5bec596e9fe/components/json/Build_list/component.yaml')
|
||||
combine_lists_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/4a4be6b748b0d1284d65a417ce4ab5bec596e9fe/components/json/Combine_lists/component.yaml')
|
||||
|
||||
|
||||
# The train_and_measure_model is a semi-dummy component that creates a model given the [hyper]parameters and evaluates that model.
|
||||
# In this case, the model is a polynomial model.
|
||||
# The evaluation procedure compares the model with the real function that our model is trying to learn
|
||||
# and calculates the mean squared error based on a random sample of data points.
|
||||
# In real world cases this component will be substituted by a sequence of model trainer, predictor and evaluator components.
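# The toy model below has the form y = p1*x**2 + p2*x + p3 and is scored by the mean squared error
# over 100 randomly sampled data points.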
|
||||
@components.create_component_from_func
|
||||
def train_and_measure_model(parameters: dict) -> float:
|
||||
import random
|
||||
|
||||
def real_function(x):
|
||||
p1 = 3
|
||||
p2 = -1
|
||||
p3 = 2
|
||||
return p1 * x**2 + p2 * x + p3
|
||||
|
||||
def get_eval_set() -> dict:
|
||||
eval_set = {}
|
||||
num_samples = 100
|
||||
for i in range(num_samples):
|
||||
x = random.normalvariate(0, 1) * 5
|
||||
eval_set[x] = real_function(x)
|
||||
return eval_set
|
||||
|
||||
def train_model(parameters):
|
||||
def apply_model(x):
|
||||
return parameters['p1'] * x**2 + parameters['p2'] * x + parameters['p3']
|
||||
return apply_model
|
||||
|
||||
model = train_model(parameters)
|
||||
|
||||
eval_set = get_eval_set()
|
||||
sum_squared_error = 0
|
||||
|
||||
for x, expected_y in eval_set.items():
|
||||
actual_y = model(x)
|
||||
error = abs(expected_y - actual_y)
|
||||
squared_error = error ** 2
|
||||
sum_squared_error += squared_error
|
||||
mean_squared_error = sum_squared_error / len(eval_set)
|
||||
return mean_squared_error
|
||||
|
||||
|
||||
parameter_specs=[
|
||||
{
|
||||
'parameter': 'p1',
|
||||
'type': 'DOUBLE',
|
||||
'double_value_spec' : {
|
||||
'min_value' : -5,
|
||||
'max_value' : 5,
|
||||
}
|
||||
},
|
||||
{
|
||||
'parameter': 'p2',
|
||||
'type': 'DOUBLE',
|
||||
'double_value_spec': {
|
||||
'min_value': -5,
|
||||
'max_value': 5,
|
||||
}
|
||||
},
|
||||
{
|
||||
'parameter': 'p3',
|
||||
'type': 'DOUBLE',
|
||||
'double_value_spec': {
|
||||
'min_value': -5,
|
||||
'max_value': 5,
|
||||
}
|
||||
},
|
||||
]
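# A suggested parameter set produced from these specs is a plain dict mapping parameter names to
# values, e.g. (hypothetical values): {'p1': 1.3, 'p2': -0.7, 'p3': 2.1}.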
|
||||
|
||||
|
||||
def optimizer_pipeline():
|
||||
# Number of optimization stages and suggestions per stage.
|
||||
# Note that these numbers cannot be parametrized, since they're used in compile-time Python loops.
|
||||
optimization_stages = 3
|
||||
suggestions_per_stage = 5
|
||||
|
||||
# We launch several optimization stages sequentially.
|
||||
# At each stage the optimizer suggests several parameter sets to explore based on the available measurements.
|
||||
# Each stage depends on the completion of all trials in the previous stage (since only completed trials affect new trial suggestions).
|
||||
# Each optimization stage should result in better parameter set suggestions.
|
||||
all_metrics_for_parameter_sets = []
|
||||
for stage in range(optimization_stages):
|
||||
parameter_sets = suggest_parameter_sets_from_measurements_op(
|
||||
parameter_specs=parameter_specs,
|
||||
metrics_for_parameter_sets=all_metrics_for_parameter_sets,
|
||||
suggestion_count=suggestions_per_stage,
|
||||
maximize=False,
|
||||
).output
|
||||
|
||||
# Evaluate each suggested set of parameters.
|
||||
# Loop over the suggested trials.
|
||||
# We collect the metrics for each suggested parameter set so that the next round of suggestions can take them into account.
|
||||
# Cannot use dsl.ParallelFor here due to a bug in Argo https://github.com/argoproj/argo-workflows/issues/2660
|
||||
# Without ParallelFor, we have to use a Python loop
|
||||
# and explicitly get individual suggestions using the get_element_by_index_op component
|
||||
# then pass each suggested parameter set to the train_and_measure_model component.
|
||||
new_metrics_for_parameter_sets = []
|
||||
for suggestion_index in range(suggestions_per_stage):
|
||||
parameter_set = get_element_by_index_op(
|
||||
json=parameter_sets,
|
||||
index=suggestion_index,
|
||||
).output
|
||||
|
||||
model_error = train_and_measure_model(
|
||||
parameters=parameter_set,
|
||||
).output
|
||||
|
||||
metric_for_parameter_set = build_dict_op(
|
||||
key_1='parameters',
|
||||
value_1=parameter_set,
|
||||
key_2='metrics',
|
||||
value_2={
|
||||
'metric': model_error,
|
||||
},
|
||||
).output
|
||||
|
||||
new_metrics_for_parameter_sets.append(metric_for_parameter_set)
|
||||
# Collecting metrics for the current stage
|
||||
new_list_of_metrics_for_parameter_sets = build_list_op(*new_metrics_for_parameter_sets).output
|
||||
# Collecting metrics for all stages
|
||||
all_metrics_for_parameter_sets = combine_lists_op(all_metrics_for_parameter_sets, new_list_of_metrics_for_parameter_sets).output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(optimizer_pipeline, arguments={})
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
kfp_endpoint = None
|
||||
|
||||
|
||||
import kfp
|
||||
from kfp import components
|
||||
|
||||
optimizer_create_study_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/40e117cca61fd923a57a1e84cbd08c22dce4bf00/components/google-cloud/Optimizer/Create_study/component.yaml')
|
||||
optimizer_suggest_trials_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/40e117cca61fd923a57a1e84cbd08c22dce4bf00/components/google-cloud/Optimizer/Suggest_trials/component.yaml')
|
||||
optimizer_add_measurement_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/40e117cca61fd923a57a1e84cbd08c22dce4bf00/components/google-cloud/Optimizer/Add_measurement_for_trial/component.yaml')
|
||||
|
||||
get_element_by_index_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Get_element_by_index/component.yaml')
|
||||
get_element_by_key_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Get_element_by_key/component.yaml')
|
||||
query_json_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Query/component.yaml')
|
||||
|
||||
|
||||
# Component that builds a model given the [hyper]parameters and evaluates that model.
|
||||
# In this case, the model is a polynomial model.
|
||||
# The evaluation procedure compares the model with the real function that our model is trying to learn
|
||||
# and calculates the mean squared error based on a random sample of data points.
|
||||
# In real world cases this component will be substituted by a sequence of model trainer, predictor and evaluator components.
|
||||
@components.create_component_from_func
|
||||
def evaluate_model(parameters: dict) -> float:
|
||||
import random
|
||||
|
||||
def real_function(x):
|
||||
p1 = 3
|
||||
p2 = -1
|
||||
p3 = 2
|
||||
return p1 * x**2 + p2 * x + p3
|
||||
|
||||
def evaluate_model(parameters, x):
|
||||
return parameters['p1'] * x**2 + parameters['p2'] * x + parameters['p3']
|
||||
|
||||
sum_squared_error = 0
|
||||
num_samples = 100
|
||||
for i in range(num_samples):
|
||||
x = random.normalvariate(0, 1) * 5
|
||||
real_y = real_function(x)
|
||||
actual_y = evaluate_model(parameters, x)
|
||||
error = abs(real_y - actual_y)
|
||||
squared_error = error ** 2
|
||||
sum_squared_error += squared_error
|
||||
mean_squared_error = sum_squared_error / num_samples
|
||||
return mean_squared_error
|
||||
|
||||
|
||||
def optimizer_pipeline(
|
||||
):
|
||||
optimization_stages = 3
|
||||
trials_per_stage = 5
|
||||
|
||||
study_name = optimizer_create_study_op(
|
||||
study_id='Study4',
|
||||
parameter_specs=[
|
||||
{
|
||||
'parameter': 'p1',
|
||||
'type': 'DOUBLE',
|
||||
'double_value_spec' : {
|
||||
'min_value' : -5,
|
||||
'max_value' : 5,
|
||||
}
|
||||
},
|
||||
{
|
||||
'parameter': 'p2',
|
||||
'type': 'DOUBLE',
|
||||
'double_value_spec': {
|
||||
'min_value': -5,
|
||||
'max_value': 5,
|
||||
}
|
||||
},
|
||||
{
|
||||
'parameter': 'p3',
|
||||
'type': 'DOUBLE',
|
||||
'double_value_spec': {
|
||||
'min_value': -5,
|
||||
'max_value': 5,
|
||||
}
|
||||
},
|
||||
],
|
||||
optimization_goal='MINIMIZE',
|
||||
).outputs['study_name']
|
||||
|
||||
# We launch several optimization stages sequentially.
|
||||
# Each stage depends on the completion of all trials in the previous stage (since only completed trials affect new trial suggestions).
|
||||
# Each optimization stage should result in better parameter set suggestions.
|
||||
trial_measurement_tasks = []
|
||||
for stage in range(optimization_stages):
|
||||
suggest_trials_task = optimizer_suggest_trials_op(
|
||||
study_name=study_name,
|
||||
suggestion_count=trials_per_stage,
|
||||
)
|
||||
suggest_trials_task.after(*trial_measurement_tasks)
|
||||
|
||||
trials = suggest_trials_task.output
|
||||
|
||||
# Evaluate each suggested set of parameters.
|
||||
# Loop over the suggested trials.
|
||||
# We need to collect the created tasks in the `trial_measurement_tasks` list so that the next round of suggestions can depend on their completion.
|
||||
# Cannot use dsl.ParallelFor here due to a bug in Argo https://github.com/argoproj/argo-workflows/issues/2660
|
||||
# Without ParallelFor, we have to use a Python loop
|
||||
# and explicitly get individual suggestions using the get_element_by_index_op component
|
||||
# then extract the trial name and parameter sets using get_element_by_key_op and query_json_op components.
|
||||
trial_measurement_tasks = []
|
||||
for trial_index in range(trials_per_stage):
|
||||
trial = get_element_by_index_op(
|
||||
json=trials,
|
||||
index=trial_index,
|
||||
).output
|
||||
|
||||
trial_name = get_element_by_key_op(
|
||||
json=trial,
|
||||
key='name',
|
||||
).output
|
||||
|
||||
trial_parameters = query_json_op(
|
||||
json=trial,
|
||||
query='.parameters | map( {(.parameter): (.floatValue // .intValue // .stringValue)} ) | add',
|
||||
).output
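# The jq query above turns the trial's parameter list, e.g. (hypothetical)
# [{"parameter": "p1", "floatValue": 1.3}, {"parameter": "p2", "intValue": 2}],
# into a flat dict such as {"p1": 1.3, "p2": 2} that evaluate_model can consume.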
|
||||
|
||||
model_error = evaluate_model(
|
||||
parameters=trial_parameters,
|
||||
).output
|
||||
|
||||
add_measurement_task = optimizer_add_measurement_op(
|
||||
trial_name=trial_name,
|
||||
metric_value=model_error,
|
||||
)
|
||||
|
||||
trial_measurement_tasks.append(add_measurement_task)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(optimizer_pipeline, arguments={})
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
from typing import NamedTuple
|
||||
|
||||
from kfp.components import create_component_from_func, InputPath, OutputPath
|
||||
|
||||
def automl_create_tables_dataset_from_csv(
|
||||
data_path: InputPath('CSV'),
|
||||
target_column_name: str = None,
|
||||
column_nullability: dict = {},
|
||||
column_types: dict = {},
|
||||
gcs_staging_uri: str = None, # Currently AutoML Tables only supports regional buckets in "us-central1".
|
||||
gcp_project_id: str = None,
|
||||
gcp_region: str = 'us-central1', # Currently "us-central1" is the only region supported by AutoML tables.
|
||||
) -> NamedTuple('Outputs', [
|
||||
('dataset_name', str),
|
||||
('dataset_url', 'URI'),
|
||||
]):
|
||||
'''Creates Google Cloud AutoML Tables Dataset from CSV data.
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
|
||||
Args:
|
||||
data_path: Data in CSV format that will be imported to the dataset.
|
||||
target_column_name: Name of the target column for training.
|
||||
column_nullability: Maps column name to boolean specifying whether the column should be marked as nullable.
|
||||
column_types: Maps column name to column type. Supported types: FLOAT64, CATEGORY, STRING.
|
||||
gcs_staging_uri: URI of the data staging location in Google Cloud Storage. The bucket must have the us-central1 region. If not specified, a new staging bucket will be created.
|
||||
gcp_project_id: Google Cloud project ID. If not set, the default one will be used.
|
||||
gcp_region: Google Cloud region. AutoML Tables only supports us-central1.
|
||||
Returns:
|
||||
dataset_name: AutoML dataset name (fully-qualified)
|
||||
'''
|
||||
|
||||
import logging
|
||||
import random
|
||||
|
||||
import google.auth
|
||||
from google.cloud import automl_v1beta1 as automl
|
||||
from google.cloud import storage
|
||||
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
# Validating and inferring the arguments
|
||||
|
||||
if not gcp_project_id:
|
||||
_, gcp_project_id = google.auth.default()
|
||||
|
||||
if not gcp_region:
|
||||
gcp_region = 'us-central1'
|
||||
if gcp_region != 'us-central1':
|
||||
logging.warning('AutoML only supports the us-central1 region')
|
||||
|
||||
dataset_display_name = 'Dataset' # Allowed characters for displayName are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9
|
||||
|
||||
column_nullability = column_nullability or {}
|
||||
for name, nullability in column_nullability.items():
|
||||
assert isinstance(name, str)
|
||||
assert isinstance(nullability, bool)
|
||||
|
||||
column_types = column_types or {}
|
||||
for name, data_type in column_types.items():
|
||||
assert isinstance(name, str)
|
||||
if not hasattr(automl.TypeCode, data_type):
|
||||
supported_types = [type_name for type_name in dir(automl.TypeCode) if type_name[0] != '_']
|
||||
raise ValueError(f'Unknown column type "{data_type}". Supported types: {supported_types}')
|
||||
|
||||
# Generating execution ID for data staging
|
||||
random_integer = random.SystemRandom().getrandbits(256)
|
||||
execution_id = '{:064x}'.format(random_integer)
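# The execution ID is a 256-bit random value rendered as 64 hex characters; it namespaces the files staged for this run.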
|
||||
logging.info(f'Execution ID: {execution_id}')
|
||||
|
||||
logging.info('Uploading the data to storage')
|
||||
# TODO: Split table into < 100MB chunks as required by AutoML Tables
|
||||
storage_client = storage.Client()
|
||||
if gcs_staging_uri:
|
||||
if not gcs_staging_uri.startswith('gs://'):
|
||||
raise ValueError(f"Invalid staging storage URI: {gcs_staging_uri}")
|
||||
(bucket_name, blob_prefix) = gcs_staging_uri[5:].split('/', 1)
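# Example (hypothetical): 'gs://my-staging-bucket/automl/tmp' splits into
# bucket_name='my-staging-bucket' and blob_prefix='automl/tmp'.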
|
||||
bucket = storage_client.get_bucket(bucket_name)
|
||||
else:
|
||||
bucket_name = gcp_project_id + '_staging_' + gcp_region
|
||||
try:
|
||||
bucket = storage_client.get_bucket(bucket_name)
|
||||
except Exception as ex:
|
||||
logging.info(f'Creating Storage bucket {bucket_name}')
|
||||
bucket = storage_client.create_bucket(
|
||||
bucket_or_name=bucket_name,
|
||||
project=gcp_project_id,
|
||||
location=gcp_region,
|
||||
)
|
||||
logging.info(f'Created Storage bucket {bucket.name}')
|
||||
blob_prefix = 'google.cloud.automl_tmp'
|
||||
|
||||
# AutoML Tables import data requires that "the file name must have a (case-insensitive) '.CSV' file extension"
|
||||
training_data_blob_name = blob_prefix.rstrip('/') + '/' + execution_id + '/' + 'training_data.csv'
|
||||
training_data_blob_uri = f'gs://{bucket.name}/{training_data_blob_name}'
|
||||
training_data_blob = bucket.blob(training_data_blob_name)
|
||||
logging.info(f'Uploading training data to {training_data_blob_uri}')
|
||||
training_data_blob.upload_from_filename(data_path)
|
||||
|
||||
logging.info(f'Creating AutoML Tables dataset.')
|
||||
automl_client = automl.AutoMlClient()
|
||||
|
||||
project_location_path = f'projects/{gcp_project_id}/locations/{gcp_region}'
|
||||
|
||||
dataset = automl.Dataset(
|
||||
display_name=dataset_display_name,
|
||||
tables_dataset_metadata=automl.TablesDatasetMetadata(),
|
||||
# labels={},
|
||||
)
|
||||
dataset = automl_client.create_dataset(
|
||||
dataset=dataset,
|
||||
parent=project_location_path,
|
||||
)
|
||||
dataset_id = dataset.name.split('/')[-1]
|
||||
dataset_web_url = f'https://console.cloud.google.com/automl-tables/locations/{gcp_region}/datasets/{dataset_id}'
|
||||
logging.info(f'Created dataset {dataset.name}. Link: {dataset_web_url}')
|
||||
|
||||
logging.info(f'Importing data to the dataset: {dataset.name}.')
|
||||
import_data_input_config = automl.InputConfig(
|
||||
gcs_source=automl.GcsSource(
|
||||
input_uris=[training_data_blob_uri],
|
||||
)
|
||||
)
|
||||
import_data_response = automl_client.import_data(
|
||||
name=dataset.name,
|
||||
input_config=import_data_input_config,
|
||||
)
|
||||
import_data_response.result()
|
||||
dataset = automl_client.get_dataset(
|
||||
name=dataset.name,
|
||||
)
|
||||
logging.info(f'Finished importing data.')
|
||||
|
||||
logging.info('Updating column specs')
|
||||
target_column_spec = None
|
||||
primary_table_spec_name = dataset.name + '/tableSpecs/' + dataset.tables_dataset_metadata.primary_table_spec_id
|
||||
table_specs_list = list(automl_client.list_table_specs(
|
||||
parent=dataset.name,
|
||||
))
|
||||
for table_spec in table_specs_list:
|
||||
table_spec_id = table_spec.name.split('/')[-1]
|
||||
column_specs_list = list(automl_client.list_column_specs(
|
||||
parent=table_spec.name,
|
||||
))
|
||||
is_primary_table = table_spec.name == primary_table_spec_name
|
||||
for column_spec in column_specs_list:
|
||||
if column_spec.display_name == target_column_name and is_primary_table:
|
||||
target_column_spec = column_spec
|
||||
column_updated = False
|
||||
if column_spec.display_name in column_nullability:
|
||||
column_spec.data_type.nullable = column_nullability[column_spec.display_name]
|
||||
column_updated = True
|
||||
if column_spec.display_name in column_types:
|
||||
new_column_type = column_types[column_spec.display_name]
|
||||
column_spec.data_type.type_code = getattr(automl.TypeCode, new_column_type)
|
||||
column_updated = True
|
||||
if column_updated:
|
||||
automl_client.update_column_spec(column_spec=column_spec)
|
||||
|
||||
if target_column_name:
|
||||
logging.info('Setting target column')
|
||||
if not target_column_spec:
|
||||
raise ValueError(f'Primary table does not have column "{target_column_name}"')
|
||||
target_column_spec_id = target_column_spec.name.split('/')[-1]
|
||||
dataset.tables_dataset_metadata.target_column_spec_id = target_column_spec_id
|
||||
dataset = automl_client.update_dataset(dataset=dataset)
|
||||
|
||||
return (dataset.name, dataset_web_url)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
automl_create_tables_dataset_from_csv_op = create_component_from_func(
|
||||
automl_create_tables_dataset_from_csv,
|
||||
base_image='python:3.8',
|
||||
packages_to_install=['google-cloud-automl==2.0.0', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
|
||||
output_component_file='component.yaml',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/AutoML/Tables/Create_dataset/from_CSV/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -0,0 +1,274 @@
|
|||
name: Automl create tables dataset from csv
|
||||
description: Creates Google Cloud AutoML Tables Dataset from CSV data.
|
||||
inputs:
|
||||
- {name: data, type: CSV, description: Data in CSV format that will be imported to
|
||||
the dataset.}
|
||||
- {name: target_column_name, type: String, description: Name of the target column
|
||||
for training., optional: true}
|
||||
- {name: column_nullability, type: JsonObject, description: Maps column name to boolean
|
||||
specifying whether the column should be marked as nullable., default: '{}', optional: true}
|
||||
- {name: column_types, type: JsonObject, description: 'Maps column name to column
|
||||
type. Supported types: FLOAT64, CATEGORY, STRING.', default: '{}', optional: true}
|
||||
- {name: gcs_staging_uri, type: String, description: 'URI of the data staging location
|
||||
in Google Cloud Storage. The bucket must have the us-central1 region. If not specified,
|
||||
a new staging bucket will be created.', optional: true}
|
||||
- {name: gcp_project_id, type: String, description: 'Google Cloud project ID. If not
|
||||
set, the default one will be used.', optional: true}
|
||||
- {name: gcp_region, type: String, description: Google Cloud region. AutoML Tables
|
||||
only supports us-central1., default: us-central1, optional: true}
|
||||
outputs:
|
||||
- {name: dataset_name, type: String}
|
||||
- {name: dataset_url, type: URI}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/AutoML/Tables/Create_dataset/from_CSV/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.8
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==2.0.0' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|
||||
|| PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==2.0.0' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|
||||
--user) && "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_create_tables_dataset_from_csv(
|
||||
data_path,
|
||||
target_column_name = None,
|
||||
column_nullability = {},
|
||||
column_types = {},
|
||||
gcs_staging_uri = None, # Currently AutoML Tables only supports regional buckets in "us-central1".
|
||||
gcp_project_id = None,
|
||||
gcp_region = 'us-central1', # Currently "us-central1" is the only region supported by AutoML tables.
|
||||
):
|
||||
'''Creates Google Cloud AutoML Tables Dataset from CSV data.
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
|
||||
Args:
|
||||
data_path: Data in CSV format that will be imported to the dataset.
|
||||
target_column_name: Name of the target column for training.
|
||||
column_nullability: Maps column name to boolean specifying whether the column should be marked as nullable.
|
||||
column_types: Maps column name to column type. Supported types: FLOAT64, CATEGORY, STRING.
|
||||
gcs_staging_uri: URI of the data staging location in Google Cloud Storage. The bucket must have the us-central1 region. If not specified, a new staging bucket will be created.
|
||||
gcp_project_id: Google Cloud project ID. If not set, the default one will be used.
|
||||
gcp_region: Google Cloud region. AutoML Tables only supports us-central1.
|
||||
Returns:
|
||||
dataset_name: AutoML dataset name (fully-qualified)
|
||||
'''
|
||||
|
||||
import logging
|
||||
import random
|
||||
|
||||
import google.auth
|
||||
from google.cloud import automl_v1beta1 as automl
|
||||
from google.cloud import storage
|
||||
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
# Validating and inferring the arguments
|
||||
|
||||
if not gcp_project_id:
|
||||
_, gcp_project_id = google.auth.default()
|
||||
|
||||
if not gcp_region:
|
||||
gcp_region = 'us-central1'
|
||||
if gcp_region != 'us-central1':
|
||||
logging.warning('AutoML only supports the us-central1 region')
|
||||
|
||||
dataset_display_name = 'Dataset' # Allowed characters for displayName are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9
|
||||
|
||||
column_nullability = column_nullability or {}
|
||||
for name, nullability in column_nullability.items():
|
||||
assert isinstance(name, str)
|
||||
assert isinstance(nullability, bool)
|
||||
|
||||
column_types = column_types or {}
|
||||
for name, data_type in column_types.items():
|
||||
assert isinstance(name, str)
|
||||
if not hasattr(automl.TypeCode, data_type):
|
||||
supported_types = [type_name for type_name in dir(automl.TypeCode) if type_name[0] != '_']
|
||||
raise ValueError(f'Unknown column type "{data_type}". Supported types: {supported_types}')
|
||||
|
||||
# Generating execution ID for data staging
|
||||
random_integer = random.SystemRandom().getrandbits(256)
|
||||
execution_id = '{:064x}'.format(random_integer)
|
||||
logging.info(f'Execution ID: {execution_id}')
|
||||
|
||||
logging.info('Uploading the data to storage')
|
||||
# TODO: Split table into < 100MB chunks as required by AutoML Tables
|
||||
storage_client = storage.Client()
|
||||
if gcs_staging_uri:
|
||||
if not gcs_staging_uri.startswith('gs://'):
|
||||
raise ValueError(f"Invalid staging storage URI: {gcs_staging_uri}")
|
||||
(bucket_name, blob_prefix) = gcs_staging_uri[5:].split('/', 1)
|
||||
bucket = storage_client.get_bucket(bucket_name)
|
||||
else:
|
||||
bucket_name = gcp_project_id + '_staging_' + gcp_region
|
||||
try:
|
||||
bucket = storage_client.get_bucket(bucket_name)
|
||||
except Exception as ex:
|
||||
logging.info(f'Creating Storage bucket {bucket_name}')
|
||||
bucket = storage_client.create_bucket(
|
||||
bucket_or_name=bucket_name,
|
||||
project=gcp_project_id,
|
||||
location=gcp_region,
|
||||
)
|
||||
logging.info(f'Created Storage bucket {bucket.name}')
|
||||
blob_prefix = 'google.cloud.automl_tmp'
|
||||
|
||||
# AutoML Tables import data requires that "the file name must have a (case-insensitive) '.CSV' file extension"
|
||||
training_data_blob_name = blob_prefix.rstrip('/') + '/' + execution_id + '/' + 'training_data.csv'
|
||||
training_data_blob_uri = f'gs://{bucket.name}/{training_data_blob_name}'
|
||||
training_data_blob = bucket.blob(training_data_blob_name)
|
||||
logging.info(f'Uploading training data to {training_data_blob_uri}')
|
||||
training_data_blob.upload_from_filename(data_path)
|
||||
|
||||
logging.info(f'Creating AutoML Tables dataset.')
|
||||
automl_client = automl.AutoMlClient()
|
||||
|
||||
project_location_path = f'projects/{gcp_project_id}/locations/{gcp_region}'
|
||||
|
||||
dataset = automl.Dataset(
|
||||
display_name=dataset_display_name,
|
||||
tables_dataset_metadata=automl.TablesDatasetMetadata(),
|
||||
# labels={},
|
||||
)
|
||||
dataset = automl_client.create_dataset(
|
||||
dataset=dataset,
|
||||
parent=project_location_path,
|
||||
)
|
||||
dataset_id = dataset.name.split('/')[-1]
|
||||
dataset_web_url = f'https://console.cloud.google.com/automl-tables/locations/{gcp_region}/datasets/{dataset_id}'
|
||||
logging.info(f'Created dataset {dataset.name}. Link: {dataset_web_url}')
|
||||
|
||||
logging.info(f'Importing data to the dataset: {dataset.name}.')
|
||||
import_data_input_config = automl.InputConfig(
|
||||
gcs_source=automl.GcsSource(
|
||||
input_uris=[training_data_blob_uri],
|
||||
)
|
||||
)
|
||||
import_data_response = automl_client.import_data(
|
||||
name=dataset.name,
|
||||
input_config=import_data_input_config,
|
||||
)
|
||||
import_data_response.result()
|
||||
dataset = automl_client.get_dataset(
|
||||
name=dataset.name,
|
||||
)
|
||||
logging.info(f'Finished importing data.')
|
||||
|
||||
logging.info('Updating column specs')
|
||||
target_column_spec = None
|
||||
primary_table_spec_name = dataset.name + '/tableSpecs/' + dataset.tables_dataset_metadata.primary_table_spec_id
|
||||
table_specs_list = list(automl_client.list_table_specs(
|
||||
parent=dataset.name,
|
||||
))
|
||||
for table_spec in table_specs_list:
|
||||
table_spec_id = table_spec.name.split('/')[-1]
|
||||
column_specs_list = list(automl_client.list_column_specs(
|
||||
parent=table_spec.name,
|
||||
))
|
||||
is_primary_table = table_spec.name == primary_table_spec_name
|
||||
for column_spec in column_specs_list:
|
||||
if column_spec.display_name == target_column_name and is_primary_table:
|
||||
target_column_spec = column_spec
|
||||
column_updated = False
|
||||
if column_spec.display_name in column_nullability:
|
||||
column_spec.data_type.nullable = column_nullability[column_spec.display_name]
|
||||
column_updated = True
|
||||
if column_spec.display_name in column_types:
|
||||
new_column_type = column_types[column_spec.display_name]
|
||||
column_spec.data_type.type_code = getattr(automl.TypeCode, new_column_type)
|
||||
column_updated = True
|
||||
if column_updated:
|
||||
automl_client.update_column_spec(column_spec=column_spec)
|
||||
|
||||
if target_column_name:
|
||||
logging.info('Setting target column')
|
||||
if not target_column_spec:
|
||||
raise ValueError(f'Primary table does not have column "{target_column_name}"')
|
||||
target_column_spec_id = target_column_spec.name.split('/')[-1]
|
||||
dataset.tables_dataset_metadata.target_column_spec_id = target_column_spec_id
|
||||
dataset = automl_client.update_dataset(dataset=dataset)
|
||||
|
||||
return (dataset.name, dataset_web_url)
|
||||
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl create tables dataset from csv', description='Creates Google Cloud AutoML Tables Dataset from CSV data.')
|
||||
_parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--column-nullability", dest="column_nullability", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--column-types", dest="column_types", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcs-staging-uri", dest="gcs_staging_uri", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_create_tables_dataset_from_csv(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --data
|
||||
- {inputPath: data}
|
||||
- if:
|
||||
cond: {isPresent: target_column_name}
|
||||
then:
|
||||
- --target-column-name
|
||||
- {inputValue: target_column_name}
|
||||
- if:
|
||||
cond: {isPresent: column_nullability}
|
||||
then:
|
||||
- --column-nullability
|
||||
- {inputValue: column_nullability}
|
||||
- if:
|
||||
cond: {isPresent: column_types}
|
||||
then:
|
||||
- --column-types
|
||||
- {inputValue: column_types}
|
||||
- if:
|
||||
cond: {isPresent: gcs_staging_uri}
|
||||
then:
|
||||
- --gcs-staging-uri
|
||||
- {inputValue: gcs_staging_uri}
|
||||
- if:
|
||||
cond: {isPresent: gcp_project_id}
|
||||
then:
|
||||
- --gcp-project-id
|
||||
- {inputValue: gcp_project_id}
|
||||
- if:
|
||||
cond: {isPresent: gcp_region}
|
||||
then:
|
||||
- --gcp-region
|
||||
- {inputValue: gcp_region}
|
||||
- '----output-paths'
|
||||
- {outputPath: dataset_name}
|
||||
- {outputPath: dataset_url}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_create_dataset_for_tables(
|
||||
gcp_project_id: str,
|
||||
gcp_region: str,
|
||||
display_name: str,
|
||||
description: str = None,
|
||||
tables_dataset_metadata: dict = {},
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str), ('dataset_url', 'URI')]):
|
||||
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
|
||||
'''
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
dataset_dict = {
|
||||
'display_name': display_name,
|
||||
'description': description,
|
||||
'tables_dataset_metadata': tables_dataset_metadata,
|
||||
}
|
||||
dataset = client.create_dataset(
|
||||
location_path,
|
||||
dataset_dict,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
print(dataset)
|
||||
dataset_id = dataset.name.rsplit('/', 1)[-1]
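# Example (hypothetical): 'projects/my-project/locations/us-central1/datasets/TBL123' -> 'TBL123'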
|
||||
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
)
|
||||
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_create_dataset_for_tables_op = create_component_from_func(
|
||||
automl_create_dataset_for_tables,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
packages_to_install=['google-cloud-automl==0.4.0'],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml",
|
||||
},
|
||||
)
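# Example usage inside a pipeline function (hypothetical values):
#   dataset_task = automl_create_dataset_for_tables_op(
#       gcp_project_id='my-project',
#       gcp_region='us-central1',
#       display_name='my_dataset',
#   )
#   dataset_path = dataset_task.outputs['dataset_path']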
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
name: Automl create dataset for tables
|
||||
description: automl_create_dataset_for_tables creates an empty Dataset for AutoML
|
||||
tables
|
||||
inputs:
|
||||
- {name: gcp_project_id, type: String}
|
||||
- {name: gcp_region, type: String}
|
||||
- {name: display_name, type: String}
|
||||
- {name: description, type: String, optional: true}
|
||||
- {name: tables_dataset_metadata, type: JsonObject, default: '{}', optional: true}
|
||||
- {name: retry, optional: true}
|
||||
- {name: timeout, type: Float, optional: true}
|
||||
- {name: metadata, type: JsonObject, optional: true}
|
||||
outputs:
|
||||
- {name: dataset_path, type: String}
|
||||
- {name: create_time, type: String}
|
||||
- {name: dataset_id, type: String}
|
||||
- {name: dataset_url, type: URI}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_create_dataset_for_tables(
|
||||
gcp_project_id ,
|
||||
gcp_region ,
|
||||
display_name ,
|
||||
description = None,
|
||||
tables_dataset_metadata = {},
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata = None,
|
||||
) :
|
||||
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
|
||||
'''
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
dataset_dict = {
|
||||
'display_name': display_name,
|
||||
'description': description,
|
||||
'tables_dataset_metadata': tables_dataset_metadata,
|
||||
}
|
||||
dataset = client.create_dataset(
|
||||
location_path,
|
||||
dataset_dict,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
print(dataset)
|
||||
dataset_id = dataset.name.rsplit('/', 1)[-1]
|
||||
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
)
|
||||
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
|
||||
|
||||
import json
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables')
|
||||
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--description", dest="description", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_create_dataset_for_tables(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
_serialize_str,
|
||||
_serialize_str,
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --gcp-project-id
|
||||
- {inputValue: gcp_project_id}
|
||||
- --gcp-region
|
||||
- {inputValue: gcp_region}
|
||||
- --display-name
|
||||
- {inputValue: display_name}
|
||||
- if:
|
||||
cond: {isPresent: description}
|
||||
then:
|
||||
- --description
|
||||
- {inputValue: description}
|
||||
- if:
|
||||
cond: {isPresent: tables_dataset_metadata}
|
||||
then:
|
||||
- --tables-dataset-metadata
|
||||
- {inputValue: tables_dataset_metadata}
|
||||
- if:
|
||||
cond: {isPresent: retry}
|
||||
then:
|
||||
- --retry
|
||||
- {inputValue: retry}
|
||||
- if:
|
||||
cond: {isPresent: timeout}
|
||||
then:
|
||||
- --timeout
|
||||
- {inputValue: timeout}
|
||||
- if:
|
||||
cond: {isPresent: metadata}
|
||||
then:
|
||||
- --metadata
|
||||
- {inputValue: metadata}
|
||||
- '----output-paths'
|
||||
- {outputPath: dataset_path}
|
||||
- {outputPath: create_time}
|
||||
- {outputPath: dataset_id}
|
||||
- {outputPath: dataset_url}
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_create_model_for_tables(
|
||||
gcp_project_id: str,
|
||||
gcp_region: str,
|
||||
display_name: str,
|
||||
dataset_id: str,
|
||||
target_column_path: str = None,
|
||||
input_feature_column_paths: list = None,
|
||||
optimization_objective: str = 'MAXIMIZE_AU_PRC',
|
||||
train_budget_milli_node_hours: int = 1000,
|
||||
) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str), ('model_page_url', 'URI'),]):
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
model_dict = {
|
||||
'display_name': display_name,
|
||||
'dataset_id': dataset_id,
|
||||
'tables_model_metadata': {
|
||||
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
|
||||
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
|
||||
'optimization_objective': optimization_objective,
|
||||
'train_budget_milli_node_hours': train_budget_milli_node_hours,
|
||||
},
|
||||
}
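# Note: train_budget_milli_node_hours is expressed in milli node hours, so the default of 1000
# corresponds to one node hour of training.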
|
||||
|
||||
create_model_response = client.create_model(location_path, model_dict)
|
||||
print('Create model operation: {}'.format(create_model_response.operation))
|
||||
result = create_model_response.result()
|
||||
print(result)
|
||||
model_name = result.name
|
||||
model_id = model_name.rsplit('/', 1)[-1]
|
||||
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
model_id=model_id,
|
||||
)
|
||||
|
||||
return (model_name, model_id, model_url)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_create_model_for_tables_op = create_component_from_func(
|
||||
automl_create_model_for_tables,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
packages_to_install=['google-cloud-automl==0.4.0'],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -0,0 +1,142 @@
|
|||
name: Automl create model for tables
|
||||
inputs:
|
||||
- {name: gcp_project_id, type: String}
|
||||
- {name: gcp_region, type: String}
|
||||
- {name: display_name, type: String}
|
||||
- {name: dataset_id, type: String}
|
||||
- {name: target_column_path, type: String, optional: true}
|
||||
- {name: input_feature_column_paths, type: JsonArray, optional: true}
|
||||
- {name: optimization_objective, type: String, default: MAXIMIZE_AU_PRC, optional: true}
|
||||
- {name: train_budget_milli_node_hours, type: Integer, default: '1000', optional: true}
|
||||
outputs:
|
||||
- {name: model_path, type: String}
|
||||
- {name: model_id, type: String}
|
||||
- {name: model_page_url, type: URI}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_create_model_for_tables(
|
||||
gcp_project_id ,
|
||||
gcp_region ,
|
||||
display_name ,
|
||||
dataset_id ,
|
||||
target_column_path = None,
|
||||
input_feature_column_paths = None,
|
||||
optimization_objective = 'MAXIMIZE_AU_PRC',
|
||||
train_budget_milli_node_hours = 1000,
|
||||
) :
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
model_dict = {
|
||||
'display_name': display_name,
|
||||
'dataset_id': dataset_id,
|
||||
'tables_model_metadata': {
|
||||
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
|
||||
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
|
||||
'optimization_objective': optimization_objective,
|
||||
'train_budget_milli_node_hours': train_budget_milli_node_hours,
|
||||
},
|
||||
}
|
||||
|
||||
create_model_response = client.create_model(location_path, model_dict)
|
||||
print('Create model operation: {}'.format(create_model_response.operation))
|
||||
result = create_model_response.result()
|
||||
print(result)
|
||||
model_name = result.name
|
||||
model_id = model_name.rsplit('/', 1)[-1]
|
||||
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
model_id=model_id,
|
||||
)
|
||||
|
||||
return (model_name, model_id, model_url)
|
||||
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl create model for tables', description='')
|
||||
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_create_model_for_tables(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
_serialize_str,
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --gcp-project-id
|
||||
- {inputValue: gcp_project_id}
|
||||
- --gcp-region
|
||||
- {inputValue: gcp_region}
|
||||
- --display-name
|
||||
- {inputValue: display_name}
|
||||
- --dataset-id
|
||||
- {inputValue: dataset_id}
|
||||
- if:
|
||||
cond: {isPresent: target_column_path}
|
||||
then:
|
||||
- --target-column-path
|
||||
- {inputValue: target_column_path}
|
||||
- if:
|
||||
cond: {isPresent: input_feature_column_paths}
|
||||
then:
|
||||
- --input-feature-column-paths
|
||||
- {inputValue: input_feature_column_paths}
|
||||
- if:
|
||||
cond: {isPresent: optimization_objective}
|
||||
then:
|
||||
- --optimization-objective
|
||||
- {inputValue: optimization_objective}
|
||||
- if:
|
||||
cond: {isPresent: train_budget_milli_node_hours}
|
||||
then:
|
||||
- --train-budget-milli-node-hours
|
||||
- {inputValue: train_budget_milli_node_hours}
|
||||
- '----output-paths'
|
||||
- {outputPath: model_path}
|
||||
- {outputPath: model_id}
|
||||
- {outputPath: model_page_url}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
from typing import NamedTuple
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
|
||||
def automl_deploy_model(
|
||||
model_path: str,
|
||||
) -> NamedTuple('Outputs', [
|
||||
('model_path', str),
|
||||
]):
|
||||
"""Deploys a trained model.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
response = client.deploy_model(
|
||||
name=model_path,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (model_path, )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
automl_deploy_model_op = create_component_from_func(
|
||||
automl_deploy_model,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.8',
|
||||
packages_to_install=[
|
||||
'google-cloud-automl==2.0.0',
|
||||
],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
name: Automl deploy model
|
||||
description: |-
|
||||
Deploys a trained model.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
inputs:
|
||||
- {name: model_path, type: String}
|
||||
outputs:
|
||||
- {name: model_path, type: String}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.8
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_deploy_model(
|
||||
model_path,
|
||||
):
|
||||
"""Deploys a trained model.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
response = client.deploy_model(
|
||||
name=model_path,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (model_path, )
|
||||
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl deploy model', description="Deploys a trained model.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
|
||||
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_deploy_model(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --model-path
|
||||
- {inputValue: model_path}
|
||||
- '----output-paths'
|
||||
- {outputPath: model_path}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_export_data_to_gcs(
|
||||
dataset_path: str,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = {},
|
||||
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
|
||||
"""Exports dataset data to GCS."""
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
|
||||
|
||||
response = client.export_data(
|
||||
name=dataset_path,
|
||||
output_config=output_config,
|
||||
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
|
||||
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata=metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (gcs_output_uri_prefix, )
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_export_data_to_gcs_op = create_component_from_func(
|
||||
automl_export_data_to_gcs,
|
||||
output_component_file='component.yaml',base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
name: Automl export data to gcs
|
||||
description: |
|
||||
Exports dataset data to GCS.
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
- name: gcs_output_uri_prefix
|
||||
optional: true
|
||||
type: String
|
||||
- name: timeout
|
||||
optional: true
|
||||
type: Float
|
||||
- default: '{}'
|
||||
name: metadata
|
||||
optional: true
|
||||
type: JsonObject
|
||||
outputs:
|
||||
- name: gcs_output_uri_prefix
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_export_data_to_gcs(
|
||||
dataset_path: str,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = {},
|
||||
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
|
||||
"""Exports dataset data to GCS."""
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
|
||||
|
||||
response = client.export_data(
|
||||
name=dataset_path,
|
||||
output_config=output_config,
|
||||
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
|
||||
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata=metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (gcs_output_uri_prefix, )
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl export data to gcs', description='Exports dataset data to GCS.\n')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_export_data_to_gcs(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
_output_serializers = [
|
||||
str
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- if:
|
||||
cond:
|
||||
isPresent: gcs_output_uri_prefix
|
||||
then:
|
||||
- --gcs-output-uri-prefix
|
||||
- inputValue: gcs_output_uri_prefix
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: gcs_output_uri_prefix
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
from typing import NamedTuple
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
|
||||
def automl_export_model_to_gcs(
|
||||
model_path: str,
|
||||
gcs_output_uri_prefix: str,
|
||||
model_format: str = 'tf_saved_model',
|
||||
) -> NamedTuple('Outputs', [
|
||||
('model_directory', 'Uri'),
|
||||
]):
|
||||
"""Exports a trained model to a user specified Google Cloud Storage location.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
|
||||
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
|
||||
client = automl.AutoMlClient()
|
||||
response = client.export_model(
|
||||
name=model_path,
|
||||
output_config=automl.ModelExportOutputConfig(
|
||||
model_format=model_format,
|
||||
gcs_destination=automl.GcsDestination(
|
||||
output_uri_prefix=gcs_output_uri_prefix,
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (metadata.export_model_details.output_info.gcs_output_directory, )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
automl_export_model_to_gcs_op = create_component_from_func(
|
||||
automl_export_model_to_gcs,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.8',
|
||||
packages_to_install=[
|
||||
'google-cloud-automl==2.0.0',
|
||||
],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml",
|
||||
},
|
||||
)
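
# --- Hypothetical usage sketch (not part of the original component code) ---
# The function above is plain Python, so it can also be called directly when
# valid GCP credentials are available. The model path and bucket below are
# placeholders, not real resources.
def _export_model_example():
    (model_directory,) = automl_export_model_to_gcs(
        model_path='projects/<project>/locations/us-central1/models/<model>',
        gcs_output_uri_prefix='gs://<bucket>/automl_exports/',
        model_format='tf_saved_model',
    )
    # The returned directory comes from the long-running operation metadata.
    print('Exported model to:', model_directory)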
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
name: Automl export model to gcs
|
||||
description: |-
|
||||
Exports a trained model to a user specified Google Cloud Storage location.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
|
||||
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
inputs:
|
||||
- {name: model_path, type: String}
|
||||
- {name: gcs_output_uri_prefix, type: String}
|
||||
- {name: model_format, type: String, default: tf_saved_model, optional: true}
|
||||
outputs:
|
||||
- {name: model_directory, type: Uri}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.8
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_export_model_to_gcs(
|
||||
model_path,
|
||||
gcs_output_uri_prefix,
|
||||
model_format = 'tf_saved_model',
|
||||
):
|
||||
"""Exports a trained model to a user specified Google Cloud Storage location.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
|
||||
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
|
||||
client = automl.AutoMlClient()
|
||||
response = client.export_model(
|
||||
name=model_path,
|
||||
output_config=automl.ModelExportOutputConfig(
|
||||
model_format=model_format,
|
||||
gcs_destination=automl.GcsDestination(
|
||||
output_uri_prefix=gcs_output_uri_prefix,
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (metadata.export_model_details.output_info.gcs_output_directory, )
|
||||
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl export model to gcs', description="Exports a trained model to a user specified Google Cloud Storage location.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.\n model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
|
||||
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--model-format", dest="model_format", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_export_model_to_gcs(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --model-path
|
||||
- {inputValue: model_path}
|
||||
- --gcs-output-uri-prefix
|
||||
- {inputValue: gcs_output_uri_prefix}
|
||||
- if:
|
||||
cond: {isPresent: model_format}
|
||||
then:
|
||||
- --model-format
|
||||
- {inputValue: model_format}
|
||||
- '----output-paths'
|
||||
- {outputPath: model_directory}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_import_data_from_bigquery(
|
||||
dataset_path,
|
||||
input_uri: str,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'bigquery_source': {
|
||||
'input_uri': input_uri,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path, )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
automl_import_data_from_bigquery_op = create_component_from_func(
|
||||
automl_import_data_from_bigquery,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
name: Automl import data from bigquery
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
- name: input_uri
|
||||
type: String
|
||||
- name: retry
|
||||
optional: true
|
||||
- name: timeout
|
||||
optional: true
|
||||
- name: metadata
|
||||
type: JsonObject
|
||||
optional: true
|
||||
outputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_import_data_from_bigquery(
|
||||
dataset_path,
|
||||
input_uri: str,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'bigquery_source': {
|
||||
'input_uri': input_uri,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path, )
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl import data from bigquery', description='')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--input-uri", dest="input_uri", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_import_data_from_bigquery(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- --input-uri
|
||||
- inputValue: input_uri
|
||||
- if:
|
||||
cond:
|
||||
isPresent: retry
|
||||
then:
|
||||
- --retry
|
||||
- inputValue: retry
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: dataset_path
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_import_data_from_gcs(
|
||||
dataset_path: str,
|
||||
input_uris: list,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'gcs_source': {
|
||||
'input_uris': input_uris,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path, )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_import_data_from_gcs_op = create_component_from_func(
|
||||
automl_import_data_from_gcs,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml",
|
||||
},
|
||||
)
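
# --- Hypothetical usage sketch (not part of the original component code) ---
# A minimal pipeline that imports CSV rows from GCS into an existing AutoML
# dataset. The dataset path and input URI defaults are placeholders.
import json
import kfp.dsl as dsl
from kfp.components import create_component_from_func

_automl_import_data_from_gcs_example_op = create_component_from_func(
    automl_import_data_from_gcs,
    base_image='python:3.7',
)

@dsl.pipeline(name='AutoML import data from GCS example')
def automl_import_data_pipeline(
    dataset_path: str = 'projects/<project>/locations/us-central1/datasets/<dataset>',
    input_uris: str = json.dumps(['gs://<bucket>/data/train.csv']),
):
    import_task = _automl_import_data_from_gcs_example_op(
        dataset_path=dataset_path,
        input_uris=input_uris,
    )
    # import_task.outputs['dataset_path'] is available for downstream steps.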
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
name: Automl import data from gcs
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
- name: input_uris
|
||||
type: JsonArray
|
||||
- name: retry
|
||||
optional: true
|
||||
- name: timeout
|
||||
optional: true
|
||||
- name: metadata
|
||||
type: JsonObject
|
||||
optional: true
|
||||
outputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_import_data_from_gcs(
|
||||
dataset_path: str,
|
||||
input_uris: list,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'gcs_source': {
|
||||
'input_uris': input_uris,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path, )
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl import data from gcs', description='')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--input-uris", dest="input_uris", type=json.loads, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_import_data_from_gcs(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- --input-uris
|
||||
- inputValue: input_uris
|
||||
- if:
|
||||
cond:
|
||||
isPresent: retry
|
||||
then:
|
||||
- --retry
|
||||
- inputValue: retry
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: dataset_path
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_prediction_service_batch_predict(
|
||||
model_path,
|
||||
gcs_input_uris: list = None,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
bq_input_uri: str = None,
|
||||
bq_output_uri: str = None,
|
||||
params=None,
|
||||
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
input_config = {}
|
||||
if gcs_input_uris:
|
||||
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
|
||||
if bq_input_uri:
|
||||
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
|
||||
|
||||
output_config = {}
|
||||
if gcs_output_uri_prefix:
|
||||
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
|
||||
if bq_output_uri:
|
||||
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.PredictionServiceClient()
|
||||
response = client.batch_predict(
|
||||
model_path,
|
||||
input_config,
|
||||
output_config,
|
||||
params,
|
||||
retry,
|
||||
timeout,
|
||||
metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
output_info = metadata.batch_predict_details.output_info
|
||||
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
|
||||
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_prediction_service_batch_predict_op = create_component_from_func(
|
||||
automl_prediction_service_batch_predict,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml",
|
||||
},
|
||||
)
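
# --- Hypothetical usage sketch (not part of the original component code) ---
# Running a batch prediction against GCS inputs. The model path, input URI and
# output prefix defaults are placeholders. The unused BigQuery output comes
# back as the '-' placeholder because of the Argo empty-output workaround above.
import json
import kfp.dsl as dsl
from kfp.components import create_component_from_func

_automl_batch_predict_example_op = create_component_from_func(
    automl_prediction_service_batch_predict,
    base_image='python:3.7',
)

@dsl.pipeline(name='AutoML batch predict example')
def automl_batch_predict_pipeline(
    model_path: str = 'projects/<project>/locations/us-central1/models/<model>',
    gcs_input_uris: str = json.dumps(['gs://<bucket>/batch_inputs/input.csv']),
    gcs_output_uri_prefix: str = 'gs://<bucket>/batch_outputs/',
):
    predict_task = _automl_batch_predict_example_op(
        model_path=model_path,
        gcs_input_uris=gcs_input_uris,
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )
    # predict_task.outputs['gcs_output_directory'] points at the results;
    # predict_task.outputs['bigquery_output_dataset'] is '-' for a GCS-only run.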
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
name: Automl prediction service batch predict
|
||||
inputs:
|
||||
- name: model_path
|
||||
- name: gcs_input_uris
|
||||
type: JsonArray
|
||||
optional: true
|
||||
- name: gcs_output_uri_prefix
|
||||
type: String
|
||||
optional: true
|
||||
- name: bq_input_uri
|
||||
type: String
|
||||
optional: true
|
||||
- name: bq_output_uri
|
||||
type: String
|
||||
optional: true
|
||||
- name: params
|
||||
optional: true
|
||||
- name: retry
|
||||
optional: true
|
||||
- name: timeout
|
||||
optional: true
|
||||
- name: metadata
|
||||
type: JsonObject
|
||||
optional: true
|
||||
outputs:
|
||||
- name: gcs_output_directory
|
||||
type: String
|
||||
- name: bigquery_output_dataset
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_prediction_service_batch_predict(
|
||||
model_path,
|
||||
gcs_input_uris: str = None,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
bq_input_uri: str = None,
|
||||
bq_output_uri: str = None,
|
||||
params=None,
|
||||
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
input_config = {}
|
||||
if gcs_input_uris:
|
||||
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
|
||||
if bq_input_uri:
|
||||
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
|
||||
|
||||
output_config = {}
|
||||
if gcs_output_uri_prefix:
|
||||
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
|
||||
if bq_output_uri:
|
||||
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.PredictionServiceClient()
|
||||
response = client.batch_predict(
|
||||
model_path,
|
||||
input_config,
|
||||
output_config,
|
||||
params,
|
||||
retry,
|
||||
timeout,
|
||||
metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
output_info = metadata.batch_predict_details.output_info
|
||||
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
|
||||
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl prediction service batch predict', description='')
|
||||
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--gcs-input-uris", dest="gcs_input_uris", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--bq-input-uri", dest="bq_input_uri", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--bq-output-uri", dest="bq_output_uri", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--params", dest="params", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_prediction_service_batch_predict(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --model-path
|
||||
- inputValue: model_path
|
||||
- if:
|
||||
cond:
|
||||
isPresent: gcs_input_uris
|
||||
then:
|
||||
- --gcs-input-uris
|
||||
- inputValue: gcs_input_uris
|
||||
- if:
|
||||
cond:
|
||||
isPresent: gcs_output_uri_prefix
|
||||
then:
|
||||
- --gcs-output-uri-prefix
|
||||
- inputValue: gcs_output_uri_prefix
|
||||
- if:
|
||||
cond:
|
||||
isPresent: bq_input_uri
|
||||
then:
|
||||
- --bq-input-uri
|
||||
- inputValue: bq_input_uri
|
||||
- if:
|
||||
cond:
|
||||
isPresent: bq_output_uri
|
||||
then:
|
||||
- --bq-output-uri
|
||||
- inputValue: bq_output_uri
|
||||
- if:
|
||||
cond:
|
||||
isPresent: params
|
||||
then:
|
||||
- --params
|
||||
- inputValue: params
|
||||
- if:
|
||||
cond:
|
||||
isPresent: retry
|
||||
then:
|
||||
- --retry
|
||||
- inputValue: retry
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: gcs_output_directory
|
||||
- outputPath: bigquery_output_dataset
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_split_dataset_table_column_names(
|
||||
dataset_path: str,
|
||||
target_column_name: str,
|
||||
table_index: int = 0,
|
||||
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
list_table_specs_response = client.list_table_specs(dataset_path)
|
||||
table_specs = [s for s in list_table_specs_response]
|
||||
print('table_specs=')
|
||||
print(table_specs)
|
||||
table_spec_name = table_specs[table_index].name
|
||||
|
||||
list_column_specs_response = client.list_column_specs(table_spec_name)
|
||||
column_specs = [s for s in list_column_specs_response]
|
||||
print('column_specs=')
|
||||
print(column_specs)
|
||||
|
||||
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
|
||||
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
|
||||
feature_column_names = [s.name for s in feature_column_specs]
|
||||
|
||||
import json
|
||||
return (target_column_spec.name, json.dumps(feature_column_names))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_split_dataset_table_column_names_op = create_component_from_func(
|
||||
automl_split_dataset_table_column_names,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml",
|
||||
},
|
||||
)
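
# --- Hypothetical usage sketch (not part of the original component code) ---
# Calling the helper directly and decoding its outputs. The dataset path and
# target column name are placeholders, not real resources.
def _split_columns_example():
    import json
    target_column_path, feature_column_paths_json = automl_split_dataset_table_column_names(
        dataset_path='projects/<project>/locations/us-central1/datasets/<dataset>',
        target_column_name='label',
    )
    feature_column_paths = json.loads(feature_column_paths_json)
    # The target column spec name and the remaining feature column spec names
    # can be passed on to an AutoML Tables model-training step.
    print('Target column:', target_column_path)
    print('Number of feature columns:', len(feature_column_paths))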
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
name: Automl split dataset table column names
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
- name: target_column_name
|
||||
type: String
|
||||
- name: table_index
|
||||
type: Integer
|
||||
default: '0'
|
||||
optional: true
|
||||
outputs:
|
||||
- name: target_column_path
|
||||
type: String
|
||||
- name: feature_column_paths
|
||||
type: JsonArray
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_split_dataset_table_column_names(
|
||||
dataset_path: str,
|
||||
target_column_name: str,
|
||||
table_index: int = 0,
|
||||
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
list_table_specs_response = client.list_table_specs(dataset_path)
|
||||
table_specs = [s for s in list_table_specs_response]
|
||||
print('table_specs=')
|
||||
print(table_specs)
|
||||
table_spec_name = table_specs[table_index].name
|
||||
|
||||
list_column_specs_response = client.list_column_specs(table_spec_name)
|
||||
column_specs = [s for s in list_column_specs_response]
|
||||
print('column_specs=')
|
||||
print(column_specs)
|
||||
|
||||
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
|
||||
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
|
||||
feature_column_names = [s.name for s in feature_column_specs]
|
||||
|
||||
import json
|
||||
return (target_column_spec.name, json.dumps(feature_column_names))
|
||||
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_split_dataset_table_column_names(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- --target-column-name
|
||||
- inputValue: target_column_name
|
||||
- if:
|
||||
cond:
|
||||
isPresent: table_index
|
||||
then:
|
||||
- --table-index
|
||||
- inputValue: table_index
|
||||
- '----output-paths'
|
||||
- outputPath: target_column_path
|
||||
- outputPath: feature_column_paths
|
||||
|
|
@ -0,0 +1,176 @@
|
|||
|
||||
# Name
|
||||
Component: Data processing by creating a cluster in Cloud Dataproc
|
||||
|
||||
|
||||
# Label
|
||||
Cloud Dataproc, Kubeflow
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
Other
|
||||
|
||||
Technique:
|
||||
Other
|
||||
|
||||
Input data type:
|
||||
Tabular
|
||||
|
||||
ML workflow:
|
||||
Data preparation
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to create a cluster in Cloud Dataproc.
|
||||
|
||||
# Details
|
||||
## Intended use
|
||||
|
||||
Use this component at the start of a Kubeflow pipeline to create a temporary Cloud Dataproc cluster to run Cloud Dataproc jobs as steps in the pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|----------|-------------|----------|-----------|-----------------|---------|
|
||||
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |
|
||||
| region | The Cloud Dataproc region to create the cluster in. | No | GCPRegion | | |
|
||||
| name | The name of the cluster. Cluster names within a project must be unique. You can reuse the names of deleted clusters. | Yes | String | | None |
|
||||
| name_prefix | The prefix of the cluster name. | Yes | String | | None |
|
||||
| initialization_actions | A list of Cloud Storage URIs identifying executables to execute on each node after the configuration is completed. By default, executables are run on the master and all the worker nodes. | Yes | List | | None |
|
||||
| config_bucket | The Cloud Storage bucket to use to stage the job dependencies, the configuration files, and the job driver console’s output. | Yes | GCSPath | | None |
|
||||
| image_version | The version of the software inside the cluster. | Yes | String | | None |
|
||||
| cluster | The full [cluster configuration](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster). | Yes | Dict | | None |
|
||||
| wait_interval | The number of seconds to pause before polling the operation. | Yes | Integer | | 30 |
|
||||
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
cluster_name | The name of the cluster. | String
|
||||
|
||||
Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).
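
Because the cluster is typically temporary, a common pattern is to pair this component with the delete cluster component inside a `dsl.ExitHandler`, so the cluster is torn down even when later steps fail. The sketch below is illustrative only; the delete component's URL and parameter names are assumptions based on the repository path linked above.

```python
import kfp.dsl as dsl
import kfp.components as comp

# Assumed component locations; adjust the release tag to the one you use.
dataproc_create_cluster_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')
dataproc_delete_cluster_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/delete_cluster/component.yaml')

@dsl.pipeline(name='Dataproc create and delete cluster example')
def dataproc_cluster_lifecycle_pipeline(
    project_id: str = '<Put your project ID here>',
    region: str = 'us-central1',
    name: str = 'kfp-temporary-cluster',
):
    delete_task = dataproc_delete_cluster_op(
        project_id=project_id, region=region, name=name)
    with dsl.ExitHandler(delete_task):
        create_task = dataproc_create_cluster_op(
            project_id=project_id, region=region, name=name)
        # Dataproc job steps that use create_task.outputs['cluster_name'] go here.
```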
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
* Set up the GCP project by following these [steps](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the following types of access to the Kubeflow user service account:
|
||||
* Read access to the Cloud Storage buckets which contain the initialization action files.
|
||||
* The role, `roles/dataproc.editor`, on the project.
|
||||
|
||||
## Detailed description
|
||||
|
||||
This component creates a new Dataproc cluster by using the [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component by using the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_create_cluster_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')
|
||||
help(dataproc_create_cluster_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
The following sample code works in an IPython notebook or directly in Python code. It shows how to set the sample parameters, define a pipeline that uses the component, compile it, and submit it for execution.
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
```python
|
||||
# Required parameters
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
|
||||
# Optional parameters
|
||||
EXPERIMENT_NAME = 'Dataproc - Create Cluster'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc create cluster pipeline',
|
||||
description='Dataproc create cluster pipeline'
|
||||
)
|
||||
def dataproc_create_cluster_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = 'us-central1',
|
||||
name='',
|
||||
name_prefix='',
|
||||
initialization_actions='',
|
||||
config_bucket='',
|
||||
image_version='',
|
||||
cluster='',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_create_cluster_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
name=name,
|
||||
name_prefix=name_prefix,
|
||||
initialization_actions=initialization_actions,
|
||||
config_bucket=config_bucket,
|
||||
image_version=image_version,
|
||||
cluster=cluster,
|
||||
wait_interval=wait_interval)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
#Compile the pipeline
|
||||
pipeline_func = dataproc_create_cluster_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)
|
||||
* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_create_cluster
|
||||
description: |
|
||||
Creates a DataProc cluster under a project.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: 'Required. The Cloud Dataproc region in which to handle the request.'
|
||||
type: GCPRegion
|
||||
- name: name
|
||||
description: >-
|
||||
Optional. The cluster name. Cluster names within a project must be unique. Names of
|
||||
deleted clusters can be reused
|
||||
default: ''
|
||||
type: String
|
||||
- name: name_prefix
|
||||
description: 'Optional. The prefix of the cluster name.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: initialization_actions
|
||||
description: >-
|
||||
Optional. List of GCS URIs of executables to execute on each node after config
|
||||
is completed. By default, executables are run on master and all worker nodes.
|
||||
default: ''
|
||||
type: List
|
||||
- name: config_bucket
|
||||
description: >-
|
||||
Optional. A Google Cloud Storage bucket used to stage job dependencies, config
|
||||
files, and job driver console output.
|
||||
default: ''
|
||||
type: GCSPath
|
||||
- name: image_version
|
||||
description: 'Optional. The version of software inside the cluster.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: cluster
|
||||
description: >-
|
||||
Optional. The full cluster config. See
|
||||
[full details](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster)
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: 'Optional. The wait seconds between polling the operation. Defaults to 30.'
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: cluster_name
|
||||
description: 'The cluster name of the created cluster.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, create_cluster,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--name, {inputValue: name},
|
||||
--name_prefix, {inputValue: name_prefix},
|
||||
--initialization_actions, {inputValue: initialization_actions},
|
||||
--config_bucket, {inputValue: config_bucket},
|
||||
--image_version, {inputValue: image_version},
|
||||
--cluster, {inputValue: cluster},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--cluster_name_output_path, {outputPath: cluster_name},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,245 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"Data processing by creating a cluster in Cloud Dataproc\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, cluster, GCP, Cloud Storage, KubeFlow, Pipeline\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to create a cluster in Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"## Intended use\n",
|
||||
"\n",
|
||||
"Use this component at the start of a Kubeflow Pipeline to create a temporary Cloud Dataproc cluster to run Cloud Dataproc jobs as steps in the pipeline.\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------|-------------|----------|-----------|-----------------|---------|\n",
|
||||
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |\n",
|
||||
"| region | The Cloud Dataproc region to create the cluster in. | No | GCPRegion | | |\n",
|
||||
"| name | The name of the cluster. Cluster names within a project must be unique. You can reuse the names of deleted clusters. | Yes | String | | None |\n",
|
||||
"| name_prefix | The prefix of the cluster name. | Yes | String | | None |\n",
|
||||
"| initialization_actions | A list of Cloud Storage URIs identifying executables to execute on each node after the configuration is completed. By default, executables are run on the master and all the worker nodes. | Yes | List | | None |\n",
|
||||
"| config_bucket | The Cloud Storage bucket to use to stage the job dependencies, the configuration files, and the job driver console’s output. | Yes | GCSPath | | None |\n",
|
||||
"| image_version | The version of the software inside the cluster. | Yes | String | | None |\n",
|
||||
"| cluster | The full [cluster configuration](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster). | Yes | Dict | | None |\n",
|
||||
"| wait_interval | The number of seconds to pause before polling the operation. | Yes | Integer | | 30 |\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"cluster_name | The name of the cluster. | String\n",
|
||||
"\n",
|
||||
"Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up the GCP project by following these [steps](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the following types of access to the Kubeflow user service account:\n",
|
||||
" * Read access to the Cloud Storage buckets which contains initialization action files.\n",
|
||||
" * The role, `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"\n",
|
||||
"This component creates a new Dataproc cluster by using the [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create). \n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_create_cluster_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')\n",
|
||||
"help(dataproc_create_cluster_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Required Parameters\n",
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"\n",
|
||||
"# Optional Parameters\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Create Cluster'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc create cluster pipeline',\n",
|
||||
" description='Dataproc create cluster pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_create_cluster_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = 'us-central1', \n",
|
||||
" name='', \n",
|
||||
" name_prefix='',\n",
|
||||
" initialization_actions='', \n",
|
||||
" config_bucket='', \n",
|
||||
" image_version='', \n",
|
||||
" cluster='', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_create_cluster_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" name=name, \n",
|
||||
" name_prefix=name_prefix, \n",
|
||||
" initialization_actions=initialization_actions, \n",
|
||||
" config_bucket=config_bucket, \n",
|
||||
" image_version=image_version, \n",
|
||||
" cluster=cluster, \n",
|
||||
" wait_interval=wait_interval)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_create_cluster_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)\n",
|
||||
"* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
|
||||
# Name
|
||||
|
||||
Component: Data preparation by deleting a cluster in Cloud Dataproc
|
||||
|
||||
# Label
|
||||
Cloud Dataproc, Kubeflow
|
||||
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to delete a cluster in Cloud Dataproc.
|
||||
|
||||
## Intended use
|
||||
Use this component to delete a temporary Cloud Dataproc cluster that was created to run Cloud Dataproc jobs as steps in a Kubeflow pipeline. This component is usually used with an [exit handler](https://github.com/kubeflow/pipelines/blob/master/samples/core/exit_handler/exit_handler.py) so that it runs at the end of the pipeline; see the sketch after the example pipeline below.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
|
||||
Technique:
|
||||
|
||||
Input data type:
|
||||
|
||||
ML workflow:
|
||||
|
||||
## Runtime arguments
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|:----------|:-------------|:----------|:-----------|:-----------------|:---------|
|
||||
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | - | - |
|
||||
| region | The Cloud Dataproc region in which to handle the request. | No | GCPRegion | - | - |
|
||||
| name | The name of the cluster to delete. | No | String | - | - |
|
||||
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
To use the component, you must:
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role, `roles/dataproc.editor`, on the project.
|
||||
|
||||
## Detailed description
|
||||
This component deletes a Dataproc cluster by using [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component by using the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_delete_cluster_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/delete_cluster/component.yaml')
|
||||
help(dataproc_delete_cluster_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
|
||||
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
[Create a Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) before running the sample code.
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
|
||||
REGION = 'us-central1'
|
||||
EXPERIMENT_NAME = 'Dataproc - Delete Cluster'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc delete cluster pipeline',
|
||||
description='Dataproc delete cluster pipeline'
|
||||
)
|
||||
def dataproc_delete_cluster_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
name = CLUSTER_NAME
|
||||
):
|
||||
dataproc_delete_cluster_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
name=name)
|
||||
```
|
||||
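
The delete op typically runs as cleanup. Below is a minimal sketch, not part of the original sample, that wraps the pipeline steps in an exit handler so the cluster is deleted even when a step fails; it assumes the create-cluster component from the same 1.7.0-rc.3 release is loaded next to the delete op above, and the inner step is only a placeholder for your own workload.


```python
import kfp.dsl as dsl

# Assumption: the create-cluster component from the same release.
dataproc_create_cluster_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')

@dsl.pipeline(
    name='Dataproc cluster with cleanup',
    description='Deletes the cluster after the wrapped steps finish or fail'
)
def dataproc_cluster_with_cleanup_pipeline(
    project_id = PROJECT_ID,
    region = REGION,
    name = CLUSTER_NAME
):
    # The exit op runs after everything inside the ExitHandler block,
    # whether those steps succeed or fail.
    delete_op = dataproc_delete_cluster_op(
        project_id=project_id,
        region=region,
        name=name)
    with dsl.ExitHandler(exit_op=delete_op):
        # Placeholder workload: a create-cluster step stands in for the
        # steps that actually use the cluster.
        dataproc_create_cluster_op(
            project_id=project_id,
            region=region,
            name=name)
```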
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = dataproc_delete_cluster_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)
|
||||
* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)
|
||||
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_delete_cluster
|
||||
description: |
|
||||
Deletes a Dataproc cluster.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: name
|
||||
description: 'Required. The cluster name to delete.'
|
||||
type: String
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: 'Optional. The wait seconds between polling the operation. Defaults to 30.'
|
||||
type: Integer
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
kfp_component.google.dataproc, delete_cluster,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--name, {inputValue: name},
|
||||
--wait_interval, {inputValue: wait_interval}
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"\n",
|
||||
"Data preparation by deleting a cluster in Cloud Dataproc\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, cluster, GCP, Cloud Storage, Kubeflow, Pipeline\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to delete a cluster in Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"## Intended use\n",
|
||||
"Use this component at the start of a Kubeflow Pipeline to delete a temporary Cloud Dataproc \n",
|
||||
"cluster to run Cloud Dataproc jobs as steps in the pipeline. This component is usually \n",
|
||||
"used with an [exit handler](https://github.com/kubeflow/pipelines/blob/master/samples/core/exit_handler/exit_handler.py) to run at the end of a pipeline.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------|-------------|----------|-----------|-----------------|---------|\n",
|
||||
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |\n",
|
||||
"| region | The Cloud Dataproc region in which to handle the request. | No | GCPRegion | | |\n",
|
||||
"| name | The name of the cluster to delete. | No | String | | |\n",
|
||||
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"This component deletes a Dataproc cluster by using [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_delete_cluster_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/delete_cluster/component.yaml')\n",
|
||||
"help(dataproc_delete_cluster_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"#### Prerequisites\n",
|
||||
"\n",
|
||||
"[Create a Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) before running the sample code.\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Delete Cluster'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc delete cluster pipeline',\n",
|
||||
" description='Dataproc delete cluster pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_delete_cluster_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" name = CLUSTER_NAME\n",
|
||||
"):\n",
|
||||
" dataproc_delete_cluster_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" name=name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_delete_cluster_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)\n",
|
||||
"* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,205 @@
|
|||
|
||||
# Name
|
||||
Component: Data preparation using Hadoop MapReduce on YARN with Cloud Dataproc
|
||||
|
||||
# Labels
|
||||
Cloud Dataproc, Hadoop, YARN, Apache, MapReduce
|
||||
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to prepare data by submitting an Apache Hadoop MapReduce job on Apache Hadoop YARN to Cloud Dataproc.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
|
||||
Technique:
|
||||
|
||||
Input data type:
|
||||
|
||||
ML workflow:
|
||||
|
||||
# Details
|
||||
## Intended use
|
||||
Use the component to run an Apache Hadoop MapReduce job as one preprocessing step in a Kubeflow pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|----------|-------------|----------|-----------|-----------------|---------|
|
||||
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | - | - |
|
||||
| region | The Dataproc region to handle the request. | No | GCPRegion | - | - |
|
||||
| cluster_name | The name of the cluster to run the job. | No | String | - | - |
|
||||
| main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file containing the main class to execute. | No | GCSPath | - | - |
|
||||
| main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `hadoop_job.jarFileUris`. | No | String |- | - |
|
||||
| args | The arguments to pass to the driver. Do not include arguments, such as -libjars or -Dfoo=bar, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | - | None |
|
||||
| hadoop_job | The payload of a [HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob). | Yes | Dict | - | None |
|
||||
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | -| None |
|
||||
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
|
||||
|
||||
Note:
|
||||
|
||||
`main_jar_file_uri`: Example values are:
|
||||
- `gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar`
|
||||
- `hdfs:/tmp/test-samples/custom-wordcount.jar`
- `file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`
|
||||
|
||||
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created job. | String
|
||||
|
||||
## Cautions & requirements
|
||||
To use the component, you must:
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project (see the `gcloud` sketch after this list).
|
||||
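
For that last step, a minimal `gcloud` sketch (the project ID and the Kubeflow user service-account email are placeholders that depend on your deployment):


```python
# Placeholders: replace <your-project-id> and <kubeflow-user-sa-email>.
!gcloud projects add-iam-policy-binding <your-project-id> --member='serviceAccount:<kubeflow-user-sa-email>' --role='roles/dataproc.editor'
```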
|
||||
## Detailed description
|
||||
|
||||
This component creates a Hadoop job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component by using the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_submit_hadoop_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hadoop_job/component.yaml')
|
||||
help(dataproc_submit_hadoop_job_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
|
||||
|
||||
#### Set up a Dataproc cluster
|
||||
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
|
||||
|
||||
|
||||
### Prepare a Hadoop job
|
||||
Upload your Hadoop JAR file to a Cloud Storage bucket. In this sample, we use a JAR file that is preinstalled on the cluster, so you don't have to provide the `main_jar_file_uri` argument.
|
||||
|
||||
To package a self-contained Hadoop MapReduce application from the [WordCount example source code](https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordCount.java), follow the [MapReduce Tutorial](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html).
|
||||
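
If you do package and upload your own JAR, a minimal sketch of passing it to the component looks like this (placeholder paths and names; call the op inside a `@dsl.pipeline` function, as in the example pipeline below):


```python
import json
import kfp.dsl as dsl

@dsl.pipeline(name='Dataproc submit custom Hadoop JAR')
def submit_custom_jar_pipeline():
    dataproc_submit_hadoop_job_op(
        project_id='<your-project-id>',
        region='us-central1',
        cluster_name='<your-cluster-name>',
        # With main_jar_file_uri set, main_class can stay empty.
        main_jar_file_uri='gs://your-bucket/binaries/wordcount.jar',
        main_class='',
        args=json.dumps(['gs://your-bucket/input.txt',
                         'gs://your-bucket/wordcount-output/']))
```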
|
||||
#### Set sample parameters
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
OUTPUT_GCS_PATH = '<Put your output GCS path here>'
|
||||
REGION = 'us-central1'
|
||||
MAIN_CLASS = 'org.apache.hadoop.examples.WordCount'
|
||||
INPUT_GCS_PATH = 'gs://ml-pipeline-playground/shakespeare1.txt'
|
||||
EXPERIMENT_NAME = 'Dataproc - Submit Hadoop Job'
|
||||
```
|
||||
|
||||
#### Inspect the input data
|
||||
The input file is a simple text file:
|
||||
|
||||
```python
|
||||
!gsutil cat $INPUT_GCS_PATH
|
||||
```
|
||||
|
||||
#### Clean up the existing output files (optional)
|
||||
This is needed because the sample code requires the output folder to be a clean folder. To continue to run the sample, make sure that the service account of the notebook server has access to `OUTPUT_GCS_PATH`.
|
||||
|
||||
Caution: This will remove all blob files under `OUTPUT_GCS_PATH`.
|
||||
|
||||
```python
|
||||
!gsutil rm $OUTPUT_GCS_PATH/**
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc submit Hadoop job pipeline',
|
||||
description='Dataproc submit Hadoop job pipeline'
|
||||
)
|
||||
def dataproc_submit_hadoop_job_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
cluster_name = CLUSTER_NAME,
|
||||
main_jar_file_uri = '',
|
||||
main_class = MAIN_CLASS,
|
||||
args = json.dumps([
|
||||
INPUT_GCS_PATH,
|
||||
OUTPUT_GCS_PATH
|
||||
]),
|
||||
hadoop_job='',
|
||||
job='{}',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_submit_hadoop_job_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
cluster_name=cluster_name,
|
||||
main_jar_file_uri=main_jar_file_uri,
|
||||
main_class=main_class,
|
||||
args=args,
|
||||
hadoop_job=hadoop_job,
|
||||
job=job,
|
||||
wait_interval=wait_interval)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = dataproc_submit_hadoop_job_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
#### Inspect the output
|
||||
The sample in the notebook will count the words in the input text and save them in sharded files. The command to inspect the output is:
|
||||
|
||||
```python
|
||||
!gsutil cat $OUTPUT_GCS_PATH/*
|
||||
```
|
||||
|
||||
## References
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)
|
||||
* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)
|
||||
|
||||
# License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_submit_hadoop_job
|
||||
description: >-
|
||||
Submits a Cloud Dataproc job for running Apache Hadoop MapReduce jobs on
|
||||
Apache Hadoop YARN.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster
|
||||
belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: cluster_name
|
||||
description: 'Required. The cluster to run the job.'
|
||||
type: String
|
||||
- name: main_jar_file_uri
|
||||
default: ''
|
||||
description: >-
|
||||
The HCFS URI of the jar file containing the main class. Examples:
|
||||
`gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar`
|
||||
`hdfs:/tmp/test-samples/custom-wordcount.jar`
|
||||
`file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`
|
||||
type: GCSPath
|
||||
- name: main_class
|
||||
default: ''
|
||||
description: >-
|
||||
The name of the driver's main class. The jar file
|
||||
containing the class must be in the default CLASSPATH or specified
|
||||
in `jarFileUris`.
|
||||
type: String
|
||||
- name: args
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The arguments to pass to the driver. Do not include
|
||||
arguments, such as -libjars or -Dfoo=bar, that can be set as job properties,
|
||||
since a collision may occur that causes an incorrect job submission.
|
||||
type: List
|
||||
- name: hadoop_job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[hadoop job](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob).
|
||||
type: Dict
|
||||
- name: job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: >-
|
||||
Optional. The wait seconds between polling the operation.
|
||||
Defaults to 30.
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, submit_hadoop_job,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--cluster_name, {inputValue: cluster_name},
|
||||
--main_jar_file_uri, {inputValue: main_jar_file_uri},
|
||||
--main_class, {inputValue: main_class},
|
||||
--args, {inputValue: args},
|
||||
--hadoop_job, {inputValue: hadoop_job},
|
||||
--job, {inputValue: job},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,313 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"Data preparation using Hadoop MapReduce on YARN with Cloud Dataproc\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, GCP, Cloud Storage, Hadoop, YARN, Apache, MapReduce\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to prepare data by submitting an Apache Hadoop MapReduce job on Apache Hadoop YARN to Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"## Intended use\n",
|
||||
"Use the component to run an Apache Hadoop MapReduce job as one preprocessing step in a Kubeflow Pipeline. \n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------|-------------|----------|-----------|-----------------|---------|\n",
|
||||
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |\n",
|
||||
"| region | The Dataproc region to handle the request. | No | GCPRegion | | |\n",
|
||||
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
|
||||
"| main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file containing the main class to execute. | No | List | | |\n",
|
||||
"| main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `hadoop_job.jarFileUris`. | No | String | | |\n",
|
||||
"| args | The arguments to pass to the driver. Do not include arguments, such as -libjars or -Dfoo=bar, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | | None |\n",
|
||||
"| hadoop_job | The payload of a [HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob). | Yes | Dict | | None |\n",
|
||||
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
|
||||
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
|
||||
"\n",
|
||||
"Note: \n",
|
||||
"`main_jar_file_uri`: The examples for the files are : \n",
|
||||
"- `gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar` \n",
|
||||
"- `hdfs:/tmp/test-samples/custom-wordcount.jarfile:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created job. | String\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"\n",
|
||||
"This component creates a Hadoop job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_submit_hadoop_job_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hadoop_job/component.yaml')\n",
|
||||
"help(dataproc_submit_hadoop_job_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Sample\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Setup a Dataproc cluster\n",
|
||||
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Prepare a Hadoop job\n",
|
||||
"Upload your Hadoop JAR file to a Cloud Storage bucket. In the sample, we will use a JAR file that is preinstalled in the main cluster, so there is no need to provide `main_jar_file_uri`. \n",
|
||||
"\n",
|
||||
"Here is the [WordCount example source code](https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordCount.java).\n",
|
||||
"\n",
|
||||
"To package a self-contained Hadoop MapReduce application from the source code, follow the [MapReduce Tutorial](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"OUTPUT_GCS_PATH = '<Please put your output GCS path here>'\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"MAIN_CLASS = 'org.apache.hadoop.examples.WordCount'\n",
|
||||
"INTPUT_GCS_PATH = 'gs://ml-pipeline-playground/shakespeare1.txt'\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Submit Hadoop Job'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Insepct Input Data\n",
|
||||
"The input file is a simple text file:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gsutil cat $INTPUT_GCS_PATH"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Clean up the existing output files (optional)\n",
|
||||
"This is needed because the sample code requires the output folder to be a clean folder. To continue to run the sample, make sure that the service account of the notebook server has access to the `OUTPUT_GCS_PATH`.\n",
|
||||
"\n",
|
||||
"CAUTION: This will remove all blob files under `OUTPUT_GCS_PATH`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gsutil rm $OUTPUT_GCS_PATH/**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc submit Hadoop job pipeline',\n",
|
||||
" description='Dataproc submit Hadoop job pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_submit_hadoop_job_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" cluster_name = CLUSTER_NAME,\n",
|
||||
" main_jar_file_uri = '',\n",
|
||||
" main_class = MAIN_CLASS,\n",
|
||||
" args = json.dumps([\n",
|
||||
" INTPUT_GCS_PATH,\n",
|
||||
" OUTPUT_GCS_PATH\n",
|
||||
" ]), \n",
|
||||
" hadoop_job='', \n",
|
||||
" job='{}', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_submit_hadoop_job_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" cluster_name=cluster_name, \n",
|
||||
" main_jar_file_uri=main_jar_file_uri, \n",
|
||||
" main_class=main_class,\n",
|
||||
" args=args, \n",
|
||||
" hadoop_job=hadoop_job, \n",
|
||||
" job=job, \n",
|
||||
" wait_interval=wait_interval)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_submit_hadoop_job_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inspect the output\n",
|
||||
"The sample in the notebook will count the words in the input text and save them in sharded files. The command to inspect the output is:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gsutil cat $OUTPUT_GCS_PATH/*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)\n",
|
||||
"* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,189 @@
|
|||
|
||||
# Name
|
||||
Component: Data preparation using Apache Hive on YARN with Cloud Dataproc
|
||||
|
||||
# Label
|
||||
Cloud Dataproc, YARN, Apache Hive
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to prepare data by submitting an Apache Hive job on YARN to Cloud Dataproc.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
|
||||
Technique:
|
||||
|
||||
Input data type:
|
||||
|
||||
ML workflow:
|
||||
|
||||
# Details
|
||||
## Intended use
|
||||
Use the component to run an Apache Hive job as one preprocessing step in a Kubeflow pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|----------|-------------|----------|-----------|-----------------|---------|
|
||||
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |
|
||||
| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |
|
||||
| cluster_name | The name of the cluster to run the job. | No | String | | |
|
||||
| queries | The queries to execute the Hive job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None |
|
||||
| query_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the script that contains the Hive queries. | Yes | GCSPath | | None |
|
||||
| script_variables | Mapping of the query’s variable names to their values (equivalent to the Hive command: SET name="value";); see the sketch after this table. | Yes | Dict | | None |
|
||||
| hive_job | The payload of a [Hive job](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob) | Yes | Dict | | None |
|
||||
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |
|
||||
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |
|
||||
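
For example, `script_variables` is passed as a JSON-serialized mapping, like the other Dict-typed inputs (a sketch with a placeholder variable name):


```python
import json

# Equivalent to running `SET table_name=natality_csv;` before the queries.
script_variables = json.dumps({'table_name': 'natality_csv'})
```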
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created job. | String
|
||||
|
||||
## Cautions & requirements
|
||||
To use the component, you must:
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.
|
||||
|
||||
## Detailed description
|
||||
This component creates a Hive job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component by using the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_submit_hive_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hive_job/component.yaml')
|
||||
help(dataproc_submit_hive_job_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
|
||||
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
|
||||
|
||||
|
||||
#### Set up a Dataproc cluster
|
||||
|
||||
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
|
||||
|
||||
#### Prepare a Hive query
|
||||
|
||||
You can put your Hive queries in the `queries` list, or you can point `query_file_uri` at a query script in Cloud Storage (see the sketch below). In this sample, we use a hard-coded query in the `queries` list to select data from a public CSV file in Cloud Storage.
|
||||
|
||||
For more details, see the [Hive language manual](https://cwiki.apache.org/confluence/display/Hive/LanguageManual).
|
||||
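
A minimal sketch of the `query_file_uri` alternative (placeholder paths and names; call the op inside a `@dsl.pipeline` function, as in the example pipeline below):


```python
import kfp.dsl as dsl

@dsl.pipeline(name='Dataproc submit Hive script')
def submit_hive_script_pipeline():
    dataproc_submit_hive_job_op(
        project_id='<your-project-id>',
        region='us-central1',
        cluster_name='<your-cluster-name>',
        # Leave queries empty and point at a script uploaded to Cloud Storage.
        queries='',
        query_file_uri='gs://your-bucket/queries/natality.hql')
```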
|
||||
#### Set sample parameters
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
REGION = 'us-central1'
|
||||
QUERY = '''
|
||||
DROP TABLE IF EXISTS natality_csv;
|
||||
CREATE EXTERNAL TABLE natality_csv (
|
||||
source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,
|
||||
state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,
|
||||
plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,
|
||||
mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,
|
||||
gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,
|
||||
mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,
|
||||
alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,
|
||||
born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,
|
||||
ever_born BIGINT, father_race BIGINT, father_age BIGINT,
|
||||
record_weight BIGINT
|
||||
)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
|
||||
LOCATION 'gs://public-datasets/natality/csv';
|
||||
|
||||
SELECT * FROM natality_csv LIMIT 10;'''
|
||||
EXPERIMENT_NAME = 'Dataproc - Submit Hive Job'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc submit Hive job pipeline',
|
||||
description='Dataproc submit Hive job pipeline'
|
||||
)
|
||||
def dataproc_submit_hive_job_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
cluster_name = CLUSTER_NAME,
|
||||
queries = json.dumps([QUERY]),
|
||||
query_file_uri = '',
|
||||
script_variables = '',
|
||||
hive_job='',
|
||||
job='',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_submit_hive_job_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
cluster_name=cluster_name,
|
||||
queries=queries,
|
||||
query_file_uri=query_file_uri,
|
||||
script_variables=script_variables,
|
||||
hive_job=hive_job,
|
||||
job=job,
|
||||
wait_interval=wait_interval)
|
||||
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = dataproc_submit_hive_job_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)
|
||||
* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_submit_hive_job
|
||||
description: >-
|
||||
Submits a Cloud Dataproc job for running Apache Hive queries on YARN.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster
|
||||
belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: cluster_name
|
||||
description: 'Required. The cluster to run the job.'
|
||||
type: String
|
||||
- name: queries
|
||||
default: ''
|
||||
description: >-
|
||||
Required. The queries to execute. You do not need to
|
||||
terminate a query with a semicolon. Multiple queries can be specified
|
||||
in one string by separating each with a semicolon.
|
||||
type: List
|
||||
- name: query_file_uri
|
||||
default: ''
|
||||
description: >-
|
||||
The HCFS URI of the script that contains Hive queries.
|
||||
type: GCSPath
|
||||
- name: script_variables
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. Mapping of query variable names to
|
||||
values (equivalent to the Hive command: SET name="value";).
|
||||
type: Dict
|
||||
- name: hive_job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob).
|
||||
type: Dict
|
||||
- name: job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: >-
|
||||
Optional. The number of seconds to wait between polls of the operation.
|
||||
Defaults to 30.
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, submit_hive_job,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--cluster_name, {inputValue: cluster_name},
|
||||
--queries, {inputValue: queries},
|
||||
--query_file_uri, {inputValue: query_file_uri},
|
||||
--script_variables, {inputValue: script_variables},
|
||||
--hive_job, {inputValue: hive_job},
|
||||
--job, {inputValue: job},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,264 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"Data preparation using Apache Hive on YARN with Cloud Dataproc\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, GCP, Cloud Storage, YARN, Hive, Apache\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to prepare data by submitting an Apache Hive job on YARN to Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"## Intended use\n",
|
||||
"Use the component to run an Apache Hive job as one preprocessing step in a Kubeflow Pipeline.\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------|-------------|----------|-----------|-----------------|---------|\n",
|
||||
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectId | | |\n",
|
||||
"| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |\n",
|
||||
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
|
||||
"| queries | The queries to execute the Hive job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None |\n",
|
||||
"| query_file_uri | The HCFS URI of the script that contains the Hive queries. | Yes | GCSPath | | None |\n",
|
||||
"| script_variables | Mapping of the query’s variable names to their values (equivalent to the Hive command: SET name=\"value\";). | Yes | Dict | | None |\n",
|
||||
"| hive_job | The payload of a [HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob) | Yes | Dict | | None |\n",
|
||||
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
|
||||
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created job. | String\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"This component creates a Hive job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_submit_hive_job_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hive_job/component.yaml')\n",
|
||||
"help(dataproc_submit_hive_job_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Setup a Dataproc cluster\n",
|
||||
"\n",
|
||||
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
|
||||
"\n",
|
||||
"#### Prepare a Hive query\n",
|
||||
"\n",
|
||||
"Put your Hive queries in the queries list, or upload your Hive queries into a file saved in a Cloud Storage bucket and then enter the Cloud Storage bucket’s path in `query_file_uri.` In this sample, we will use a hard coded query in the queries list to select data from a public CSV file from Cloud Storage.\n",
|
||||
"\n",
|
||||
"For more details, see the [Hive language manual.](https://cwiki.apache.org/confluence/display/Hive/LanguageManual)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"QUERY = '''\n",
|
||||
"DROP TABLE IF EXISTS natality_csv;\n",
|
||||
"CREATE EXTERNAL TABLE natality_csv (\n",
|
||||
" source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,\n",
|
||||
" state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,\n",
|
||||
" plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,\n",
|
||||
" mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,\n",
|
||||
" gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,\n",
|
||||
" mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,\n",
|
||||
" alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,\n",
|
||||
" born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,\n",
|
||||
" ever_born BIGINT, father_race BIGINT, father_age BIGINT,\n",
|
||||
" record_weight BIGINT\n",
|
||||
")\n",
|
||||
"ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n",
|
||||
"LOCATION 'gs://public-datasets/natality/csv';\n",
|
||||
"\n",
|
||||
"SELECT * FROM natality_csv LIMIT 10;'''\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Submit Hive Job'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc submit Hive job pipeline',\n",
|
||||
" description='Dataproc submit Hive job pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_submit_hive_job_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" cluster_name = CLUSTER_NAME,\n",
|
||||
" queries = json.dumps([QUERY]),\n",
|
||||
" query_file_uri = '',\n",
|
||||
" script_variables = '', \n",
|
||||
" hive_job='', \n",
|
||||
" job='', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_submit_hive_job_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" cluster_name=cluster_name, \n",
|
||||
" queries=queries, \n",
|
||||
" query_file_uri=query_file_uri,\n",
|
||||
" script_variables=script_variables, \n",
|
||||
" hive_job=hive_job, \n",
|
||||
" job=job, \n",
|
||||
" wait_interval=wait_interval)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_submit_hive_job_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)\n",
|
||||
"* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,185 @@
|
|||
|
||||
# Name
|
||||
Component: Data preparation using Apache Pig on YARN with Cloud Dataproc
|
||||
|
||||
# Labels
|
||||
Cloud Dataproc, YARN, Apache Pig, Kubeflow
|
||||
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to prepare data by submitting an Apache Pig job on YARN to Cloud Dataproc.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
Other
|
||||
|
||||
Technique:
|
||||
Other
|
||||
|
||||
Input data type:
|
||||
Tabular
|
||||
|
||||
ML workflow:
|
||||
Data preparation
|
||||
|
||||
# Details
|
||||
## Intended use
|
||||
Use this component to run an Apache Pig job as one preprocessing step in a Kubeflow pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|:----------|:-------------|:----------|:-----------|:-----------------|:---------|
|
||||
| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID |- | -|
|
||||
| region | The Cloud Dataproc region that handles the request. | No | GCPRegion | - |- |
|
||||
| cluster_name | The name of the cluster that runs the job. | No | String | - | - |
|
||||
| queries | The queries to execute in the Pig job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | - | None |
|
||||
| query_file_uri | The Cloud Storage bucket path pointing to a file that contains the Pig queries. | Yes | GCSPath | - | None |
|
||||
| script_variables | Mapping of the query’s variable names to their values (equivalent to the Pig command: SET name="value";). | Yes | Dict | -| None |
|
||||
| pig_job | The payload of a [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob). | Yes | Dict | - | None |
|
||||
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | - | None |
|
||||
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
|
||||
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created job. | String
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role, `roles/dataproc.editor`, on the project.
|
||||
|
||||
## Detailed description
|
||||
This component creates a Pig job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_submit_pig_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pig_job/component.yaml')
|
||||
help(dataproc_submit_pig_job_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
|
||||
The following sample code works in an IPython notebook or directly in Python code. It shows how to run the component as part of a pipeline.
|
||||
|
||||
#### Set up a Dataproc cluster
|
||||
|
||||
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
|
||||
|
||||
#### Prepare a Pig query
|
||||
|
||||
You can put your Pig queries in the `queries` list, or you can point `query_file_uri` at a script stored in Cloud Storage. In this sample, we use a hard-coded query in the `queries` list to load data from a public CSV file in Cloud Storage and dump the first ten rows.
|
||||
|
||||
For more details on Apache Pig, see the [Pig documentation.](http://pig.apache.org/docs/latest/)
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
|
||||
REGION = 'us-central1'
|
||||
QUERY = '''
|
||||
natality_csv = load 'gs://public-datasets/natality/csv' using PigStorage(':');
|
||||
top_natality_csv = LIMIT natality_csv 10;
|
||||
dump top_natality_csv;'''
|
||||
EXPERIMENT_NAME = 'Dataproc - Submit Pig Job'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc submit Pig job pipeline',
|
||||
description='Dataproc submit Pig job pipeline'
|
||||
)
|
||||
def dataproc_submit_pig_job_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
cluster_name = CLUSTER_NAME,
|
||||
queries = json.dumps([QUERY]),
|
||||
query_file_uri = '',
|
||||
script_variables = '',
|
||||
pig_job='',
|
||||
job='',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_submit_pig_job_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
cluster_name=cluster_name,
|
||||
queries=queries,
|
||||
query_file_uri=query_file_uri,
|
||||
script_variables=script_variables,
|
||||
pig_job=pig_job,
|
||||
job=job,
|
||||
wait_interval=wait_interval)
|
||||
|
||||
```
|
||||
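The `script_variables` argument lets you parameterize a query instead of hard-coding every value. The following is a hypothetical variation of the pipeline above that passes the row limit as a variable; the `row_limit` name is an illustrative choice, and the dictionary is serialized with `json.dumps` in the same way as the `queries` list.


```python
import json
import kfp.dsl as dsl

# Hypothetical sketch: the row limit is supplied through script_variables and
# substituted for $row_limit in the query instead of being hard-coded.
PARAM_QUERY = '''
natality_csv = load 'gs://public-datasets/natality/csv' using PigStorage(':');
top_natality_csv = LIMIT natality_csv $row_limit;
dump top_natality_csv;'''

@dsl.pipeline(
    name='Dataproc submit parameterized Pig job pipeline',
    description='Variation of the example above that sets a script variable'
)
def dataproc_submit_parameterized_pig_job_pipeline(
    project_id = PROJECT_ID,
    region = REGION,
    cluster_name = CLUSTER_NAME,
    queries = json.dumps([PARAM_QUERY]),
    script_variables = json.dumps({'row_limit': '10'}),
    wait_interval = '30'
):
    dataproc_submit_pig_job_op(
        project_id=project_id,
        region=region,
        cluster_name=cluster_name,
        queries=queries,
        script_variables=script_variables,
        wait_interval=wait_interval)
```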
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = dataproc_submit_pig_job_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster)
|
||||
* [Pig documentation](http://pig.apache.org/docs/latest/)
|
||||
* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)
|
||||
* [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_submit_pig_job
|
||||
description: >-
|
||||
Submits a Cloud Dataproc job for running Apache Pig queries on YARN.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster
|
||||
belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: cluster_name
|
||||
description: 'Required. The cluster to run the job.'
|
||||
type: String
|
||||
- name: queries
|
||||
default: ''
|
||||
description: >-
|
||||
Required. The queries to execute. You do not need to
|
||||
terminate a query with a semicolon. Multiple queries can be specified
|
||||
in one string by separating each with a semicolon.
|
||||
type: List
|
||||
- name: query_file_uri
|
||||
default: ''
|
||||
description: >-
|
||||
The HCFS URI of the script that contains Pig queries.
|
||||
type: GCSPath
|
||||
- name: script_variables
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. Mapping of query variable names to
|
||||
values (equivalent to the Pig command: SET name="value";).
|
||||
type: Dict
|
||||
- name: pig_job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob).
|
||||
type: Dict
|
||||
- name: job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: >-
|
||||
Optional. The number of seconds to wait between polls of the operation.
|
||||
Defaults to 30.
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, submit_pig_job,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--cluster_name, {inputValue: cluster_name},
|
||||
--queries, {inputValue: queries},
|
||||
--query_file_uri, {inputValue: query_file_uri},
|
||||
--script_variables, {inputValue: script_variables},
|
||||
--pig_job, {inputValue: pig_job},
|
||||
--job, {inputValue: job},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"Data preparation using Apache Pig on YARN with Cloud Dataproc\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, GCP, Cloud Storage, YARN, Pig, Apache, Kubeflow, pipelines, components\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to prepare data by submitting an Apache Pig job on YARN to Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"## Intended use\n",
|
||||
"Use the component to run an Apache Pig job as one preprocessing step in a Kubeflow Pipeline.\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------|-------------|----------|-----------|-----------------|---------|\n",
|
||||
"| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID | | |\n",
|
||||
"| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |\n",
|
||||
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
|
||||
"| queries | The queries to execute the Pig job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None |\n",
|
||||
"| query_file_uri | The HCFS URI of the script that contains the Pig queries. | Yes | GCSPath | | None |\n",
|
||||
"| script_variables | Mapping of the query’s variable names to their values (equivalent to the Pig command: SET name=\"value\";). | Yes | Dict | | None |\n",
|
||||
"| pig_job | The payload of a [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob). | Yes | Dict | | None |\n",
|
||||
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
|
||||
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created job. | String\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"This component creates a Pig job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_submit_pig_job_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pig_job/component.yaml')\n",
|
||||
"help(dataproc_submit_pig_job_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Setup a Dataproc cluster\n",
|
||||
"\n",
|
||||
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Prepare a Pig query\n",
|
||||
"\n",
|
||||
"Either put your Pig queries in the `queries` list, or upload your Pig queries into a file to a Cloud Storage bucket and then enter the Cloud Storage bucket’s path in `query_file_uri`. In this sample, we will use a hard coded query in the `queries` list to select data from a local `passwd` file.\n",
|
||||
"\n",
|
||||
"For more details on Apache Pig, see the [Pig documentation.](http://pig.apache.org/docs/latest/)\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"QUERY = '''\n",
|
||||
"natality_csv = load 'gs://public-datasets/natality/csv' using PigStorage(':');\n",
|
||||
"top_natality_csv = LIMIT natality_csv 10; \n",
|
||||
"dump natality_csv;'''\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Submit Pig Job'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc submit Pig job pipeline',\n",
|
||||
" description='Dataproc submit Pig job pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_submit_pig_job_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" cluster_name = CLUSTER_NAME,\n",
|
||||
" queries = json.dumps([QUERY]),\n",
|
||||
" query_file_uri = '',\n",
|
||||
" script_variables = '', \n",
|
||||
" pig_job='', \n",
|
||||
" job='', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_submit_pig_job_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" cluster_name=cluster_name, \n",
|
||||
" queries=queries, \n",
|
||||
" query_file_uri=query_file_uri,\n",
|
||||
" script_variables=script_variables, \n",
|
||||
" pig_job=pig_job, \n",
|
||||
" job=job, \n",
|
||||
" wait_interval=wait_interval)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_submit_pig_job_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) \n",
|
||||
"* [Pig documentation](http://pig.apache.org/docs/latest/)\n",
|
||||
"* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)\n",
|
||||
"* [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
|
||||
# Name
|
||||
Component: Data preparation using PySpark on Cloud Dataproc
|
||||
|
||||
|
||||
# Labels
|
||||
Cloud Dataproc, PySpark, Kubeflow
|
||||
|
||||
|
||||
# Summary
|
||||
A Kubeflow Pipeline component to prepare data by submitting a PySpark job to Cloud Dataproc.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
|
||||
Technique:
|
||||
|
||||
Input data type:
|
||||
|
||||
ML workflow:
|
||||
|
||||
# Details
|
||||
## Intended use
|
||||
Use this component to run a PySpark job as one preprocessing step in a Kubeflow pipeline.
|
||||
|
||||
|
||||
## Runtime arguments
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|:----------------------|:------------|:----------|:--------------|:-----------------|:---------|
|
||||
| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID | - | - |
|
||||
| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | - | - |
|
||||
| cluster_name | The name of the cluster to run the job. | No | String | - | - |
|
||||
| main_python_file_uri | The HCFS URI of the Python file to use as the driver. This must be a .py file. | No | GCSPath | - | - |
|
||||
| args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | - | None |
|
||||
| pyspark_job | The payload of a [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob). | Yes | Dict | - | None |
|
||||
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | - | None |
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
|
||||
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created job. | String
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.
|
||||
|
||||
## Detailed description
|
||||
|
||||
This component creates a PySpark job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_submit_pyspark_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pyspark_job/component.yaml')
|
||||
help(dataproc_submit_pyspark_job_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
|
||||
The following sample code works in an IPython notebook or directly in Python code. It shows how to run the component as part of a pipeline.
|
||||
|
||||
|
||||
#### Set up a Dataproc cluster
|
||||
|
||||
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
|
||||
|
||||
|
||||
#### Prepare a PySpark job
|
||||
|
||||
Upload your PySpark code file to a Cloud Storage bucket. For example, this is a publicly accessible `hello-world.py` in Cloud Storage:
|
||||
|
||||
```python
|
||||
!gsutil cat gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py
|
||||
```
|
||||
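If you only want a rough idea of what such a driver contains, a minimal PySpark script of this kind looks roughly like the sketch below (an illustrative approximation, not necessarily the exact contents of the public file):


```python
# hello-world.py (illustrative sketch of a minimal PySpark driver)
import pyspark

# Create a SparkContext, build a tiny RDD, and print its sorted contents.
sc = pyspark.SparkContext()
rdd = sc.parallelize(['Hello,', 'world!'])
words = sorted(rdd.collect())
print(words)
```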
|
||||
#### Set sample parameters
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
REGION = 'us-central1'
|
||||
PYSPARK_FILE_URI = 'gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py'
|
||||
ARGS = ''
|
||||
EXPERIMENT_NAME = 'Dataproc - Submit PySpark Job'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc submit PySpark job pipeline',
|
||||
description='Dataproc submit PySpark job pipeline'
|
||||
)
|
||||
def dataproc_submit_pyspark_job_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
cluster_name = CLUSTER_NAME,
|
||||
main_python_file_uri = PYSPARK_FILE_URI,
|
||||
args = ARGS,
|
||||
pyspark_job='{}',
|
||||
job='{}',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_submit_pyspark_job_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
cluster_name=cluster_name,
|
||||
main_python_file_uri=main_python_file_uri,
|
||||
args=args,
|
||||
pyspark_job=pyspark_job,
|
||||
job=job,
|
||||
wait_interval=wait_interval)
|
||||
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = dataproc_submit_pyspark_job_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster)
|
||||
* [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob)
|
||||
* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,88 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_submit_pyspark_job
|
||||
description: >-
|
||||
Submits a Cloud Dataproc job for running Apache PySpark applications on YARN.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster
|
||||
belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: cluster_name
|
||||
description: 'Required. The cluster to run the job.'
|
||||
type: String
|
||||
- name: main_python_file_uri
|
||||
description: >-
|
||||
Required. The HCFS URI of the main Python file to
|
||||
use as the driver. Must be a .py file.
|
||||
type: GCSPath
|
||||
- name: args
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The arguments to pass to the driver. Do not include
|
||||
arguments, such as --conf, that can be set as job properties, since a
|
||||
collision may occur that causes an incorrect job submission.
|
||||
type: List
|
||||
- name: pyspark_job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob).
|
||||
type: Dict
|
||||
- name: job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: >-
|
||||
Optional. The number of seconds to wait between polls of the operation.
|
||||
Defaults to 30.
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, submit_pyspark_job,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--cluster_name, {inputValue: cluster_name},
|
||||
--main_python_file_uri, {inputValue: main_python_file_uri},
|
||||
--args, {inputValue: args},
|
||||
--pyspark_job, {inputValue: pyspark_job},
|
||||
--job, {inputValue: job},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,263 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"Data preparation using PySpark on Cloud Dataproc\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, GCP, Cloud Storage,PySpark, Kubeflow, pipelines, components\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to prepare data by submitting a PySpark job to Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"## Intended use\n",
|
||||
"Use the component to run an Apache PySpark job as one preprocessing step in a Kubeflow Pipeline.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------------------|------------|----------|--------------|-----------------|---------|\n",
|
||||
"| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID | | |\n",
|
||||
"| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |\n",
|
||||
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
|
||||
"| main_python_file_uri | The HCFS URI of the Python file to use as the driver. This must be a .py file. | No | GCSPath | | |\n",
|
||||
"| args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | | None |\n",
|
||||
"| pyspark_job | The payload of a [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob). | Yes | Dict | | None |\n",
|
||||
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created job. | String\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"\n",
|
||||
"This component creates a PySpark job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_submit_pyspark_job_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pyspark_job/component.yaml')\n",
|
||||
"help(dataproc_submit_pyspark_job_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Setup a Dataproc cluster\n",
|
||||
"\n",
|
||||
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Prepare a PySpark job\n",
|
||||
"\n",
|
||||
"Upload your PySpark code file to a Cloud Storage bucket. For example, this is a publicly accessible `hello-world.py` in Cloud Storage:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gsutil cat gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"PYSPARK_FILE_URI = 'gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py'\n",
|
||||
"ARGS = ''\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Submit PySpark Job'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc submit PySpark job pipeline',\n",
|
||||
" description='Dataproc submit PySpark job pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_submit_pyspark_job_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" cluster_name = CLUSTER_NAME,\n",
|
||||
" main_python_file_uri = PYSPARK_FILE_URI, \n",
|
||||
" args = ARGS, \n",
|
||||
" pyspark_job='{}', \n",
|
||||
" job='{}', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_submit_pyspark_job_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" cluster_name=cluster_name, \n",
|
||||
" main_python_file_uri=main_python_file_uri, \n",
|
||||
" args=args, \n",
|
||||
" pyspark_job=pyspark_job, \n",
|
||||
" job=job, \n",
|
||||
" wait_interval=wait_interval)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_submit_pyspark_job_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"\n",
|
||||
"* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) \n",
|
||||
"* [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob)\n",
|
||||
"* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,200 @@
|
|||
|
||||
# Name
|
||||
|
||||
Component: Data preparation using Spark on YARN with Cloud Dataproc
|
||||
|
||||
|
||||
# Labels
|
||||
|
||||
Spark, Kubeflow, YARN
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
Other
|
||||
|
||||
Technique:
|
||||
Other
|
||||
|
||||
Input data type:
|
||||
Tabular
|
||||
|
||||
ML workflow:
|
||||
Data preparation
|
||||
|
||||
|
||||
|
||||
# Summary
|
||||
|
||||
A Kubeflow pipeline component to prepare data by submitting a Spark job on YARN to Cloud Dataproc.
|
||||
|
||||
# Details
|
||||
|
||||
## Intended use
|
||||
|
||||
Use the component to run an Apache Spark job as one preprocessing step in a Kubeflow pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
:--- | :---------- | :--- | :------- | :------| :------|
|
||||
project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to.|No | GCPProjectID | | |
|
||||
region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |
|
||||
cluster_name | The name of the cluster to run the job. | No | String | | |
|
||||
main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file that contains the main class. | No | GCSPath | | |
|
||||
main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `spark_job.jarFileUris`.| No | | | |
|
||||
args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission.| Yes | | | |
|
||||
spark_job | The payload of a [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).| Yes | | | |
|
||||
job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | | | |
|
||||
wait_interval | The number of seconds to wait between polling the operation. | Yes | | | 30 |
|
||||
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created job. | String
|
||||
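
The `job_id` output can be consumed by downstream steps in the same pipeline. Below is a minimal sketch, assuming the component has already been loaded as `dataproc_submit_spark_job_op` (see the steps under Detailed description); the project ID, cluster name, and the downstream `notify_op` component are placeholders:

```python
import json
import kfp.dsl as dsl

@dsl.pipeline(name='Spark job with a downstream step')
def spark_job_then_use_id(
    project_id='<Put your project ID here>',
    region='us-central1',
    cluster_name='<Put your existing cluster name here>',
):
    submit_task = dataproc_submit_spark_job_op(
        project_id=project_id,
        region=region,
        cluster_name=cluster_name,
        main_jar_file_uri='',
        main_class='org.apache.spark.examples.SparkPi',
        args=json.dumps(['1000']),
        spark_job=json.dumps({'jarFileUris': ['file:///usr/lib/spark/examples/jars/spark-examples.jar']}),
        job='{}',
        wait_interval='30')
    # The Dataproc job ID is exposed as submit_task.outputs['job_id'] and can be
    # passed to any downstream component, for example a hypothetical notifier:
    # notify_op(job_id=submit_task.outputs['job_id'])
```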
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
|
||||
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project, for example with the `gcloud` command shown below.
|
||||
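
A minimal sketch of that role grant from a notebook cell, assuming the `gcloud` CLI is available and you know the Kubeflow user service account's email (both values below are placeholders):

```python
# Both values are placeholders; requires permission to set IAM policy on the project.
PROJECT_ID = '<Put your project ID here>'
KF_USER_SA = '<Put your Kubeflow user service account email here>'

!gcloud projects add-iam-policy-binding {PROJECT_ID} --member=serviceAccount:{KF_USER_SA} --role=roles/dataproc.editor
```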
|
||||
|
||||
## Detailed description
|
||||
|
||||
This component creates a Spark job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
|
||||
|
||||
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the Kubeflow Pipelines SDK:
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_submit_spark_job_op = comp.load_component_from_url(
|
||||
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_spark_job/component.yaml')
|
||||
help(dataproc_submit_spark_job_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
Note: The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
|
||||
#### Set up a Dataproc cluster
|
||||
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
|
||||
|
||||
|
||||
#### Prepare a Spark job
|
||||
Upload your Spark JAR file to a Cloud Storage bucket. In the sample, we use a JAR file that is preinstalled in the main cluster: `file:///usr/lib/spark/examples/jars/spark-examples.jar`.
|
||||
|
||||
Here is the [source code of the sample](https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java).
|
||||
|
||||
To package a self-contained Spark application, follow these [instructions](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications).
|
||||
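
If you package your own application JAR instead of using the preinstalled example, a minimal sketch of staging it in Cloud Storage looks like this (the local path and bucket name are placeholders):

```python
# Both paths are placeholders.
LOCAL_JAR = 'target/my-spark-app.jar'
GCS_JAR_URI = 'gs://<Put your bucket name here>/jars/my-spark-app.jar'

# Stage the JAR in Cloud Storage so that the Dataproc cluster can read it.
!gsutil cp {LOCAL_JAR} {GCS_JAR_URI}

# The staged URI can then replace SPARK_FILE_URI below (via spark_job.jarFileUris)
# or be passed directly as main_jar_file_uri.
```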
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
REGION = 'us-central1'
|
||||
SPARK_FILE_URI = 'file:///usr/lib/spark/examples/jars/spark-examples.jar'
|
||||
MAIN_CLASS = 'org.apache.spark.examples.SparkPi'
|
||||
ARGS = ['1000']
|
||||
EXPERIMENT_NAME = 'Dataproc - Submit Spark Job'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc submit Spark job pipeline',
|
||||
description='Dataproc submit Spark job pipeline'
|
||||
)
|
||||
def dataproc_submit_spark_job_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
cluster_name = CLUSTER_NAME,
|
||||
main_jar_file_uri = '',
|
||||
main_class = MAIN_CLASS,
|
||||
args = json.dumps(ARGS),
|
||||
spark_job=json.dumps({ 'jarFileUris': [ SPARK_FILE_URI ] }),
|
||||
job='{}',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_submit_spark_job_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
cluster_name=cluster_name,
|
||||
main_jar_file_uri=main_jar_file_uri,
|
||||
main_class=main_class,
|
||||
args=args,
|
||||
spark_job=spark_job,
|
||||
job=job,
|
||||
wait_interval=wait_interval)
|
||||
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
#Compile the pipeline
|
||||
pipeline_func = dataproc_submit_spark_job_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)
|
||||
* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,96 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_submit_spark_job
|
||||
description: >-
|
||||
Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster
|
||||
belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: cluster_name
|
||||
description: 'Required. The cluster to run the job.'
|
||||
type: String
|
||||
- name: main_jar_file_uri
|
||||
default: ''
|
||||
description: >-
|
||||
The HCFS URI of the jar file that contains the main class.
|
||||
type: GCSPath
|
||||
- name: main_class
|
||||
default: ''
|
||||
description: >-
|
||||
The name of the driver's main class. The jar file that
|
||||
contains the class must be in the default CLASSPATH or specified in
|
||||
jarFileUris.
|
||||
type: String
|
||||
- name: args
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The arguments to pass to the driver. Do not include
|
||||
arguments, such as --conf, that can be set as job properties, since a
|
||||
collision may occur that causes an incorrect job submission.
|
||||
type: List
|
||||
- name: spark_job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
|
||||
type: Dict
|
||||
- name: job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: >-
|
||||
Optional. The wait seconds between polling the operation.
|
||||
Defaults to 30.
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, submit_spark_job,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--cluster_name, {inputValue: cluster_name},
|
||||
--main_jar_file_uri, {inputValue: main_jar_file_uri},
|
||||
--main_class, {inputValue: main_class},
|
||||
--args, {inputValue: args},
|
||||
--spark_job, {inputValue: spark_job},
|
||||
--job, {inputValue: job},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,266 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"\n",
|
||||
"Data preparation using Spark on YARN with Cloud Dataproc\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"\n",
|
||||
"Cloud Dataproc, GCP, Cloud Storage, Spark, Kubeflow, pipelines, components, YARN\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"\n",
|
||||
"A Kubeflow Pipeline component to prepare data by submitting a Spark job on YARN to Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"\n",
|
||||
"## Intended use\n",
|
||||
"\n",
|
||||
"Use the component to run an Apache Spark job as one preprocessing step in a Kubeflow Pipeline.\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
":--- | :---------- | :--- | :------- | :------| :------| \n",
|
||||
"project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to.|No | GCPProjectID | | |\n",
|
||||
"region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | | \n",
|
||||
"cluster_name | The name of the cluster to run the job. | No | String | | |\n",
|
||||
"main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file that contains the main class. | No | GCSPath | | |\n",
|
||||
"main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `spark_job.jarFileUris`.| No | | | | \n",
|
||||
"args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission.| Yes | | | |\n",
|
||||
"spark_job | The payload of a [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).| Yes | | | |\n",
|
||||
"job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | | | |\n",
|
||||
"wait_interval | The number of seconds to wait between polling the operation. | Yes | | | 30 |\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created job. | String\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, you must:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"\n",
|
||||
"This component creates a Spark job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_submit_spark_job_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_spark_job/component.yaml')\n",
|
||||
"help(dataproc_submit_spark_job_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Set up a Dataproc cluster\n",
|
||||
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Prepare a Spark job\n",
|
||||
"Upload your Spark JAR file to a Cloud Storage bucket. In the sample, we use a JAR file that is preinstalled in the main cluster: `file:///usr/lib/spark/examples/jars/spark-examples.jar`.\n",
|
||||
"\n",
|
||||
"Here is the [source code of the sample](https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java).\n",
|
||||
"\n",
|
||||
"To package a self-contained Spark application, follow these [instructions](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"SPARK_FILE_URI = 'file:///usr/lib/spark/examples/jars/spark-examples.jar'\n",
|
||||
"MAIN_CLASS = 'org.apache.spark.examples.SparkPi'\n",
|
||||
"ARGS = ['1000']\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Submit Spark Job'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc submit Spark job pipeline',\n",
|
||||
" description='Dataproc submit Spark job pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_submit_spark_job_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" cluster_name = CLUSTER_NAME,\n",
|
||||
" main_jar_file_uri = '',\n",
|
||||
" main_class = MAIN_CLASS,\n",
|
||||
" args = json.dumps(ARGS), \n",
|
||||
" spark_job=json.dumps({ 'jarFileUris': [ SPARK_FILE_URI ] }), \n",
|
||||
" job='{}', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_submit_spark_job_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" cluster_name=cluster_name, \n",
|
||||
" main_jar_file_uri=main_jar_file_uri, \n",
|
||||
" main_class=main_class,\n",
|
||||
" args=args, \n",
|
||||
" spark_job=spark_job, \n",
|
||||
" job=job, \n",
|
||||
" wait_interval=wait_interval)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_submit_spark_job_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)\n",
|
||||
"* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,184 @@
|
|||
|
||||
# Name
|
||||
Component: Data preparation using SparkSQL on YARN with Cloud Dataproc
|
||||
|
||||
# Label
|
||||
Cloud Dataproc, YARN, SparkSQL, Kubeflow
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to prepare data by submitting a SparkSql job on YARN to Cloud Dataproc.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
|
||||
Technique:
|
||||
|
||||
Input data type:
|
||||
|
||||
ML workflow:
|
||||
|
||||
# Details
|
||||
|
||||
## Intended use
|
||||
Use the component to run an Apache SparkSql job as one preprocessing step in a Kubeflow pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
Argument| Description | Optional | Data type| Accepted values| Default |
|
||||
:--- | :---------- | :--- | :------- | :------ | :------
|
||||
project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No| GCPProjectID | - | -|
|
||||
region | The Cloud Dataproc region to handle the request. | No | GCPRegion | - | - |
|
||||
cluster_name | The name of the cluster to run the job. | No | String| -| -|
|
||||
queries | The queries to execute in the SparkSQL job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | - | None |
|
||||
query_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the script that contains the SparkSQL queries. The SparkSQL queries are listed in a CSV file that is stored in a Cloud Storage bucket.| Yes | GCSPath | - | None |
|
||||
script_variables | Mapping of the query’s variable names to their values (equivalent to the SparkSQL command: SET name="value";).| Yes| Dict |- | None |
|
||||
sparksql_job | The payload of a [SparkSql job](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob). | Yes | Dict | - | None |
|
||||
job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | - | None |
|
||||
wait_interval | The number of seconds to pause between polling the operation. | Yes |Integer | - | 30 |
|
||||
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created job. | String
|
||||
|
||||
## Cautions & requirements
|
||||
To use the component, you must:
|
||||
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
|
||||
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
|
||||
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.
|
||||
|
||||
## Detailed Description
|
||||
This component creates a SparkSql job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
1. Install the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the Kubeflow Pipelines SDK:
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
dataproc_submit_sparksql_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_sparksql_job/component.yaml')
|
||||
help(dataproc_submit_sparksql_job_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
|
||||
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to use the component.
|
||||
|
||||
#### Set up a Dataproc cluster
|
||||
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
|
||||
|
||||
#### Prepare a SparkSQL job
|
||||
You can put your SparkSQL queries in the `queries` list, or you can use `query_file_uri`. In this sample, we will use a hard coded query in the `queries` list to select data from a public CSV file in Cloud Storage.
|
||||
|
||||
For more details about Spark SQL, see [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html).
|
||||
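
As an alternative to the inline `queries` list used later in this sample, a minimal sketch of the `query_file_uri` approach is shown below (the file name and bucket path are placeholders):

```python
# The bucket path is a placeholder.
QUERY_FILE_URI = 'gs://<Put your bucket name here>/sparksql/sample_queries.sql'

# Write the SparkSQL queries to a local file, then stage it in Cloud Storage.
with open('sample_queries.sql', 'w') as f:
    f.write('SELECT 1;')  # Replace with your own queries, e.g. the QUERY string below.

!gsutil cp sample_queries.sql {QUERY_FILE_URI}

# In the example pipeline below, pass queries='' and query_file_uri=QUERY_FILE_URI instead.
```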
|
||||
#### Set sample parameters
|
||||
|
||||
```python
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
CLUSTER_NAME = '<Put your existing cluster name here>'
|
||||
REGION = 'us-central1'
|
||||
QUERY = '''
|
||||
DROP TABLE IF EXISTS natality_csv;
|
||||
CREATE EXTERNAL TABLE natality_csv (
|
||||
source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,
|
||||
state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,
|
||||
plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,
|
||||
mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,
|
||||
gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,
|
||||
mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,
|
||||
alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,
|
||||
born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,
|
||||
ever_born BIGINT, father_race BIGINT, father_age BIGINT,
|
||||
record_weight BIGINT
|
||||
)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
|
||||
LOCATION 'gs://public-datasets/natality/csv';
|
||||
|
||||
SELECT * FROM natality_csv LIMIT 10;'''
|
||||
EXPERIMENT_NAME = 'Dataproc - Submit SparkSQL Job'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Dataproc submit SparkSQL job pipeline',
|
||||
description='Dataproc submit SparkSQL job pipeline'
|
||||
)
|
||||
def dataproc_submit_sparksql_job_pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
region = REGION,
|
||||
cluster_name = CLUSTER_NAME,
|
||||
queries = json.dumps([QUERY]),
|
||||
query_file_uri = '',
|
||||
script_variables = '',
|
||||
sparksql_job='',
|
||||
job='',
|
||||
wait_interval='30'
|
||||
):
|
||||
dataproc_submit_sparksql_job_op(
|
||||
project_id=project_id,
|
||||
region=region,
|
||||
cluster_name=cluster_name,
|
||||
queries=queries,
|
||||
query_file_uri=query_file_uri,
|
||||
script_variables=script_variables,
|
||||
sparksql_job=sparksql_job,
|
||||
job=job,
|
||||
wait_interval=wait_interval)
|
||||
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
```python
|
||||
pipeline_func = dataproc_submit_sparksql_job_pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
* [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)
|
||||
* [SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob)
|
||||
* [Cloud Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)
|
||||
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: dataproc_submit_sparksql_job
|
||||
description: >-
|
||||
Submits a Cloud Dataproc job for running Apache Spark SQL queries.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: >-
|
||||
Required. The ID of the Google Cloud Platform project that the cluster
|
||||
belongs to.
|
||||
type: GCPProjectID
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Cloud Dataproc region in which to handle the request.
|
||||
type: GCPRegion
|
||||
- name: cluster_name
|
||||
description: 'Required. The cluster to run the job.'
|
||||
type: String
|
||||
- name: queries
|
||||
default: ''
|
||||
description: >-
|
||||
Required. The queries to execute. You do not need to
|
||||
terminate a query with a semicolon. Multiple queries can be specified
|
||||
in one string by separating each with a semicolon.
|
||||
type: List
|
||||
- name: query_file_uri
|
||||
default: ''
|
||||
description: >-
|
||||
The HCFS URI of the script that contains SQL queries.
|
||||
type: GCSPath
|
||||
- name: script_variables
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. Mapping of query variable names to
|
||||
values (equivalent to the Spark SQL command: SET name="value";).
|
||||
type: Dict
|
||||
- name: sparksql_job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob).
|
||||
type: Dict
|
||||
- name: job
|
||||
default: ''
|
||||
description: >-
|
||||
Optional. The full payload of a
|
||||
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
type: Dict
|
||||
- name: wait_interval
|
||||
default: '30'
|
||||
description: >-
|
||||
Optional. The wait seconds between polling the operation.
|
||||
Defaults to 30.
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.dataproc, submit_sparksql_job,
|
||||
--project_id, {inputValue: project_id},
|
||||
--region, {inputValue: region},
|
||||
--cluster_name, {inputValue: cluster_name},
|
||||
--queries, {inputValue: queries},
|
||||
--query_file_uri, {inputValue: query_file_uri},
|
||||
--script_variables, {inputValue: script_variables},
|
||||
--sparksql_job, {inputValue: sparksql_job},
|
||||
--job, {inputValue: job},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,261 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"Data preparation using SparkSQL on YARN with Cloud Dataproc\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"Cloud Dataproc, GCP, Cloud Storage, YARN, SparkSQL, Kubeflow, pipelines, components \n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"A Kubeflow Pipeline component to prepare data by submitting a SparkSql job on YARN to Cloud Dataproc.\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"\n",
|
||||
"## Intended use\n",
|
||||
"Use the component to run an Apache SparkSql job as one preprocessing step in a Kubeflow Pipeline.\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"Argument| Description | Optional | Data type| Accepted values| Default |\n",
|
||||
":--- | :---------- | :--- | :------- | :------ | :------\n",
|
||||
"project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No| GCPProjectID | | |\n",
|
||||
"region | The Cloud Dataproc region to handle the request. | No | GCPRegion|\n",
|
||||
"cluster_name | The name of the cluster to run the job. | No | String| | |\n",
|
||||
"queries | The queries to execute the SparkSQL job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None | \n",
|
||||
"query_file_uri | The HCFS URI of the script that contains the SparkSQL queries.| Yes | GCSPath | | None |\n",
|
||||
"script_variables | Mapping of the query’s variable names to their values (equivalent to the SparkSQL command: SET name=\"value\";).| Yes| Dict | | None |\n",
|
||||
"sparksql_job | The payload of a [SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob). | Yes | Dict | | None |\n",
|
||||
"job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
|
||||
"wait_interval | The number of seconds to pause between polling the operation. | Yes |Integer | | 30 |\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created job. | String\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"To use the component, you must:\n",
|
||||
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
|
||||
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
|
||||
"\n",
|
||||
"## Detailed Description\n",
|
||||
"This component creates a Pig job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"dataproc_submit_sparksql_job_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_sparksql_job/component.yaml')\n",
|
||||
"help(dataproc_submit_sparksql_job_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
|
||||
"\n",
|
||||
"#### Setup a Dataproc cluster\n",
|
||||
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
|
||||
"\n",
|
||||
"#### Prepare a SparkSQL job\n",
|
||||
"Either put your SparkSQL queries in the `queires` list, or upload your SparkSQL queries into a file to a Cloud Storage bucket and then enter the Cloud Storage bucket’s path in `query_file_uri`. In this sample, we will use a hard coded query in the `queries` list to select data from a public CSV file from Cloud Storage.\n",
|
||||
"\n",
|
||||
"For more details about Spark SQL, see [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
|
||||
"REGION = 'us-central1'\n",
|
||||
"QUERY = '''\n",
|
||||
"DROP TABLE IF EXISTS natality_csv;\n",
|
||||
"CREATE EXTERNAL TABLE natality_csv (\n",
|
||||
" source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,\n",
|
||||
" state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,\n",
|
||||
" plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,\n",
|
||||
" mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,\n",
|
||||
" gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,\n",
|
||||
" mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,\n",
|
||||
" alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,\n",
|
||||
" born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,\n",
|
||||
" ever_born BIGINT, father_race BIGINT, father_age BIGINT,\n",
|
||||
" record_weight BIGINT\n",
|
||||
")\n",
|
||||
"ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n",
|
||||
"LOCATION 'gs://public-datasets/natality/csv';\n",
|
||||
"\n",
|
||||
"SELECT * FROM natality_csv LIMIT 10;'''\n",
|
||||
"EXPERIMENT_NAME = 'Dataproc - Submit SparkSQL Job'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Dataproc submit SparkSQL job pipeline',\n",
|
||||
" description='Dataproc submit SparkSQL job pipeline'\n",
|
||||
")\n",
|
||||
"def dataproc_submit_sparksql_job_pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" region = REGION,\n",
|
||||
" cluster_name = CLUSTER_NAME,\n",
|
||||
" queries = json.dumps([QUERY]),\n",
|
||||
" query_file_uri = '',\n",
|
||||
" script_variables = '', \n",
|
||||
" sparksql_job='', \n",
|
||||
" job='', \n",
|
||||
" wait_interval='30'\n",
|
||||
"):\n",
|
||||
" dataproc_submit_sparksql_job_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" region=region, \n",
|
||||
" cluster_name=cluster_name, \n",
|
||||
" queries=queries, \n",
|
||||
" query_file_uri=query_file_uri,\n",
|
||||
" script_variables=script_variables, \n",
|
||||
" sparksql_job=sparksql_job, \n",
|
||||
" job=job, \n",
|
||||
" wait_interval=wait_interval)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = dataproc_submit_sparksql_job_pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)\n",
|
||||
"* [SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob)\n",
|
||||
"* [Cloud Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,200 @@
|
|||
|
||||
# Name
|
||||
|
||||
Batch prediction using Cloud Machine Learning Engine
|
||||
|
||||
|
||||
# Label
|
||||
|
||||
Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline, Component
|
||||
|
||||
|
||||
# Summary
|
||||
|
||||
A Kubeflow Pipeline component to submit a batch prediction job against a deployed model on Cloud ML Engine.
|
||||
|
||||
|
||||
# Details
|
||||
|
||||
|
||||
## Intended use
|
||||
|
||||
Use the component to run a batch prediction job against a deployed model on Cloud ML Engine. The prediction output is stored in a Cloud Storage bucket.
|
||||
|
||||
|
||||
## Runtime arguments
|
||||
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|
|
||||
| project_id | The ID of the Google Cloud Platform (GCP) project of the job. | No | GCPProjectID | | |
|
||||
| model_path | The path to the model. It can be one of the following:<br/> <ul> <li>projects/[PROJECT_ID]/models/[MODEL_ID]</li> <li>projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]</li> <li>The path to a Cloud Storage location containing a model file.</li> </ul> | No | GCSPath | | |
|
||||
| input_paths | The path to the Cloud Storage location containing the input data files. It can contain wildcards, for example, `gs://foo/*.csv` | No | List | GCSPath | |
|
||||
| input_data_format | The format of the input data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | No | String | DataFormat | |
|
||||
| output_path | The path to the Cloud Storage location for the output data. | No | GCSPath | | |
|
||||
| region | The Compute Engine region where the prediction job is run. | No | GCPRegion | | |
|
||||
| output_data_format | The format of the output data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | Yes | String | DataFormat | JSON |
|
||||
| prediction_input | The JSON input parameters to create a prediction job. See [PredictionInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#PredictionInput) for more information. | Yes | Dict | | None |
|
||||
| job_id_prefix | The prefix of the generated job id. | Yes | String | | None |
|
||||
| wait_interval | The number of seconds to wait between calls to get the status of a long-running job. | Yes | | | 30 |
|
||||
|
||||
|
||||
## Input data schema
|
||||
|
||||
The component accepts the following as input:
|
||||
|
||||
* A trained model: It can be a model file in Cloud Storage, a deployed model, or a version in Cloud ML Engine. Specify the path to the model in the `model_path` runtime argument.
|
||||
* Input data: The data used to make predictions against the trained model. The data can be in [multiple formats](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat). The data path is specified by `input_paths` and the format is specified by `input_data_format`. A short sketch of JSON-formatted input follows below.
|
||||
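
For example, when `input_data_format` is `JSON`, the input files are newline-delimited: each line is one JSON object representing one instance. A minimal sketch of producing such a file (the feature names and values are hypothetical and must match your model's serving signature):

```python
import json

# Hypothetical instances; the fields must match the model's expected inputs.
instances = [
    {'age': 39, 'education_num': 13, 'hours_per_week': 40},
    {'age': 52, 'education_num': 9, 'hours_per_week': 45},
]

# Newline-delimited JSON: one instance per line.
with open('test_instances.json', 'w') as f:
    for instance in instances:
        f.write(json.dumps(instance) + '\n')

# Upload the file to Cloud Storage and reference it in input_paths, for example:
# !gsutil cp test_instances.json gs://<Put your bucket name here>/batch_predict/input/
```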
|
||||
## Output
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
job_id | The ID of the created batch job. | String
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
|
||||
* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).
|
||||
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the following types of access to the Kubeflow user service account:
|
||||
    * Read access to the Cloud Storage buckets that contain the input data.
|
||||
    * Write access to the Cloud Storage bucket of the output directory (example `gsutil` commands are shown below).
|
||||
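
A minimal sketch of granting that bucket-level access with `gsutil`, assuming you know the Kubeflow user service account's email (the service account and bucket names below are placeholders):

```python
# All values are placeholders; requires permission to change the buckets' IAM policy.
KF_USER_SA = '<Put your Kubeflow user service account email here>'
INPUT_BUCKET = 'gs://<Put your input bucket name here>'
OUTPUT_BUCKET = 'gs://<Put your output bucket name here>'

# Read access to the bucket that contains the input data.
!gsutil iam ch serviceAccount:{KF_USER_SA}:roles/storage.objectViewer {INPUT_BUCKET}

# Write access to the bucket that holds the output directory.
!gsutil iam ch serviceAccount:{KF_USER_SA}:roles/storage.objectAdmin {OUTPUT_BUCKET}
```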
|
||||
|
||||
## Detailed description
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
|
||||
|
||||
|
||||
1. Install the Kubeflow Pipeline SDK:
|
||||
|
||||
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the KFP SDK:
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
mlengine_batch_predict_op = comp.load_component_from_url(
|
||||
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/batch_predict/component.yaml')
|
||||
help(mlengine_batch_predict_op)
|
||||
```
|
||||
|
||||
|
||||
### Sample Code
|
||||
Note: The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
In this sample, you batch predict against a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` and use the test data from `gs://ml-pipeline-playground/samples/ml_engine/census/test.json`.
|
||||
|
||||
#### Inspect the test data
|
||||
|
||||
|
||||
```python
|
||||
!gsutil cat gs://ml-pipeline-playground/samples/ml_engine/census/test.json
|
||||
```
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
# Required Parameters
|
||||
PROJECT_ID = '<Please put your project ID here>'
|
||||
GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Optional Parameters
|
||||
EXPERIMENT_NAME = 'CLOUDML - Batch Predict'
|
||||
OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/batch_predict/output/'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='CloudML batch predict pipeline',
|
||||
description='CloudML batch predict pipeline'
|
||||
)
|
||||
def pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
model_path = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/',
|
||||
input_paths = '["gs://ml-pipeline-playground/samples/ml_engine/census/test.json"]',
|
||||
input_data_format = 'JSON',
|
||||
output_path = OUTPUT_GCS_PATH,
|
||||
region = 'us-central1',
|
||||
output_data_format='',
|
||||
prediction_input = json.dumps({
|
||||
'runtimeVersion': '1.10'
|
||||
}),
|
||||
job_id_prefix='',
|
||||
wait_interval='30'):
|
||||
mlengine_batch_predict_op(
|
||||
project_id=project_id,
|
||||
model_path=model_path,
|
||||
input_paths=input_paths,
|
||||
input_data_format=input_data_format,
|
||||
output_path=output_path,
|
||||
region=region,
|
||||
output_data_format=output_data_format,
|
||||
prediction_input=prediction_input,
|
||||
job_id_prefix=job_id_prefix,
|
||||
wait_interval=wait_interval)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify pipeline argument values
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment and submit a pipeline run
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
#### Inspect prediction results
|
||||
|
||||
|
||||
```python
|
||||
OUTPUT_FILES_PATTERN = OUTPUT_GCS_PATH + '*'
|
||||
!gsutil cat $OUTPUT_FILES_PATTERN
|
||||
```
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)
|
||||
* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Batch predict against a model with Cloud ML Engine
|
||||
description: |
|
||||
  Creates an ML Engine batch prediction job.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: 'Required. The ID of the parent project of the job.'
|
||||
type: GCPProjectID
|
||||
- name: model_path
|
||||
description: >-
|
||||
The path to the model. It can be either: `projects/[PROJECT_ID]/models/[MODEL_ID]`
|
||||
or `projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]` or a GCS path
|
||||
of a model file.
|
||||
type: String
|
||||
- name: input_paths
|
||||
description: >-
|
||||
Required. The Google Cloud Storage location of the input data files. May contain
|
||||
wildcards.
|
||||
type: List
|
||||
- name: input_data_format
|
||||
description: >-
|
||||
Required. The format of the input data files. See
|
||||
https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat.
|
||||
type: String
|
||||
- name: output_path
|
||||
description: 'Required. The output Google Cloud Storage location.'
|
||||
type: GCSPath
|
||||
- name: region
|
||||
description: >-
|
||||
Required. The Google Compute Engine region to run the prediction job in.
|
||||
type: GCPRegion
|
||||
- name: output_data_format
|
||||
description: 'Optional. Format of the output data files, defaults to JSON.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: prediction_input
|
||||
description: 'Input parameters to create a prediction job.'
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: job_id_prefix
|
||||
description: 'The prefix of the generated job id.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: wait_interval
|
||||
description: 'Optional wait interval between calls to get job status. Defaults to 30.'
|
||||
default: '30'
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.ml_engine, batch_predict,
|
||||
--project_id, {inputValue: project_id},
|
||||
--model_path, {inputValue: model_path},
|
||||
--input_paths, {inputValue: input_paths},
|
||||
--input_data_format, {inputValue: input_data_format},
|
||||
--output_path, {inputValue: output_path},
|
||||
--region, {inputValue: region},
|
||||
--output_data_format, {inputValue: output_data_format},
|
||||
--prediction_input, {inputValue: prediction_input},
|
||||
--job_id_prefix, {inputValue: job_id_prefix},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,310 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"\n",
|
||||
"Batch prediction using Cloud Machine Learning Engine\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"\n",
|
||||
"Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline, Component\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"\n",
|
||||
"A Kubeflow Pipeline component to submit a batch prediction job against a deployed model on Cloud ML Engine.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Intended use\n",
|
||||
"\n",
|
||||
"Use the component to run a batch prediction job against a deployed model on Cloud ML Engine. The prediction output is stored in a Cloud Storage bucket.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|\n",
|
||||
"| project_id | The ID of the Google Cloud Platform (GCP) project of the job. | No | GCPProjectID | | |\n",
|
||||
"| model_path | The path to the model. It can be one of the following:<br/> <ul> <li>projects/[PROJECT_ID]/models/[MODEL_ID]</li> <li>projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]</li> <li>The path to a Cloud Storage location containing a model file.</li> </ul> | No | GCSPath | | |\n",
|
||||
"| input_paths | The path to the Cloud Storage location containing the input data files. It can contain wildcards, for example, `gs://foo/*.csv` | No | List | GCSPath | |\n",
|
||||
"| input_data_format | The format of the input data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | No | String | DataFormat | |\n",
|
||||
"| output_path | The path to the Cloud Storage location for the output data. | No | GCSPath | | |\n",
|
||||
"| region | The Compute Engine region where the prediction job is run. | No | GCPRegion | | |\n",
|
||||
"| output_data_format | The format of the output data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | Yes | String | DataFormat | JSON |\n",
|
||||
"| prediction_input | The JSON input parameters to create a prediction job. See [PredictionInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#PredictionInput) for more information. | Yes | Dict | | None |\n",
|
||||
"| job_id_prefix | The prefix of the generated job id. | Yes | String | | None |\n",
|
||||
"| wait_interval | The number of seconds to wait in case the operation has a long run time. | Yes | | | 30 |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Input data schema\n",
|
||||
"\n",
|
||||
"The component accepts the following as input:\n",
|
||||
"\n",
|
||||
"* A trained model: It can be a model file in Cloud Storage, a deployed model, or a version in Cloud ML Engine. Specify the path to the model in the `model_path `runtime argument.\n",
|
||||
"* Input data: The data used to make predictions against the trained model. The data can be in [multiple formats](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat). The data path is specified by `input_paths` and the format is specified by `input_data_format`.\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"job_id | The ID of the created batch job. | String\n",
|
||||
"output_path | The output path of the batch prediction job | GCSPath\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, you must:\n",
|
||||
"\n",
|
||||
"* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant the following types of access to the Kubeflow user service account:\n",
|
||||
" * Read access to the Cloud Storage buckets which contains the input data.\n",
|
||||
" * Write access to the Cloud Storage bucket of the output directory.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"mlengine_batch_predict_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/batch_predict/component.yaml')\n",
|
||||
"help(mlengine_batch_predict_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"### Sample Code\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code. \n",
|
||||
"\n",
|
||||
"In this sample, you batch predict against a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` and use the test data from `gs://ml-pipeline-playground/samples/ml_engine/census/test.json`.\n",
|
||||
"\n",
|
||||
"#### Inspect the test data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gsutil cat gs://ml-pipeline-playground/samples/ml_engine/census/test.json"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Required Parameters\n",
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Optional Parameters\n",
|
||||
"EXPERIMENT_NAME = 'CLOUDML - Batch Predict'\n",
|
||||
"OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/batch_predict/output/'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='CloudML batch predict pipeline',\n",
|
||||
" description='CloudML batch predict pipeline'\n",
|
||||
")\n",
|
||||
"def pipeline(\n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" model_path = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/', \n",
|
||||
" input_paths = '[\"gs://ml-pipeline-playground/samples/ml_engine/census/test.json\"]', \n",
|
||||
" input_data_format = 'JSON', \n",
|
||||
" output_path = OUTPUT_GCS_PATH, \n",
|
||||
" region = 'us-central1', \n",
|
||||
" output_data_format='', \n",
|
||||
" prediction_input = json.dumps({\n",
|
||||
" 'runtimeVersion': '1.10'\n",
|
||||
" }), \n",
|
||||
" job_id_prefix='',\n",
|
||||
" wait_interval='30'):\n",
|
||||
" mlengine_batch_predict_op(\n",
|
||||
" project_id=project_id, \n",
|
||||
" model_path=model_path, \n",
|
||||
" input_paths=input_paths, \n",
|
||||
" input_data_format=input_data_format, \n",
|
||||
" output_path=output_path, \n",
|
||||
" region=region, \n",
|
||||
" output_data_format=output_data_format, \n",
|
||||
" prediction_input=prediction_input, \n",
|
||||
" job_id_prefix=job_id_prefix,\n",
|
||||
" wait_interval=wait_interval)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Inspect prediction results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"OUTPUT_FILES_PATTERN = OUTPUT_GCS_PATH + '*'\n",
|
||||
"!gsutil cat OUTPUT_FILES_PATTERN"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)\n",
|
||||
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
|
||||
# Name
|
||||
|
||||
Deploying a trained model to Cloud Machine Learning Engine
|
||||
|
||||
|
||||
# Label
|
||||
|
||||
Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline
|
||||
|
||||
|
||||
# Summary
|
||||
|
||||
A Kubeflow Pipeline component to deploy a trained model from a Cloud Storage location to Cloud ML Engine.
|
||||
|
||||
|
||||
# Details
|
||||
|
||||
|
||||
## Intended use
|
||||
|
||||
Use the component to deploy a trained model to Cloud ML Engine. The deployed model can serve online or batch predictions in a Kubeflow Pipeline.
|
||||
|
||||
|
||||
## Runtime arguments
|
||||
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|
|
||||
| model_uri | The URI of a Cloud Storage directory that contains a trained model file.<br/> Or <br/> An [Estimator export base directory](https://www.tensorflow.org/guide/saved_model#perform_the_export) that contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file. | No | GCSPath | | |
|
||||
| project_id | The ID of the Google Cloud Platform (GCP) project of the serving model. | No | GCPProjectID | | |
|
||||
| model_id | The name of the trained model. | Yes | String | | None |
|
||||
| version_id | The name of the version of the model. If it is not provided, the operation uses a random name. | Yes | String | | None |
|
||||
| runtime_version | The Cloud ML Engine runtime version to use for this deployment. If it is not provided, the default stable version, 1.0, is used. | Yes | String | | None |
|
||||
| python_version | The version of Python used in the prediction. If it is not provided, version 2.7 is used. You can use Python 3.5 if runtime_version is set to 1.4 or above. Python 2.7 works with all supported runtime versions. | Yes | String | | 2.7 |
|
||||
| model | The JSON payload of the new [model](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models). | Yes | Dict | | None |
|
||||
| version | The new [version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions) of the trained model. | Yes | Dict | | None |
|
||||
| replace_existing_version | Indicates whether to replace the existing version in case of a conflict (if the same version number is found.) | Yes | Boolean | | FALSE |
|
||||
| set_default | Indicates whether to set the new version as the default version in the model. | Yes | Boolean | | FALSE |
|
||||
| wait_interval | The number of seconds to wait in case the operation has a long run time. | Yes | Integer | | 30 |
|
||||
|
||||
|
||||
|
||||
## Input data schema
|
||||
|
||||
The component looks for a trained model in the location specified by the `model_uri` runtime argument. The accepted trained models are:
|
||||
|
||||
|
||||
* [TensorFlow SavedModel](https://cloud.google.com/ml-engine/docs/tensorflow/exporting-for-prediction)
|
||||
* [Scikit-learn & XGBoost model](https://cloud.google.com/ml-engine/docs/scikit/exporting-for-prediction)
|
||||
|
||||
The accepted file formats are:
|
||||
|
||||
* *.pb
|
||||
* *.pbtext
|
||||
* model.bst
|
||||
* model.joblib
|
||||
* model.pkl
|
||||
|
||||
`model_uri` can also be an [Estimator export base directory](https://www.tensorflow.org/guide/saved_model#perform_the_export), which contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file.
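For illustration only, the following minimal sketch shows one way the latest timestamped export directory could be resolved by hand; the project, bucket, and prefix names are assumptions, and the component performs this resolution for you.

```python
from google.cloud import storage

# Hypothetical export base directory: gs://my-bucket/census/export/exporter/<timestamp>/
client = storage.Client(project='my-project')                    # assumed project
blobs = client.list_blobs('my-bucket', prefix='census/export/exporter/')

# Collect the timestamped subdirectory names and keep the most recent one.
timestamps = {blob.name.split('/')[3] for blob in blobs if blob.name.count('/') > 3}
latest = max(timestamps, key=int)
model_uri = 'gs://my-bucket/census/export/exporter/' + latest + '/'
print(model_uri)
```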
|
||||
|
||||
## Output
|
||||
| Name | Description | Type |
|
||||
| :--- | :---------- | :--- |
|
||||
| model_uri | The Cloud Storage URI of the trained model. | GCSPath |
|
||||
| model_name | The name of the deployed model. | String |
|
||||
| version_name | The name of the deployed version. | String |
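Downstream pipeline steps can consume these outputs through the deploy task's `outputs` dictionary. The following is a minimal, hypothetical sketch of that pattern; the inline printing component and the parameter values are assumptions made purely for illustration.

```python
import kfp.dsl as dsl
import kfp.components as comp

mlengine_deploy_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/deploy/component.yaml')

# A tiny helper component, defined inline only for this illustration.
print_version_op = comp.load_component_from_text('''
name: Print version name
inputs: [{name: version_name, type: String}]
implementation:
  container:
    image: alpine
    command: [echo, {inputValue: version_name}]
''')

@dsl.pipeline(name='Deploy and print version (sketch)')
def deploy_and_print(
    model_uri = 'gs://my-bucket/trained_model/',        # placeholder path
    project_id = '<Please put your project ID here>'):
    deploy_task = mlengine_deploy_op(model_uri=model_uri, project_id=project_id)
    print_version_op(deploy_task.outputs['version_name'])
```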
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
|
||||
* [Set up the cloud environment](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).
|
||||
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant read access to the Cloud Storage bucket that contains the trained model to the Kubeflow user service account.
|
||||
|
||||
## Detailed description
|
||||
|
||||
Use the component to do the following (a rough sketch of this flow appears after the list):
|
||||
* Locate the trained model at the Cloud Storage location you specify.
|
||||
* Create a new model if the model you specify doesn’t exist.
|
||||
* Delete the existing model version if `replace_existing_version` is enabled.
|
||||
* Create a new version of the model from the trained model.
|
||||
* Set the new version as the default version of the model if `set_default` is enabled.
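The component wraps this flow behind its runtime arguments. Purely as an illustration, a rough sketch of the same flow issued directly against the Cloud ML Engine REST API might look like the snippet below; the project, model, version, and Cloud Storage paths are placeholders, and waiting on the long-running operation is omitted.

```python
from googleapiclient import discovery
from googleapiclient.errors import HttpError

ml = discovery.build('ml', 'v1')
parent = 'projects/my-project'                          # placeholder project
model_name = parent + '/models/kfp_sample_model'

# Create the model if it does not exist yet (HTTP 409 means it is already there).
try:
    ml.projects().models().create(
        parent=parent, body={'name': 'kfp_sample_model'}).execute()
except HttpError as e:
    if e.resp.status != 409:
        raise

# Create a new version from the trained model files in Cloud Storage.
version_body = {
    'name': 'v1',
    'deploymentUri': 'gs://my-bucket/trained_model/',   # placeholder path
    'runtimeVersion': '1.10',
}
ml.projects().models().versions().create(
    parent=model_name, body=version_body).execute()
# The call above returns a long-running operation; polling it is omitted here.

# Optionally promote the new version to be the model's default.
ml.projects().models().versions().setDefault(
    name=model_name + '/versions/v1', body={}).execute()
```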
|
||||
|
||||
Follow these steps to use the component in a pipeline:
|
||||
|
||||
1. Install the Kubeflow Pipeline SDK:
|
||||
|
||||
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using KFP SDK
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
mlengine_deploy_op = comp.load_component_from_url(
|
||||
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/deploy/component.yaml')
|
||||
help(mlengine_deploy_op)
|
||||
```
|
||||
|
||||
### Sample
|
||||
Note: The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
In this sample, you deploy a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` to Cloud ML Engine. The deployed model is `kfp_sample_model`. A new version is created every time the sample is run, and the latest version is set as the default version of the deployed model.
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
# Required Parameters
|
||||
PROJECT_ID = '<Please put your project ID here>'
|
||||
|
||||
# Optional Parameters
|
||||
EXPERIMENT_NAME = 'CLOUDML - Deploy'
|
||||
TRAINED_MODEL_PATH = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/'
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='CloudML deploy pipeline',
|
||||
description='CloudML deploy pipeline'
|
||||
)
|
||||
def pipeline(
|
||||
model_uri = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/',
|
||||
project_id = PROJECT_ID,
|
||||
model_id = 'kfp_sample_model',
|
||||
version_id = '',
|
||||
runtime_version = '1.10',
|
||||
python_version = '',
|
||||
version = {},
|
||||
replace_existing_version = 'False',
|
||||
set_default = 'True',
|
||||
wait_interval = '30'):
|
||||
task = mlengine_deploy_op(
|
||||
model_uri=model_uri,
|
||||
project_id=project_id,
|
||||
model_id=model_id,
|
||||
version_id=version_id,
|
||||
runtime_version=runtime_version,
|
||||
python_version=python_version,
|
||||
version=version,
|
||||
replace_existing_version=replace_existing_version,
|
||||
set_default=set_default,
|
||||
wait_interval=wait_interval)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify pipeline argument values
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment and submit a pipeline run
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)
|
||||
* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)
|
||||
* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Deploying a trained model to Cloud Machine Learning Engine
|
||||
description: |
|
||||
A Kubeflow Pipeline component to deploy a trained model from a Cloud Storage
|
||||
path to a Cloud Machine Learning Engine service.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: model_uri
|
||||
description: >-
|
||||
Required. The Cloud Storage URI which contains a model file. Commonly
|
||||
used TF model search paths (export/exporter) will be used if they exist.
|
||||
type: GCSPath
|
||||
- name: project_id
|
||||
description: 'Required. The ID of the parent project of the serving model.'
|
||||
type: GCPProjectID
|
||||
- name: model_id
|
||||
description: >-
|
||||
Optional. The user-specified name of the model. If it is not provided,
|
||||
the operation uses a random name.
|
||||
default: ''
|
||||
type: String
|
||||
- name: version_id
|
||||
description: >-
|
||||
Optional. The user-specified name of the version. If it is not provided,
|
||||
the operation uses a random name.
|
||||
default: ''
|
||||
type: String
|
||||
- name: runtime_version
|
||||
description: >-
|
||||
Optional. The [Cloud ML Engine runtime version](https://cloud.google.com/ml-engine/docs/tensorflow/runtime-version-list) to use for
|
||||
this deployment. If it is not set, the Cloud ML Engine uses the default
|
||||
stable version, 1.0.
|
||||
default: ''
|
||||
type: String
|
||||
- name: python_version
|
||||
description: >-
|
||||
Optional. The version of Python used in the prediction. If it is not set,
|
||||
the default version is `2.7`. Python `3.5` is available when the
|
||||
runtime_version is set to `1.4` and above. Python `2.7` works with all
|
||||
supported runtime versions.
|
||||
default: ''
|
||||
type: String
|
||||
- name: model
|
||||
description: >-
|
||||
Optional. The JSON payload of the new
|
||||
[Model](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models), if it does not exist.
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: version
|
||||
description: >-
|
||||
Optional. The JSON payload of the new
|
||||
[Version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions).
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: replace_existing_version
|
||||
description: >-
|
||||
A Boolean flag that indicates whether to replace existing version in case of conflict.
|
||||
default: 'False'
|
||||
type: Bool
|
||||
- name: set_default
|
||||
description: >-
|
||||
A Boolean flag that indicates whether to set the new version as default version in the model.
|
||||
default: 'False'
|
||||
type: Bool
|
||||
- name: wait_interval
|
||||
description: 'The number of seconds to wait in case the operation has a long run time.'
|
||||
default: '30'
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: model_uri
|
||||
description: 'The Cloud Storage URI of the trained model.'
|
||||
type: GCSPath
|
||||
- name: model_name
|
||||
description: 'The name of the deployed model.'
|
||||
type: String
|
||||
- name: version_name
|
||||
description: 'The name of the deployed version.'
|
||||
type: String
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ["python", -u, -m, "kfp_component.launcher"]
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.ml_engine, deploy,
|
||||
--model_uri, {inputValue: model_uri},
|
||||
--project_id, {inputValue: project_id},
|
||||
--model_id, {inputValue: model_id},
|
||||
--version_id, {inputValue: version_id},
|
||||
--runtime_version, {inputValue: runtime_version},
|
||||
--python_version, {inputValue: python_version},
|
||||
--model, {inputValue: model},
|
||||
--version, {inputValue: version},
|
||||
--replace_existing_version, {inputValue: replace_existing_version},
|
||||
--set_default, {inputValue: set_default},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--model_uri_output_path, {outputPath: model_uri},
|
||||
--model_name_output_path, {outputPath: model_name},
|
||||
--version_name_output_path, {outputPath: version_name},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,282 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"\n",
|
||||
"Deploying a trained model to Cloud Machine Learning Engine \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Label\n",
|
||||
"\n",
|
||||
"Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"\n",
|
||||
"A Kubeflow Pipeline component to deploy a trained model from a Cloud Storage location to Cloud ML Engine.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Intended use\n",
|
||||
"\n",
|
||||
"Use the component to deploy a trained model to Cloud ML Engine. The deployed model can serve online or batch predictions in a Kubeflow Pipeline.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Runtime arguments\n",
|
||||
"\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|\n",
|
||||
"| model_uri | The URI of a Cloud Storage directory that contains a trained model file.<br/> Or <br/> An [Estimator export base directory](https://www.tensorflow.org/guide/saved_model#perform_the_export) that contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file. | No | GCSPath | | |\n",
|
||||
"| project_id | The ID of the Google Cloud Platform (GCP) project of the serving model. | No | GCPProjectID | | |\n",
|
||||
"| model_id | The name of the trained model. | Yes | String | | None |\n",
|
||||
"| version_id | The name of the version of the model. If it is not provided, the operation uses a random name. | Yes | String | | None |\n",
|
||||
"| runtime_version | The Cloud ML Engine runtime version to use for this deployment. If it is not provided, the default stable version, 1.0, is used. | Yes | String | | None |\n",
|
||||
"| python_version | The version of Python used in the prediction. If it is not provided, version 2.7 is used. You can use Python 3.5 if runtime_version is set to 1.4 or above. Python 2.7 works with all supported runtime versions. | Yes | String | | 2.7 |\n",
|
||||
"| model | The JSON payload of the new [model](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models). | Yes | Dict | | None |\n",
|
||||
"| version | The new [version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions) of the trained model. | Yes | Dict | | None |\n",
|
||||
"| replace_existing_version | Indicates whether to replace the existing version in case of a conflict (if the same version number is found.) | Yes | Boolean | | FALSE |\n",
|
||||
"| set_default | Indicates whether to set the new version as the default version in the model. | Yes | Boolean | | FALSE |\n",
|
||||
"| wait_interval | The number of seconds to wait in case the operation has a long run time. | Yes | Integer | | 30 |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Input data schema\n",
|
||||
"\n",
|
||||
"The component looks for a trained model in the location specified by the `model_uri` runtime argument. The accepted trained models are:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"* [Tensorflow SavedModel](https://cloud.google.com/ml-engine/docs/tensorflow/exporting-for-prediction) \n",
|
||||
"* [Scikit-learn & XGBoost model](https://cloud.google.com/ml-engine/docs/scikit/exporting-for-prediction)\n",
|
||||
"\n",
|
||||
"The accepted file formats are:\n",
|
||||
"\n",
|
||||
"* *.pb\n",
|
||||
"* *.pbtext\n",
|
||||
"* model.bst\n",
|
||||
"* model.joblib\n",
|
||||
"* model.pkl\n",
|
||||
"\n",
|
||||
"`model_uri` can also be an [Estimator export base directory, ](https://www.tensorflow.org/guide/saved_model#perform_the_export)which contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file.\n",
|
||||
"\n",
|
||||
"## Output\n",
|
||||
"| Name | Description | Type |\n",
|
||||
"|:------- |:---- | :--- |\n",
|
||||
"| job_id | The ID of the created job. | String |\n",
|
||||
"| job_dir | The Cloud Storage path that contains the trained model output files. | GCSPath |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, you must:\n",
|
||||
"\n",
|
||||
"* [Set up the cloud environment](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).\n",
|
||||
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* Grant read access to the Cloud Storage bucket that contains the trained model to the Kubeflow user service account.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"\n",
|
||||
"Use the component to: \n",
|
||||
"* Locate the trained model at the Cloud Storage location you specify.\n",
|
||||
"* Create a new model if a model provided by you doesn’t exist.\n",
|
||||
"* Delete the existing model version if `replace_existing_version` is enabled.\n",
|
||||
"* Create a new version of the model from the trained model.\n",
|
||||
"* Set the new version as the default version of the model if `set_default` is enabled.\n",
|
||||
"\n",
|
||||
"Follow these steps to use the component in a pipeline:\n",
|
||||
"\n",
|
||||
"1. Install the Kubeflow Pipeline SDK:\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"mlengine_deploy_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/deploy/component.yaml')\n",
|
||||
"help(mlengine_deploy_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"Note: The following sample code works in IPython notebook or directly in Python code.\n",
|
||||
"\n",
|
||||
"In this sample, you deploy a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` to Cloud ML Engine. The deployed model is `kfp_sample_model`. A new version is created every time the sample is run, and the latest version is set as the default version of the deployed model.\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Required Parameters\n",
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"\n",
|
||||
"# Optional Parameters\n",
|
||||
"EXPERIMENT_NAME = 'CLOUDML - Deploy'\n",
|
||||
"TRAINED_MODEL_PATH = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Example pipeline that uses the component"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='CloudML deploy pipeline',\n",
|
||||
" description='CloudML deploy pipeline'\n",
|
||||
")\n",
|
||||
"def pipeline(\n",
|
||||
" model_uri = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/',\n",
|
||||
" project_id = PROJECT_ID,\n",
|
||||
" model_id = 'kfp_sample_model',\n",
|
||||
" version_id = '',\n",
|
||||
" runtime_version = '1.10',\n",
|
||||
" python_version = '',\n",
|
||||
" version = {},\n",
|
||||
" replace_existing_version = 'False',\n",
|
||||
" set_default = 'True',\n",
|
||||
" wait_interval = '30'):\n",
|
||||
" task = mlengine_deploy_op(\n",
|
||||
" model_uri=model_uri, \n",
|
||||
" project_id=project_id, \n",
|
||||
" model_id=model_id, \n",
|
||||
" version_id=version_id, \n",
|
||||
" runtime_version=runtime_version, \n",
|
||||
" python_version=python_version,\n",
|
||||
" version=version, \n",
|
||||
" replace_existing_version=replace_existing_version, \n",
|
||||
" set_default=set_default, \n",
|
||||
" wait_interval=wait_interval)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)\n",
|
||||
"* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)\n",
|
||||
"* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -0,0 +1,241 @@
|
|||
|
||||
# Name
|
||||
Component: Submitting an AI Platform training job as a pipeline step
|
||||
|
||||
# Label
|
||||
AI Platform, Kubeflow
|
||||
|
||||
# Summary
|
||||
A Kubeflow pipeline component to submit an AI Platform training job as a step in a pipeline.
|
||||
|
||||
# Facets
|
||||
<!--Make sure the asset has data for the following facets:
|
||||
Use case
|
||||
Technique
|
||||
Input data type
|
||||
ML workflow
|
||||
|
||||
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
|
||||
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
|
||||
--->
|
||||
Use case:
|
||||
Other
|
||||
|
||||
Technique:
|
||||
Other
|
||||
|
||||
Input data type:
|
||||
Tabular
|
||||
|
||||
ML workflow:
|
||||
Training
|
||||
|
||||
# Details
|
||||
## Intended use
|
||||
Use this component to submit a training job to AI Platform from a Kubeflow pipeline.
|
||||
|
||||
## Runtime arguments
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|:------------------|:------------------|:----------|:--------------|:-----------------|:-------------|
|
||||
| project_id | The Google Cloud Platform (GCP) project ID of the job. | No | GCPProjectID | - | - |
|
||||
| python_module | The name of the Python module to run after installing the training program. | Yes | String | - | None |
|
||||
| package_uris | The Cloud Storage location of the packages that contain the training program and any additional dependencies. The maximum number of package URIs is 100. | Yes | List | - | None |
|
||||
| region | The Compute Engine region in which the training job is run. | Yes | GCPRegion | - | us-central1 |
|
||||
| args | The command line arguments to pass to the training program. | Yes | List | - | None |
|
||||
| job_dir | A Cloud Storage path in which to store the training outputs and other data needed for training. This path is passed to your TensorFlow program as the command-line argument, `job-dir`. The benefit of specifying this field is that Cloud ML validates the path for use in training. | Yes | GCSPath | - | None |
|
||||
| python_version | The version of Python used in training. If it is not set, the default version is 2.7. Python 3.5 is available when the runtime version is set to 1.4 and above. | Yes | String | - | None |
|
||||
| runtime_version | The runtime version of AI Platform to use for training. If it is not set, AI Platform uses the default. | Yes | String | - | 1 |
|
||||
| master_image_uri | The Docker image to run on the master replica. This image must be in Container Registry. | Yes | GCRPath | - | None |
|
||||
| worker_image_uri | The Docker image to run on the worker replica. This image must be in Container Registry. | Yes | GCRPath | - | None |
|
||||
| training_input | The input parameters to create a training job. | Yes | Dict | [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) | None |
|
||||
| job_id_prefix | The prefix of the job ID that is generated. | Yes | String | - | None |
|
||||
| job_id | The ID of the job to create. It takes precedence over the generated job ID if set. | Yes | String | - | None |
|
||||
| wait_interval | The number of seconds to wait between API calls to get the status of the job. | Yes | Integer | - | 30 |
|
||||
|
||||
|
||||
|
||||
## Input data schema
|
||||
|
||||
The component accepts two types of inputs:
|
||||
* A list of Python packages from Cloud Storage.
|
||||
* You can manually build a Python package and upload it to Cloud Storage by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/packaging-trainer#manual-build).
|
||||
* A Docker container from Container Registry (a sketch of this variant follows the list).
|
||||
* Follow this [guide](https://cloud.google.com/ml-engine/docs/using-containers) to publish and use a Docker container with this component.
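The sample later in this document uses the Python-package path. As a hypothetical sketch of the container-based variant, a pipeline could pass `master_image_uri` instead of `package_uris`; the project, image, trainer arguments, and bucket names below are placeholders.

```python
import json

import kfp.dsl as dsl
import kfp.components as comp

mlengine_train_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/train/component.yaml')

@dsl.pipeline(
    name='Container-based training (sketch)',
    description='Trains with a custom container instead of a Python package'
)
def container_train_pipeline(
    project_id = '<Put your project ID here>',
    master_image_uri = 'gcr.io/my-project/census-trainer:latest',  # placeholder image
    job_dir = 'gs://my-bucket/train/output/'):                     # placeholder bucket
    mlengine_train_op(
        project_id=project_id,
        region='us-central1',
        master_image_uri=master_image_uri,
        args=json.dumps(['--epochs', '5']),                        # placeholder trainer args
        job_dir=job_dir)
```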
|
||||
|
||||
## Output
|
||||
| Name | Description | Type |
|
||||
|:------- |:---- | :--- |
|
||||
| job_id | The ID of the created job. | String |
|
||||
| job_dir | The Cloud Storage path that contains the output files with the trained model. | GCSPath |
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, you must:
|
||||
|
||||
* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).
|
||||
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* Grant the following access to the Kubeflow user service account:
|
||||
* Read access to the Cloud Storage buckets which contain the input data, packages, or Docker images.
|
||||
* Write access to the Cloud Storage bucket of the output directory.
|
||||
|
||||
## Detailed description
|
||||
|
||||
The component builds the [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) payload and submits a job via the [AI Platform REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs).
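For orientation only, the sketch below shows the kind of request body the component assembles from its runtime arguments; the project, job ID, package, and Cloud Storage paths are placeholders, and the component additionally handles job-ID generation, status polling, and output writing.

```python
from googleapiclient import discovery

# Hypothetical job payload; the runtime arguments map onto these TrainingInput fields.
job_body = {
    'jobId': 'census_training_sample',                           # placeholder job ID
    'trainingInput': {
        'pythonModule': 'trainer.task',
        'packageUris': ['gs://my-bucket/train/trainer.tar.gz'],  # placeholder package
        'region': 'us-central1',
        'args': ['--train-steps', '1000'],
        'jobDir': 'gs://my-bucket/train/output/',                # placeholder output path
        'runtimeVersion': '1.10',
    },
}

ml = discovery.build('ml', 'v1')
ml.projects().jobs().create(parent='projects/my-project', body=job_body).execute()
```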
|
||||
|
||||
The steps to use the component in a pipeline are:
|
||||
|
||||
|
||||
1. Install the Kubeflow Pipeline SDK:
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the Kubeflow Pipeline SDK:
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
mlengine_train_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/train/component.yaml')
|
||||
help(mlengine_train_op)
|
||||
```
|
||||
### Sample
|
||||
The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
In this sample, you use the code from the [census estimator sample](https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/estimator) to train a model on AI Platform. To upload the code to AI Platform, package the Python code and upload it to a Cloud Storage bucket.
|
||||
|
||||
Note: You must have read and write permissions on the bucket that you use as the working directory.
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
```python
|
||||
# Required parameters
|
||||
PROJECT_ID = '<Put your project ID here>'
|
||||
GCS_WORKING_DIR = 'gs://<Put your GCS path here>' # No ending slash
|
||||
```
|
||||
|
||||
```python
|
||||
# Optional parameters
|
||||
EXPERIMENT_NAME = 'CLOUDML - Train'
|
||||
TRAINER_GCS_PATH = GCS_WORKING_DIR + '/train/trainer.tar.gz'
|
||||
OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/train/output/'
|
||||
```
|
||||
|
||||
#### Clean up the working directory
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
!gsutil rm -r $GCS_WORKING_DIR
|
||||
```
|
||||
|
||||
#### Download the sample trainer code to a local directory
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
!wget https://github.com/GoogleCloudPlatform/cloudml-samples/archive/master.zip
|
||||
!unzip master.zip
|
||||
```
|
||||
|
||||
#### Package code and upload the package to Cloud Storage
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
%%bash -s "$TRAINER_GCS_PATH"
|
||||
pushd ./cloudml-samples-master/census/estimator/
|
||||
python setup.py sdist
|
||||
gsutil cp dist/preprocessing-1.0.tar.gz $1
|
||||
popd
|
||||
rm -fr ./cloudml-samples-master/ ./master.zip ./dist
|
||||
```
|
||||
|
||||
#### Example pipeline that uses the component
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='CloudML training pipeline',
|
||||
description='CloudML training pipeline'
|
||||
)
|
||||
def pipeline(
|
||||
project_id = PROJECT_ID,
|
||||
python_module = 'trainer.task',
|
||||
package_uris = json.dumps([TRAINER_GCS_PATH]),
|
||||
region = 'us-central1',
|
||||
args = json.dumps([
|
||||
'--train-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.data.csv',
|
||||
'--eval-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.test.csv',
|
||||
'--train-steps', '1000',
|
||||
'--eval-steps', '100',
|
||||
'--verbosity', 'DEBUG'
|
||||
]),
|
||||
job_dir = OUTPUT_GCS_PATH,
|
||||
python_version = '',
|
||||
runtime_version = '1.10',
|
||||
master_image_uri = '',
|
||||
worker_image_uri = '',
|
||||
training_input = '',
|
||||
job_id_prefix = '',
|
||||
job_id = '',
|
||||
wait_interval = '30'):
|
||||
task = mlengine_train_op(
|
||||
project_id=project_id,
|
||||
python_module=python_module,
|
||||
package_uris=package_uris,
|
||||
region=region,
|
||||
args=args,
|
||||
job_dir=job_dir,
|
||||
python_version=python_version,
|
||||
runtime_version=runtime_version,
|
||||
master_image_uri=master_image_uri,
|
||||
worker_image_uri=worker_image_uri,
|
||||
training_input=training_input,
|
||||
job_id_prefix=job_id_prefix,
|
||||
job_id=job_id,
|
||||
wait_interval=wait_interval)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
```python
|
||||
#Specify values for the pipeline's arguments
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
#### Inspect the results
|
||||
|
||||
Use the following command to inspect the contents in the output directory:
|
||||
|
||||
```python
|
||||
!gsutil ls $OUTPUT_GCS_PATH
|
||||
```
|
||||
|
||||
## References
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)
|
||||
* [AI Platform REST API - Resource: Job](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Submitting a Cloud ML training job as a pipeline step
|
||||
description: |
|
||||
A Kubeflow Pipeline component to submit a Cloud Machine Learning (Cloud ML)
|
||||
Engine training job as a step in a pipeline.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: project_id
|
||||
description: 'Required. The ID of the parent project of the job.'
|
||||
type: GCPProjectID
|
||||
- name: python_module
|
||||
description: 'The Python module name to run after installing the packages.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: package_uris
|
||||
description: >-
|
||||
The Cloud Storage location of the packages (that contain the training program
|
||||
and any additional dependencies). The maximum number of package URIs is 100.
|
||||
default: ''
|
||||
type: List
|
||||
- name: region
|
||||
description: 'The Compute Engine region in which the training job is run.'
|
||||
default: ''
|
||||
type: GCPRegion
|
||||
- name: args
|
||||
description: 'The command line arguments to pass to the program.'
|
||||
default: ''
|
||||
type: List
|
||||
- name: job_dir
|
||||
description: >-
|
||||
A Cloud Storage path in which to store the training outputs and other data
|
||||
needed for training. This path is passed to your TensorFlow program as the
|
||||
`job-dir` command-line argument. The benefit of specifying this field is
|
||||
that Cloud ML validates the path for use in training.
|
||||
default: ''
|
||||
type: GCSPath
|
||||
- name: python_version
|
||||
description: >-
|
||||
The version of Python used in training. If not set, the default
|
||||
version is `2.7`. Python `3.5` is available when runtimeVersion is set to `1.4`
|
||||
and above.
|
||||
default: ''
|
||||
type: String
|
||||
- name: runtime_version
|
||||
description: >-
|
||||
The Cloud ML Engine runtime version to use for training. If not set,
|
||||
Cloud ML Engine uses the default stable version, 1.0.
|
||||
default: ''
|
||||
type: String
|
||||
- name: master_image_uri
|
||||
description: >-
|
||||
The Docker image to run on the master replica. This image must be in
|
||||
Container Registry.
|
||||
default: ''
|
||||
type: GCRPath
|
||||
- name: worker_image_uri
|
||||
description: >-
|
||||
The Docker image to run on the worker replica. This image must be in
|
||||
Container Registry.
|
||||
default: ''
|
||||
type: GCRPath
|
||||
- name: training_input
|
||||
description: >-
|
||||
The input parameters to create a training job. It is the JSON payload
|
||||
of a [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput)
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: job_id_prefix
|
||||
description: 'The prefix of the generated job id.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: job_id
|
||||
description: >-
|
||||
The ID of the job to create. It takes precedence over the generated
|
||||
job ID if set.
|
||||
default: ''
|
||||
type: String
|
||||
- name: wait_interval
|
||||
description: >-
|
||||
Optional. The number of seconds to wait between calls to get the job status.
|
||||
Defaults to 30.
|
||||
default: '30'
|
||||
type: Integer
|
||||
outputs:
|
||||
- name: job_id
|
||||
description: 'The ID of the created job.'
|
||||
type: String
|
||||
- name: job_dir
|
||||
description: >-
|
||||
The output path in Cloud Storage of the training job, which contains
|
||||
the trained model files.
|
||||
type: GCSPath
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ["python", -u, -m, "kfp_component.launcher"]
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.ml_engine, train,
|
||||
--project_id, {inputValue: project_id},
|
||||
--python_module, {inputValue: python_module},
|
||||
--package_uris, {inputValue: package_uris},
|
||||
--region, {inputValue: region},
|
||||
--args, {inputValue: args},
|
||||
--job_dir, {inputValue: job_dir},
|
||||
--python_version, {inputValue: python_version},
|
||||
--runtime_version, {inputValue: runtime_version},
|
||||
--master_image_uri, {inputValue: master_image_uri},
|
||||
--worker_image_uri, {inputValue: worker_image_uri},
|
||||
--training_input, {inputValue: training_input},
|
||||
--job_id_prefix, {inputValue: job_id_prefix},
|
||||
--job_id, {inputValue: job_id},
|
||||
--wait_interval, {inputValue: wait_interval},
|
||||
--job_id_output_path, {outputPath: job_id},
|
||||
--job_dir_output_path, {outputPath: job_dir},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -0,0 +1,359 @@
|
|||
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Name\n",
    "Submitting a Cloud Machine Learning Engine training job as a pipeline step\n",
    "\n",
    "# Label\n",
    "GCP, Cloud ML Engine, Machine Learning, pipeline, component, Kubeflow, Kubeflow Pipeline\n",
    "\n",
    "# Summary\n",
    "A Kubeflow Pipeline component to submit a Cloud ML Engine training job as a step in a pipeline.\n",
    "\n",
    "# Details\n",
    "## Intended use\n",
    "Use this component to submit a training job to Cloud ML Engine from a Kubeflow Pipeline.\n",
    "\n",
    "## Runtime arguments\n",
    "| Argument | Description | Optional | Data type | Accepted values | Default |\n",
    "|:------------------|:------------------|:----------|:--------------|:-----------------|:-------------|\n",
    "| project_id | The ID of the Google Cloud Platform (GCP) project of the job. | No | GCPProjectID | | |\n",
    "| python_module | The name of the Python module to run after installing the training program. | Yes | String | | None |\n",
    "| package_uris | The Cloud Storage location of the packages that contain the training program and any additional dependencies. The maximum number of package URIs is 100. | Yes | List | | None |\n",
    "| region | The Compute Engine region in which the training job is run. | Yes | GCPRegion | | us-central1 |\n",
    "| args | The command line arguments to pass to the training program. | Yes | List | | None |\n",
    "| job_dir | A Cloud Storage path in which to store the training outputs and other data needed for training. This path is passed to your TensorFlow program as the `job-dir` command-line argument. The benefit of specifying this field is that Cloud ML validates the path for use in training. | Yes | GCSPath | | None |\n",
    "| python_version | The version of Python used in training. If it is not set, the default version is 2.7. Python 3.5 is available when the runtime version is set to 1.4 and above. | Yes | String | | None |\n",
    "| runtime_version | The runtime version of Cloud ML Engine to use for training. If it is not set, Cloud ML Engine uses the default. | Yes | String | | 1 |\n",
    "| master_image_uri | The Docker image to run on the master replica. This image must be in Container Registry. | Yes | GCRPath | | None |\n",
    "| worker_image_uri | The Docker image to run on the worker replica. This image must be in Container Registry. | Yes | GCRPath | | None |\n",
    "| training_input | The input parameters to create a training job. | Yes | Dict | [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) | None |\n",
    "| job_id_prefix | The prefix of the job ID that is generated. | Yes | String | | None |\n",
    "| job_id | The ID of the job to create. Takes precedence over the generated job ID if set. | Yes | String | - | None |\n",
    "| wait_interval | The number of seconds to wait between API calls to get the status of the job. | Yes | Integer | | 30 |\n",
    "\n",
    "\n",
    "\n",
    "## Input data schema\n",
    "\n",
    "The component accepts two types of inputs:\n",
    "* A list of Python packages from Cloud Storage.\n",
    "  * You can manually build a Python package and upload it to Cloud Storage by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/packaging-trainer#manual-build).\n",
    "* A Docker container from Container Registry.\n",
    "  * Follow this [guide](https://cloud.google.com/ml-engine/docs/using-containers) to publish and use a Docker container with this component.\n",
    "\n",
    "## Output\n",
    "| Name | Description | Type |\n",
    "|:------- |:---- | :--- |\n",
    "| job_id | The ID of the created job. | String |\n",
    "| job_dir | The Cloud Storage path that contains the trained model output files. | GCSPath |\n",
    "\n",
    "\n",
    "## Cautions & requirements\n",
    "\n",
    "To use the component, you must:\n",
    "\n",
    "* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).\n",
    "* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
    "* Grant the following access to the Kubeflow user service account:\n",
    "  * Read access to the Cloud Storage buckets which contain the input data, packages, or Docker images.\n",
    "  * Write access to the Cloud Storage bucket of the output directory.\n",
    "\n",
    "## Detailed description\n",
    "\n",
    "The component builds the [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) payload and submits a job via the [Cloud ML Engine REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs).\n",
    "\n",
    "The steps to use the component in a pipeline are:\n",
    "\n",
    "\n",
    "1. Install the Kubeflow Pipeline SDK:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture --no-stderr\n",
    "\n",
    "!pip3 install kfp --upgrade"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Load the component using the KFP SDK:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp.components as comp\n",
    "\n",
    "mlengine_train_op = comp.load_component_from_url(\n",
    "    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/train/component.yaml')\n",
    "help(mlengine_train_op)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sample\n",
    "Note: The following sample code works in an IPython notebook or directly in Python code.\n",
    "\n",
    "In this sample, you use the code from the [census estimator sample](https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/estimator) to train a model in Cloud ML Engine. To upload the code to Cloud ML Engine, package the Python code and upload it to a Cloud Storage bucket.\n",
    "\n",
    "Note: You must have read and write permissions on the bucket that you use as the working directory.\n",
    "#### Set sample parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Required Parameters\n",
    "PROJECT_ID = '<Please put your project ID here>'\n",
    "GCS_WORKING_DIR = 'gs://<Please put your GCS path here>'  # No ending slash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional Parameters\n",
    "EXPERIMENT_NAME = 'CLOUDML - Train'\n",
    "TRAINER_GCS_PATH = GCS_WORKING_DIR + '/train/trainer.tar.gz'\n",
    "OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/train/output/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Clean up the working directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture --no-stderr\n",
    "!gsutil rm -r $GCS_WORKING_DIR"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Download the sample trainer code to a local directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture --no-stderr\n",
    "!wget https://github.com/GoogleCloudPlatform/cloudml-samples/archive/master.zip\n",
    "!unzip master.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Package the code and upload the package to Cloud Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture --no-stderr\n",
    "%%bash -s \"$TRAINER_GCS_PATH\"\n",
    "pushd ./cloudml-samples-master/census/estimator/\n",
    "python setup.py sdist\n",
    "gsutil cp dist/preprocessing-1.0.tar.gz $1\n",
    "popd\n",
    "rm -fr ./cloudml-samples-master/ ./master.zip ./dist"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Example pipeline that uses the component"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp.dsl as dsl\n",
    "import json\n",
    "@dsl.pipeline(\n",
    "    name='CloudML training pipeline',\n",
    "    description='CloudML training pipeline'\n",
    ")\n",
    "def pipeline(\n",
    "    project_id = PROJECT_ID,\n",
    "    python_module = 'trainer.task',\n",
    "    package_uris = json.dumps([TRAINER_GCS_PATH]),\n",
    "    region = 'us-central1',\n",
    "    args = json.dumps([\n",
    "        '--train-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.data.csv',\n",
    "        '--eval-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.test.csv',\n",
    "        '--train-steps', '1000',\n",
    "        '--eval-steps', '100',\n",
    "        '--verbosity', 'DEBUG'\n",
    "    ]),\n",
    "    job_dir = OUTPUT_GCS_PATH,\n",
    "    python_version = '',\n",
    "    runtime_version = '1.10',\n",
    "    master_image_uri = '',\n",
    "    worker_image_uri = '',\n",
    "    training_input = '',\n",
    "    job_id_prefix = '',\n",
    "    job_id = '',\n",
    "    wait_interval = '30'):\n",
    "    task = mlengine_train_op(\n",
    "        project_id=project_id,\n",
    "        python_module=python_module,\n",
    "        package_uris=package_uris,\n",
    "        region=region,\n",
    "        args=args,\n",
    "        job_dir=job_dir,\n",
    "        python_version=python_version,\n",
    "        runtime_version=runtime_version,\n",
    "        master_image_uri=master_image_uri,\n",
    "        worker_image_uri=worker_image_uri,\n",
    "        training_input=training_input,\n",
    "        job_id_prefix=job_id_prefix,\n",
    "        job_id=job_id,\n",
    "        wait_interval=wait_interval)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Compile the pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_func = pipeline\n",
    "pipeline_filename = pipeline_func.__name__ + '.zip'\n",
    "import kfp.compiler as compiler\n",
    "compiler.Compiler().compile(pipeline_func, pipeline_filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Submit the pipeline for execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify pipeline argument values\n",
    "arguments = {}\n",
    "\n",
    "# Get or create an experiment\n",
    "import kfp\n",
    "client = kfp.Client()\n",
    "experiment = client.create_experiment(EXPERIMENT_NAME)\n",
    "\n",
    "# Submit a pipeline run\n",
    "run_name = pipeline_func.__name__ + ' run'\n",
    "run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Inspect the results\n",
    "\n",
    "Use the following command to inspect the contents of the output directory:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!gsutil ls $OUTPUT_GCS_PATH"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## References\n",
    "* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)\n",
    "* [Component Dockerfile](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
    "* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)\n",
    "* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
    "\n",
    "## License\n",
    "By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
@ -0,0 +1,35 @@
name: Download from GCS
inputs:
- {name: GCS path, type: String}
outputs:
- {name: Data}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/download/component.yaml'
implementation:
  container:
    image: google/cloud-sdk
    command:
    - bash  # Pattern comparison only works in Bash
    - -ex
    - -c
    - |
      if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
        gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
      fi

      uri="$0"
      output_path="$1"

      # Check whether the URI points to a single blob, a directory, or a URI pattern.
      # The URI points to a blob when it does not end with a slash and listing it yields only that same URI.
      if [[ "$uri" != */ ]] && (gsutil ls "$uri" | grep --fixed-strings --line-regexp "$uri"); then
        mkdir -p "$(dirname "$output_path")"
        gsutil -m cp -r "$uri" "$output_path"
      else
        mkdir -p "$output_path"  # When the source path is a directory, gsutil requires the destination to also be a directory.
        gsutil -m rsync -r "$uri" "$output_path"  # gsutil cp handles paths differently from Linux cp: it always puts the source directory (by name) inside the destination directory. gsutil rsync does not have that problem.
      fi
    - inputValue: GCS path
    - outputPath: Data
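As a minimal usage sketch (not part of this commit), the component above can be loaded from its canonical_location URL and wired into a KFP v1 pipeline; the bucket path is a placeholder, and the use_gcp_secret line applies only to clusters that still use the legacy user-gcp-sa secret:

import kfp
from kfp import components, dsl

# Load the "Download from GCS" component from its published definition.
download_from_gcs_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/storage/download/component.yaml'
)

@dsl.pipeline(name='download-example')
def download_example_pipeline(gcs_path: str = 'gs://my-bucket/my-dir/'):  # placeholder path
    # The component writes the downloaded blob(s) to its "Data" output path.
    download_task = download_from_gcs_op(gcs_path=gcs_path)
    # Optional: mount GCP credentials so GOOGLE_APPLICATION_CREDENTIALS is set inside the container.
    download_task.apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(download_example_pipeline, 'download_example.yaml')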
@ -0,0 +1,24 @@
name: Download from GCS
inputs:
- {name: GCS path, type: String}
outputs:
- {name: Data}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/download_blob/component.yaml'
implementation:
  container:
    image: google/cloud-sdk
    command:
    - sh
    - -ex
    - -c
    - |
      if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
        gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
      fi
      mkdir -p "$(dirname "$1")"
      gsutil -m cp -r "$0" "$1"
    - inputValue: GCS path
    - outputPath: Data
@ -0,0 +1,24 @@
name: Download from GCS
inputs:
- {name: GCS path, type: String}
outputs:
- {name: Data}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/download_dir/component.yaml'
implementation:
  container:
    image: google/cloud-sdk
    command:
    - sh
    - -ex
    - -c
    - |
      if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
        gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
      fi
      mkdir -p "$1"
      gsutil -m cp -r "$0" "$1"
    - inputValue: GCS path
    - outputPath: Data
@ -0,0 +1,25 @@
name: List blobs
inputs:
- {name: GCS path, type: String, description: 'GCS path for listing. For recursive listing use the "gs://bucket/path/**" syntax.'}
outputs:
- {name: Paths}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/list/component.yaml'
    volatile_component: 'true'
implementation:
  container:
    image: google/cloud-sdk
    command:
    - sh
    - -ex
    - -c
    - |
      if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
        gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
      fi
      mkdir -p "$(dirname "$1")"
      gsutil ls "$0" > "$1"
    - inputValue: GCS path
    - outputPath: Paths
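The "List blobs" component writes the gsutil listing to its "Paths" output as plain text, one URI per line, so a downstream step can consume it as a file. A rough sketch of that wiring under those assumptions (the consumer function and the bucket pattern below are illustrative only, not part of this commit):

from kfp import components, dsl

list_blobs_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/storage/list/component.yaml'
)

def count_uris(paths_path: components.InputPath()) -> int:
    """Illustrative consumer: counts the URIs produced by the "List blobs" step."""
    with open(paths_path) as f:
        return sum(1 for line in f if line.strip())

count_uris_op = components.create_component_from_func(count_uris)

@dsl.pipeline(name='list-blobs-example')
def list_blobs_example(gcs_pattern: str = 'gs://my-bucket/data/**'):  # recursive pattern, placeholder bucket
    list_task = list_blobs_op(gcs_path=gcs_pattern)
    # The text file produced by "List blobs" is passed to the consumer by file.
    count_uris_op(paths=list_task.outputs['Paths'])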
@ -0,0 +1,27 @@
name: Upload to GCS
inputs:
- {name: Data}
- {name: GCS path, type: String}
outputs:
- {name: GCS path, type: String}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/upload_to_explicit_uri/component.yaml'
implementation:
  container:
    image: google/cloud-sdk
    command:
    - sh
    - -ex
    - -c
    - |
      if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
        gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
      fi
      gsutil cp -r "$0" "$1"
      mkdir -p "$(dirname "$2")"
      echo "$1" > "$2"
    - inputPath: Data
    - inputValue: GCS path
    - outputPath: GCS path
@ -0,0 +1,28 @@
name: Upload to GCS
description: Upload to GCS with unique URI suffix
inputs:
- {name: Data}
- {name: GCS path prefix, type: String}
outputs:
- {name: GCS path, type: String}
metadata:
  annotations:
    author: Alexey Volkov <alexey.volkov@ark-kun.com>
    canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/upload_to_unique_uri/component.yaml'
implementation:
  container:
    image: google/cloud-sdk
    command:
    - sh
    - -ex
    - -c
    - |
      if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
        gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
      fi
      gsutil cp -r "$0" "$1"
      mkdir -p "$(dirname "$2")"
      echo "$1" > "$2"
    - inputPath: Data
    - concat: [{inputValue: GCS path prefix}, '{{workflow.uid}}_{{pod.name}}']
    - outputPath: GCS path
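The second command argument above is a concat of the `GCS path prefix` input and the Argo placeholders `{{workflow.uid}}_{{pod.name}}`, so every run uploads to a distinct URI, and that resolved URI is also written to the `GCS path` output. A hedged usage sketch under those assumptions (the producer function and the bucket prefix are placeholders, not part of this commit):

from kfp import components, dsl

upload_to_unique_uri_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/storage/upload_to_unique_uri/component.yaml'
)

def produce_text(text_path: components.OutputPath()):
    """Illustrative producer of some data to upload."""
    with open(text_path, 'w') as f:
        f.write('hello from the pipeline\n')

produce_text_op = components.create_component_from_func(produce_text)

@dsl.pipeline(name='upload-unique-example')
def upload_unique_example(gcs_prefix: str = 'gs://my-bucket/artifacts/'):  # placeholder prefix
    data_task = produce_text_op()
    # Each run uploads to <prefix><workflow.uid>_<pod.name>; the resolved URI is
    # then available downstream as upload_task.outputs['GCS path'].
    upload_task = upload_to_unique_uri_op(
        data=data_task.outputs['text'],
        gcs_path_prefix=gcs_prefix,
    )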
@ -0,0 +1,3 @@
# Deprecation Warning

The components in this directory have been moved to [components/contrib/google-cloud/automl](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/automl). This directory will be removed by the end of 2021.

@ -0,0 +1,3 @@
# Deprecation Warning

The components in this directory have been moved to [components/contrib/google-cloud/dataproc](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/dataproc). This directory will be removed by the end of 2021.

@ -0,0 +1,3 @@
# Deprecation Warning

The components in this directory have been moved to [components/contrib/google-cloud/ml_engine](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/ml_engine). This directory will be removed by the end of 2021.

@ -0,0 +1,3 @@
# Deprecation Warning

The components in this directory have been moved to [components/contrib/google-cloud/automl](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/automl). This directory will be removed by the end of September 2021.

@ -0,0 +1,3 @@
# Deprecation Warning

The components in this directory have been moved to [components/contrib/google-cloud/Optimizer](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/Optimizer). This directory will be removed by the end of September 2021.

@ -0,0 +1,3 @@
# Deprecation Warning

The components in this directory have been moved to [components/contrib/google-cloud/storage](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/storage). This directory will be removed by the end of September 2021.