feat(components): Copy the GCP components to contrib folder (#6421)

* move a few components to contrib

* move a few components to contrib

* move a few components to contrib

* move a few components to contrib

* move a few components to contrib
IronPan 2021-08-25 07:59:45 -07:00 committed by GitHub
parent d864db16f7
commit c783705c0e
75 changed files with 10103 additions and 0 deletions


@@ -0,0 +1,122 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def add_measurement_for_trial_in_gcp_ai_platform_optimizer(
trial_name: str,
metric_value: float,
complete_trial: bool = True,
step_count: float = None,
gcp_project_id: str = None,
gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
("trial_name", list),
("trial", dict),
("stop_trial", bool),
]):
"""Add measurement for a trial and check whether to continue.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
trial_name: Full trial resource name.
metric_value: Result of the trial evaluation.
step_count: Optional. The number of training steps performed with the model. Can be used when checking early stopping.
complete_trial: Whether the trial should be completed. Only completed trials are used to suggest new trials. Default is True.
"""
import logging
import time
import google.auth
from googleapiclient import discovery
logging.getLogger().setLevel(logging.INFO)
client_id = 'client1'
metric_name = 'metric'
credentials, default_project_id = google.auth.default()
# Validating and inferring the arguments
if not gcp_project_id:
gcp_project_id = default_project_id
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
# Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
def get_project_number(project_id):
service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
response = service.projects().get(projectId=project_id).execute()
return response['projectNumber']
gcp_project_number = get_project_number(gcp_project_id)
def fix_resource_name(name):
return name.replace(gcp_project_number, gcp_project_id)
ml_api = create_caip_optimizer_client(gcp_project_id)
trials_api = ml_api.projects().locations().studies().trials()
operations_api = ml_api.projects().locations().operations()
measurement = {
'measurement': {
'stepCount': step_count,
'metrics': [{
'metric': metric_name,
'value': metric_value,
}],
},
}
add_measurement_response = trials_api.addMeasurement(
name=fix_resource_name(trial_name),
body=measurement,
).execute()
if complete_trial:
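# Completing the trial finalizes it; only completed trials are used by the Optimizer when suggesting new trials.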
should_stop_trial = True
complete_response = trials_api.complete(
name=fix_resource_name(trial_name),
).execute()
return (trial_name, complete_response, should_stop_trial)
else:
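# Otherwise, ask the service whether this trial should be stopped early; checkEarlyStoppingState returns a long-running operation that is polled below.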
check_early_stopping_response = trials_api.checkEarlyStoppingState(
name=fix_resource_name(trial_name),
).execute()
operation_name = check_early_stopping_response['name']
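# Poll the long-running operation. The 'done' key is missing from the response until the operation finishes.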
while True:
get_operation_response = operations_api.get(
name=fix_resource_name(operation_name),
).execute()
if get_operation_response.get('done'):
break
logging.info('Not finished yet: ' + str(get_operation_response))
time.sleep(10)
operation_response = get_operation_response['response']
should_stop_trial = operation_response['shouldStop']
return (trial_name, add_measurement_response, should_stop_trial)
if __name__ == '__main__':
add_measurement_for_trial_in_gcp_ai_platform_optimizer_op = create_component_from_func(
add_measurement_for_trial_in_gcp_ai_platform_optimizer,
base_image='python:3.8',
packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Add_measurement_for_trial/component.yaml",
},
)


@@ -0,0 +1,220 @@
name: Add measurement for trial in gcp ai platform optimizer
description: Add measurement for a trial and check whether to continue.
inputs:
- {name: trial_name, type: String, description: Full trial resource name.}
- {name: metric_value, type: Float, description: Result of the trial evaluation.}
- name: complete_trial
type: Boolean
description: Whether the trial should be completed. Only completed trials are used
to suggest new trials. Default is True.
default: "True"
optional: true
- {name: step_count, type: Float, description: Optional. The number of training steps
performed with the model. Can be used when checking early stopping., optional: true}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: trial_name, type: JsonArray}
- {name: trial, type: JsonObject}
- {name: stop_trial, type: Boolean}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Add_measurement_for_trial/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|| PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
--user) && "$0" "$@"
- python3
- -u
- -c
- |
def add_measurement_for_trial_in_gcp_ai_platform_optimizer(
trial_name,
metric_value,
complete_trial = True,
step_count = None,
gcp_project_id = None,
gcp_region = "us-central1",
):
"""Add measurement for a trial and check whether to continue.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
trial_name: Full trial resource name.
metric_value: Result of the trial evaluation.
step_count: Optional. The number of training steps performed with the model. Can be used when checking early stopping.
complete_trial: Whether the trial should be completed. Only completed trials are used to suggest new trials. Default is True.
"""
import logging
import time
import google.auth
from googleapiclient import discovery
logging.getLogger().setLevel(logging.INFO)
client_id = 'client1'
metric_name = 'metric'
credentials, default_project_id = google.auth.default()
# Validating and inferring the arguments
if not gcp_project_id:
gcp_project_id = default_project_id
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
# Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
def get_project_number(project_id):
service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
response = service.projects().get(projectId=project_id).execute()
return response['projectNumber']
gcp_project_number = get_project_number(gcp_project_id)
def fix_resource_name(name):
return name.replace(gcp_project_number, gcp_project_id)
ml_api = create_caip_optimizer_client(gcp_project_id)
trials_api = ml_api.projects().locations().studies().trials()
operations_api = ml_api.projects().locations().operations()
measurement = {
'measurement': {
'stepCount': step_count,
'metrics': [{
'metric': metric_name,
'value': metric_value,
}],
},
}
add_measurement_response = trials_api.addMeasurement(
name=fix_resource_name(trial_name),
body=measurement,
).execute()
if complete_trial:
should_stop_trial = True
complete_response = trials_api.complete(
name=fix_resource_name(trial_name),
).execute()
return (trial_name, complete_response, should_stop_trial)
else:
check_early_stopping_response = trials_api.checkEarlyStoppingState(
name=fix_resource_name(trial_name),
).execute()
operation_name = check_early_stopping_response['name']
while True:
get_operation_response = operations_api.get(
name=fix_resource_name(operation_name),
).execute()
if get_operation_response.get('done'):
break
logging.info('Not finished yet: ' + str(get_operation_response))
time.sleep(10)
operation_response = get_operation_response['response']
should_stop_trial = operation_response['shouldStop']
return (trial_name, add_measurement_response, should_stop_trial)
def _serialize_bool(bool_value: bool) -> str:
if isinstance(bool_value, str):
return bool_value
if not isinstance(bool_value, bool):
raise TypeError('Value "{}" has type "{}" instead of bool.'.format(str(bool_value), str(type(bool_value))))
return str(bool_value)
def _serialize_json(obj) -> str:
if isinstance(obj, str):
return obj
import json
def default_serializer(obj):
if hasattr(obj, 'to_struct'):
return obj.to_struct()
else:
raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
return json.dumps(obj, default=default_serializer, sort_keys=True)
def _deserialize_bool(s) -> bool:
from distutils.util import strtobool
return strtobool(s) == 1
import argparse
_parser = argparse.ArgumentParser(prog='Add measurement for trial in gcp ai platform optimizer', description='Add measurement for a trial and check whether to continue.')
_parser.add_argument("--trial-name", dest="trial_name", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--metric-value", dest="metric_value", type=float, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--complete-trial", dest="complete_trial", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--step-count", dest="step_count", type=float, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = add_measurement_for_trial_in_gcp_ai_platform_optimizer(**_parsed_args)
_output_serializers = [
_serialize_json,
_serialize_json,
_serialize_bool,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --trial-name
- {inputValue: trial_name}
- --metric-value
- {inputValue: metric_value}
- if:
cond: {isPresent: complete_trial}
then:
- --complete-trial
- {inputValue: complete_trial}
- if:
cond: {isPresent: step_count}
then:
- --step-count
- {inputValue: step_count}
- if:
cond: {isPresent: gcp_project_id}
then:
- --gcp-project-id
- {inputValue: gcp_project_id}
- if:
cond: {isPresent: gcp_region}
then:
- --gcp-region
- {inputValue: gcp_region}
- '----output-paths'
- {outputPath: trial_name}
- {outputPath: trial}
- {outputPath: stop_trial}


@@ -0,0 +1,84 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def create_study_in_gcp_ai_platform_optimizer(
study_id: str,
parameter_specs: list,
optimization_goal: str = 'MAXIMIZE',
metric_specs: list = None,
gcp_project_id: str = None,
gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
("study_name", str),
]):
"""Creates a Google Cloud AI Plaform Optimizer study.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
study_id: Name of the study.
parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
optimization_goal: Optimization goal when optimizing a single metric. Can be MAXIMIZE (default) or MINIMIZE. Ignored if metric_specs list is provided.
metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
"""
import logging
import google.auth
logging.getLogger().setLevel(logging.INFO)
# Validating and inferring the arguments
if not gcp_project_id:
_, gcp_project_id = google.auth.default()
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
from googleapiclient import discovery
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
ml_api = create_caip_optimizer_client(gcp_project_id)
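# If no metric specs were provided, default to a single metric named 'metric' with the requested optimization goal.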
if not metric_specs:
metric_specs=[{
'metric': 'metric',
'goal': optimization_goal,
}]
study_config = {
'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
'parameters': parameter_specs,
'metrics': metric_specs,
}
study = {'study_config': study_config}
create_study_request = ml_api.projects().locations().studies().create(
parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
studyId=study_id,
body=study,
)
create_study_response = create_study_request.execute()
study_name = create_study_response['name']
return (study_name,)
if __name__ == '__main__':
create_study_in_gcp_ai_platform_optimizer_op = create_component_from_func(
create_study_in_gcp_ai_platform_optimizer,
base_image='python:3.8',
packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Create_study/component.yaml",
},
)


@@ -0,0 +1,160 @@
name: Create study in gcp ai platform optimizer
description: Creates a Google Cloud AI Platform Optimizer study.
inputs:
- {name: study_id, type: String, description: Name of the study.}
- {name: parameter_specs, type: JsonArray, description: 'List of parameter specs.
See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec'}
- {name: optimization_goal, type: String, description: Optimization goal when optimizing
a single metric. Can be MAXIMIZE (default) or MINIMIZE. Ignored if metric_specs
list is provided., default: MAXIMIZE, optional: true}
- {name: metric_specs, type: JsonArray, description: 'List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec',
optional: true}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: study_name, type: String}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Create_study/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|| PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
--user) && "$0" "$@"
- python3
- -u
- -c
- |
def create_study_in_gcp_ai_platform_optimizer(
study_id,
parameter_specs,
optimization_goal = 'MAXIMIZE',
metric_specs = None,
gcp_project_id = None,
gcp_region = "us-central1",
):
"""Creates a Google Cloud AI Plaform Optimizer study.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
study_id: Name of the study.
parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
optimization_goal: Optimization goal when optimizing a single metric. Can be MAXIMIZE (default) or MINIMIZE. Ignored if metric_specs list is provided.
metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
"""
import logging
import google.auth
logging.getLogger().setLevel(logging.INFO)
# Validating and inferring the arguments
if not gcp_project_id:
_, gcp_project_id = google.auth.default()
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
from googleapiclient import discovery
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
ml_api = create_caip_optimizer_client(gcp_project_id)
if not metric_specs:
metric_specs=[{
'metric': 'metric',
'goal': optimization_goal,
}]
study_config = {
'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
'parameters': parameter_specs,
'metrics': metric_specs,
}
study = {'study_config': study_config}
create_study_request = ml_api.projects().locations().studies().create(
parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
studyId=study_id,
body=study,
)
create_study_response = create_study_request.execute()
study_name = create_study_response['name']
return (study_name,)
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import json
import argparse
_parser = argparse.ArgumentParser(prog='Create study in gcp ai platform optimizer', description='Creates a Google Cloud AI Platform Optimizer study.')
_parser.add_argument("--study-id", dest="study_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--parameter-specs", dest="parameter_specs", type=json.loads, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--optimization-goal", dest="optimization_goal", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--metric-specs", dest="metric_specs", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = create_study_in_gcp_ai_platform_optimizer(**_parsed_args)
_output_serializers = [
_serialize_str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --study-id
- {inputValue: study_id}
- --parameter-specs
- {inputValue: parameter_specs}
- if:
cond: {isPresent: optimization_goal}
then:
- --optimization-goal
- {inputValue: optimization_goal}
- if:
cond: {isPresent: metric_specs}
then:
- --metric-specs
- {inputValue: metric_specs}
- if:
cond: {isPresent: gcp_project_id}
then:
- --gcp-project-id
- {inputValue: gcp_project_id}
- if:
cond: {isPresent: gcp_region}
then:
- --gcp-region
- {inputValue: gcp_region}
- '----output-paths'
- {outputPath: study_name}


@@ -0,0 +1,188 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer(
parameter_specs: list,
metrics_for_parameter_sets: list,
suggestion_count: int,
maximize: bool = False,
metric_specs: list = None,
gcp_project_id: str = None,
gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
("suggested_parameter_sets", list),
]):
"""Suggests trials (parameter sets) to evaluate.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
metrics_for_parameter_sets: List of parameter sets and evaluation metrics for them. Each list item contains "parameters" dict and "metrics" dict. Example: {"parameters": {"p1": 1.1, "p2": 2.2}, "metrics": {"metric1": 101, "metric2": 102} }
maximize: Whether to maximize or minimize when optimizing a single metric. Default is to minimize. Ignored if metric_specs list is provided.
metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
suggestion_count: Number of suggestions to request.
suggested_parameter_sets: List of parameter set dictionaries.
"""
import logging
import random
import time
import google.auth
from googleapiclient import discovery
logging.getLogger().setLevel(logging.INFO)
client_id = 'client1'
credentials, default_project_id = google.auth.default()
# Validating and inferring the arguments
if not gcp_project_id:
gcp_project_id = default_project_id
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
# Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
def get_project_number(project_id):
service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
response = service.projects().get(projectId=project_id).execute()
return response['projectNumber']
gcp_project_number = get_project_number(gcp_project_id)
def fix_resource_name(name):
return name.replace(gcp_project_number, gcp_project_id)
ml_api = create_caip_optimizer_client(gcp_project_id)
studies_api = ml_api.projects().locations().studies()
trials_api = ml_api.projects().locations().studies().trials()
operations_api = ml_api.projects().locations().operations()
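# Generate a random 256-bit value and format it as a 64-character hex string to use as the ID of the temporary study.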
random_integer = random.SystemRandom().getrandbits(256)
study_id = '{:064x}'.format(random_integer)
if not metric_specs:
metric_specs=[{
'metric': 'metric',
'goal': 'MAXIMIZE' if maximize else 'MINIMIZE',
}]
study_config = {
'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
'parameters': parameter_specs,
'metrics': metric_specs,
}
study = {'study_config': study_config}
logging.info(f'Creating temporary study {study_id}')
create_study_request = studies_api.create(
parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
studyId=study_id,
body=study,
)
create_study_response = create_study_request.execute()
study_name = create_study_response['name']
paremeter_type_names = {parameter_spec['parameter']: parameter_spec['type'] for parameter_spec in parameter_specs}
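# The Optimizer API represents each trial parameter value in a type-specific field (floatValue, intValue or stringValue), chosen based on the declared parameter type.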
def parameter_name_and_value_to_dict(parameter_name: str, parameter_value) -> dict:
result = {'parameter': parameter_name}
paremeter_type_name = paremeter_type_names[parameter_name]
if paremeter_type_name in ['DOUBLE', 'DISCRETE']:
result['floatValue'] = parameter_value
elif paremeter_type_name == 'INTEGER':
result['intValue'] = parameter_value
elif paremeter_type_name == 'CATEGORICAL':
result['stringValue'] = parameter_value
else:
raise TypeError(f'Unsupported parameter type "{paremeter_type_name}"')
return result
try:
logging.info(f'Adding {len(metrics_for_parameter_sets)} measurements to the study.')
for parameters_and_metrics in metrics_for_parameter_sets:
parameter_set = parameters_and_metrics['parameters']
metrics_set = parameters_and_metrics['metrics']
trial = {
'parameters': [
parameter_name_and_value_to_dict(parameter_name, parameter_value)
for parameter_name, parameter_value in parameter_set.items()
],
'finalMeasurement': {
'metrics': [
{
'metric': metric_name,
'value': metric_value,
}
for metric_name, metric_value in metrics_set.items()
],
},
'state': 'COMPLETED',
}
create_trial_response = trials_api.create(
parent=fix_resource_name(study_name),
body=trial,
).execute()
trial_name = create_trial_response["name"]
logging.info(f'Added trial "{trial_name}" to the study.')
logging.info(f'Requesting suggestions.')
suggest_trials_request = trials_api.suggest(
parent=fix_resource_name(study_name),
body=dict(
suggestionCount=suggestion_count,
clientId=client_id,
),
)
suggest_trials_response = suggest_trials_request.execute()
operation_name = suggest_trials_response['name']
while True:
get_operation_response = operations_api.get(
name=fix_resource_name(operation_name),
).execute()
# Knowledge: The "done" key is just missing until the result is available
if get_operation_response.get('done'):
break
logging.info('Operation not finished yet: ' + str(get_operation_response))
time.sleep(10)
operation_response = get_operation_response['response']
suggested_trials = operation_response['trials']
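# Collapse each suggested trial's parameter list into a flat {name: value} dict, taking whichever typed value field is present (falling back to 0.0).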
suggested_parameter_sets = [
{
parameter['parameter']: parameter.get('floatValue') or parameter.get('intValue') or parameter.get('stringValue') or 0.0
for parameter in trial['parameters']
}
for trial in suggested_trials
]
return (suggested_parameter_sets,)
finally:
logging.info(f'Deleting study: "{study_name}"')
studies_api.delete(name=fix_resource_name(study_name)).execute()
if __name__ == '__main__':
suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer_op = create_component_from_func(
suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer,
base_image='python:3.8',
packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_parameter_sets_based_on_measurements/component.yaml",
},
)


@@ -0,0 +1,284 @@
name: Suggest parameter sets from measurements using gcp ai platform optimizer
description: Suggests trials (parameter sets) to evaluate.
inputs:
- {name: parameter_specs, type: JsonArray, description: 'List of parameter specs.
See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec'}
- {name: metrics_for_parameter_sets, type: JsonArray, description: 'List of parameter
sets and evaluation metrics for them. Each list item contains "parameters" dict
and "metrics" dict. Example: {"parameters": {"p1": 1.1, "p2": 2.2}, "metrics":
{"metric1": 101, "metric2": 102} }'}
- {name: suggestion_count, type: Integer, description: Number of suggestions to request.}
- name: maximize
type: Boolean
description: Whether to maximize or minimize when optimizing a single metric. Default
is to minimize. Ignored if metric_specs list is provided.
default: "False"
optional: true
- {name: metric_specs, type: JsonArray, description: 'List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec',
optional: true}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: suggested_parameter_sets, type: JsonArray}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_parameter_sets_based_on_measurements/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|| PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
--user) && "$0" "$@"
- python3
- -u
- -c
- |
def suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer(
parameter_specs,
metrics_for_parameter_sets,
suggestion_count,
maximize = False,
metric_specs = None,
gcp_project_id = None,
gcp_region = "us-central1",
):
"""Suggests trials (parameter sets) to evaluate.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
parameter_specs: List of parameter specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#parameterspec
metrics_for_parameter_sets: List of parameter sets and evaluation metrics for them. Each list item contains "parameters" dict and "metrics" dict. Example: {"parameters": {"p1": 1.1, "p2": 2.2}, "metrics": {"metric1": 101, "metric2": 102} }
maximize: Whether to maximize or minimize when optimizing a single metric. Default is to minimize. Ignored if metric_specs list is provided.
metric_specs: List of metric specs. See https://cloud.google.com/ai-platform/optimizer/docs/reference/rest/v1/projects.locations.studies#metricspec
suggestion_count: Number of suggestions to request.
suggested_parameter_sets: List of parameter set dictionaries.
"""
import logging
import random
import time
import google.auth
from googleapiclient import discovery
logging.getLogger().setLevel(logging.INFO)
client_id = 'client1'
credentials, default_project_id = google.auth.default()
# Validating and inferring the arguments
if not gcp_project_id:
gcp_project_id = default_project_id
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
# Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
def get_project_number(project_id):
service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
response = service.projects().get(projectId=project_id).execute()
return response['projectNumber']
gcp_project_number = get_project_number(gcp_project_id)
def fix_resource_name(name):
return name.replace(gcp_project_number, gcp_project_id)
ml_api = create_caip_optimizer_client(gcp_project_id)
studies_api = ml_api.projects().locations().studies()
trials_api = ml_api.projects().locations().studies().trials()
operations_api = ml_api.projects().locations().operations()
random_integer = random.SystemRandom().getrandbits(256)
study_id = '{:064x}'.format(random_integer)
if not metric_specs:
metric_specs=[{
'metric': 'metric',
'goal': 'MAXIMIZE' if maximize else 'MINIMIZE',
}]
study_config = {
'algorithm': 'ALGORITHM_UNSPECIFIED', # Let the service choose the `default` algorithm.
'parameters': parameter_specs,
'metrics': metric_specs,
}
study = {'study_config': study_config}
logging.info(f'Creating temporary study {study_id}')
create_study_request = studies_api.create(
parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
studyId=study_id,
body=study,
)
create_study_response = create_study_request.execute()
study_name = create_study_response['name']
paremeter_type_names = {parameter_spec['parameter']: parameter_spec['type'] for parameter_spec in parameter_specs}
def parameter_name_and_value_to_dict(parameter_name, parameter_value):
result = {'parameter': parameter_name}
paremeter_type_name = paremeter_type_names[parameter_name]
if paremeter_type_name in ['DOUBLE', 'DISCRETE']:
result['floatValue'] = parameter_value
elif paremeter_type_name == 'INTEGER':
result['intValue'] = parameter_value
elif paremeter_type_name == 'CATEGORICAL':
result['stringValue'] = parameter_value
else:
raise TypeError(f'Unsupported parameter type "{paremeter_type_name}"')
return result
try:
logging.info(f'Adding {len(metrics_for_parameter_sets)} measurements to the study.')
for parameters_and_metrics in metrics_for_parameter_sets:
parameter_set = parameters_and_metrics['parameters']
metrics_set = parameters_and_metrics['metrics']
trial = {
'parameters': [
parameter_name_and_value_to_dict(parameter_name, parameter_value)
for parameter_name, parameter_value in parameter_set.items()
],
'finalMeasurement': {
'metrics': [
{
'metric': metric_name,
'value': metric_value,
}
for metric_name, metric_value in metrics_set.items()
],
},
'state': 'COMPLETED',
}
create_trial_response = trials_api.create(
parent=fix_resource_name(study_name),
body=trial,
).execute()
trial_name = create_trial_response["name"]
logging.info(f'Added trial "{trial_name}" to the study.')
logging.info(f'Requesting suggestions.')
suggest_trials_request = trials_api.suggest(
parent=fix_resource_name(study_name),
body=dict(
suggestionCount=suggestion_count,
clientId=client_id,
),
)
suggest_trials_response = suggest_trials_request.execute()
operation_name = suggest_trials_response['name']
while True:
get_operation_response = operations_api.get(
name=fix_resource_name(operation_name),
).execute()
# Knowledge: The "done" key is just missing until the result is available
if get_operation_response.get('done'):
break
logging.info('Operation not finished yet: ' + str(get_operation_response))
time.sleep(10)
operation_response = get_operation_response['response']
suggested_trials = operation_response['trials']
suggested_parameter_sets = [
{
parameter['parameter']: parameter.get('floatValue') or parameter.get('intValue') or parameter.get('stringValue') or 0.0
for parameter in trial['parameters']
}
for trial in suggested_trials
]
return (suggested_parameter_sets,)
finally:
logging.info(f'Deleting study: "{study_name}"')
studies_api.delete(name=fix_resource_name(study_name)).execute()
import json
def _serialize_json(obj) -> str:
if isinstance(obj, str):
return obj
import json
def default_serializer(obj):
if hasattr(obj, 'to_struct'):
return obj.to_struct()
else:
raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
return json.dumps(obj, default=default_serializer, sort_keys=True)
def _deserialize_bool(s) -> bool:
from distutils.util import strtobool
return strtobool(s) == 1
import argparse
_parser = argparse.ArgumentParser(prog='Suggest parameter sets from measurements using gcp ai platform optimizer', description='Suggests trials (parameter sets) to evaluate.')
_parser.add_argument("--parameter-specs", dest="parameter_specs", type=json.loads, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--metrics-for-parameter-sets", dest="metrics_for_parameter_sets", type=json.loads, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--suggestion-count", dest="suggestion_count", type=int, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--maximize", dest="maximize", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--metric-specs", dest="metric_specs", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = suggest_parameter_sets_from_measurements_using_gcp_ai_platform_optimizer(**_parsed_args)
_output_serializers = [
_serialize_json,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --parameter-specs
- {inputValue: parameter_specs}
- --metrics-for-parameter-sets
- {inputValue: metrics_for_parameter_sets}
- --suggestion-count
- {inputValue: suggestion_count}
- if:
cond: {isPresent: maximize}
then:
- --maximize
- {inputValue: maximize}
- if:
cond: {isPresent: metric_specs}
then:
- --metric-specs
- {inputValue: metric_specs}
- if:
cond: {isPresent: gcp_project_id}
then:
- --gcp-project-id
- {inputValue: gcp_project_id}
- if:
cond: {isPresent: gcp_region}
then:
- --gcp-region
- {inputValue: gcp_region}
- '----output-paths'
- {outputPath: suggested_parameter_sets}


@@ -0,0 +1,100 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def suggest_trials_in_gcp_ai_platform_optimizer(
study_name: str,
suggestion_count: int,
gcp_project_id: str = None,
gcp_region: str = "us-central1",
) -> NamedTuple('Outputs', [
("suggested_trials", list),
]):
"""Suggests trials (parameter sets) to evaluate.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
study_name: Full resource name of the study.
suggestion_count: Number of suggestions to request.
"""
import logging
import time
import google.auth
from googleapiclient import discovery
logging.getLogger().setLevel(logging.INFO)
client_id = 'client1'
credentials, default_project_id = google.auth.default()
# Validating and inferring the arguments
if not gcp_project_id:
gcp_project_id = default_project_id
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
# Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
def get_project_number(project_id):
service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
response = service.projects().get(projectId=project_id).execute()
return response['projectNumber']
gcp_project_number = get_project_number(gcp_project_id)
def fix_resource_name(name):
return name.replace(gcp_project_number, gcp_project_id)
ml_api = create_caip_optimizer_client(gcp_project_id)
trials_api = ml_api.projects().locations().studies().trials()
operations_api = ml_api.projects().locations().operations()
suggest_trials_request = trials_api.suggest(
parent=fix_resource_name(study_name),
body=dict(
suggestionCount=suggestion_count,
clientId=client_id,
),
)
suggest_trials_response = suggest_trials_request.execute()
operation_name = suggest_trials_response['name']
while True:
get_operation_response = operations_api.get(
name=fix_resource_name(operation_name),
).execute()
# Knowledge: The "done" key is just missing until the result is available
if get_operation_response.get('done'):
break
logging.info('Not finished yet: ' + str(get_operation_response))
time.sleep(10)
operation_response = get_operation_response['response']
suggested_trials = operation_response['trials']
return (suggested_trials,)
if __name__ == '__main__':
suggest_trials_in_gcp_ai_platform_optimizer_op = create_component_from_func(
suggest_trials_in_gcp_ai_platform_optimizer,
base_image='python:3.8',
packages_to_install=['google-api-python-client==1.12.3', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_trials/component.yaml",
},
)


@@ -0,0 +1,163 @@
name: Suggest trials in gcp ai platform optimizer
description: Suggests trials (parameter sets) to evaluate.
inputs:
- {name: study_name, type: String, description: Full resource name of the study.}
- {name: suggestion_count, type: Integer, description: Number of suggestions to request.}
- {name: gcp_project_id, type: String, optional: true}
- {name: gcp_region, type: String, default: us-central1, optional: true}
outputs:
- {name: suggested_trials, type: JsonArray}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Suggest_trials/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|| PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-api-python-client==1.12.3' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
--user) && "$0" "$@"
- python3
- -u
- -c
- |
def suggest_trials_in_gcp_ai_platform_optimizer(
study_name,
suggestion_count,
gcp_project_id = None,
gcp_region = "us-central1",
):
"""Suggests trials (parameter sets) to evaluate.
See https://cloud.google.com/ai-platform/optimizer/docs
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
study_name: Full resource name of the study.
suggestion_count: Number of suggestions to request.
"""
import logging
import time
import google.auth
from googleapiclient import discovery
logging.getLogger().setLevel(logging.INFO)
client_id = 'client1'
credentials, default_project_id = google.auth.default()
# Validating and inferring the arguments
if not gcp_project_id:
gcp_project_id = default_project_id
# Building the API client.
# The main API does not work, so we need to build from the published discovery document.
def create_caip_optimizer_client(project_id):
from google.cloud import storage
_OPTIMIZER_API_DOCUMENT_BUCKET = 'caip-optimizer-public'
_OPTIMIZER_API_DOCUMENT_FILE = 'api/ml_public_google_rest_v1.json'
client = storage.Client(project_id)
bucket = client.get_bucket(_OPTIMIZER_API_DOCUMENT_BUCKET)
blob = bucket.get_blob(_OPTIMIZER_API_DOCUMENT_FILE)
discovery_document = blob.download_as_bytes()
return discovery.build_from_document(service=discovery_document)
# Workaround for the Optimizer bug: Optimizer returns resource names that use project number, but only supports resource names with project IDs when making requests
def get_project_number(project_id):
service = discovery.build('cloudresourcemanager', 'v1', credentials=credentials)
response = service.projects().get(projectId=project_id).execute()
return response['projectNumber']
gcp_project_number = get_project_number(gcp_project_id)
def fix_resource_name(name):
return name.replace(gcp_project_number, gcp_project_id)
ml_api = create_caip_optimizer_client(gcp_project_id)
trials_api = ml_api.projects().locations().studies().trials()
operations_api = ml_api.projects().locations().operations()
suggest_trials_request = trials_api.suggest(
parent=fix_resource_name(study_name),
body=dict(
suggestionCount=suggestion_count,
clientId=client_id,
),
)
suggest_trials_response = suggest_trials_request.execute()
operation_name = suggest_trials_response['name']
while True:
get_operation_response = operations_api.get(
name=fix_resource_name(operation_name),
).execute()
# Knowledge: The "done" key is just missing until the result is available
if get_operation_response.get('done'):
break
logging.info('Not finished yet: ' + str(get_operation_response))
time.sleep(10)
operation_response = get_operation_response['response']
suggested_trials = operation_response['trials']
return (suggested_trials,)
def _serialize_json(obj) -> str:
if isinstance(obj, str):
return obj
import json
def default_serializer(obj):
if hasattr(obj, 'to_struct'):
return obj.to_struct()
else:
raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
return json.dumps(obj, default=default_serializer, sort_keys=True)
import argparse
_parser = argparse.ArgumentParser(prog='Suggest trials in gcp ai platform optimizer', description='Suggests trials (parameter sets) to evaluate.')
_parser.add_argument("--study-name", dest="study_name", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--suggestion-count", dest="suggestion_count", type=int, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = suggest_trials_in_gcp_ai_platform_optimizer(**_parsed_args)
_output_serializers = [
_serialize_json,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --study-name
- {inputValue: study_name}
- --suggestion-count
- {inputValue: suggestion_count}
- if:
cond: {isPresent: gcp_project_id}
then:
- --gcp-project-id
- {inputValue: gcp_project_id}
- if:
cond: {isPresent: gcp_region}
then:
- --gcp-region
- {inputValue: gcp_region}
- '----output-paths'
- {outputPath: suggested_trials}


@@ -0,0 +1,153 @@
# This pipeline demonstrates hyper-parameter optimization.
# The goal is to find a set of hyper-parameter values that helps train the best model.
# We launch several optimization stages sequentially.
# At each stage the optimizer suggests several parameter sets to explore based on the available measurements.
# For each suggested parameter set we train a model (semi-dummy) and measure its quality metrics.
# We then collect the metrics for all suggested parameter sets and update our set of measurements.
# With the expanded set of measurements, each new optimization stage should result in better parameter set suggestions.
#
# One aspect of this pipeline is the atomicity of the parameter set suggestion.
# Some optimizers have a persistent mutable global state that is changed when parameter set metrics are submitted.
# The presence of mutable global state may cause reproducibility issues where suggestions for a new model might be based on measurements from a different model.
# The "suggest_parameter_sets_from_measurements_op" in this pipeline is a single operation, which behaves like a pure function and does not rely on external global state.
kfp_endpoint = None
import kfp
from kfp import components
suggest_parameter_sets_from_measurements_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/382c4d109fbd489bd85de54dd9171150e326b401/components/google-cloud/Optimizer/Suggest_parameter_sets_based_on_measurements/component.yaml')
get_element_by_index_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Get_element_by_index/component.yaml')
build_dict_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/4a4be6b748b0d1284d65a417ce4ab5bec596e9fe/components/json/Build_dict/component.yaml')
build_list_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/4a4be6b748b0d1284d65a417ce4ab5bec596e9fe/components/json/Build_list/component.yaml')
combine_lists_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/4a4be6b748b0d1284d65a417ce4ab5bec596e9fe/components/json/Combine_lists/component.yaml')
# The train_and_measure_model is a semi-dummy component that creates a model given the [hyper]parameters and evaluates that model.
# In this case, the model is a polynomial model.
# The evaluation procedure compares the model with the real function that our model is trying to learn
# and calculates the mean squared error on a random sample of data points.
# In real-world cases this component would be replaced by a sequence of model trainer, predictor, and evaluator components.
@components.create_component_from_func
def train_and_measure_model(parameters: dict) -> float:
import random
def real_function(x):
p1 = 3
p2 = -1
p3 = 2
return p1 * x**2 + p2 * x + p3
def get_eval_set() -> dict:
eval_set = {}
num_samples = 100
for i in range(num_samples):
x = random.normalvariate(0, 1) * 5
eval_set[x] = real_function(x)
return eval_set
def train_model(parameters):
def apply_model(x):
return parameters['p1'] * x**2 + parameters['p2'] * x + parameters['p3']
return apply_model
model = train_model(parameters)
eval_set = get_eval_set()
sum_squared_error = 0
for x, expected_y in eval_set.items():
actual_y = model(x)
error = abs(expected_y - actual_y)
squared_error = error ** 2
sum_squared_error += squared_error
mean_squared_error = sum_squared_error / len(eval_set)
return mean_squared_error
parameter_specs=[
{
'parameter': 'p1',
'type': 'DOUBLE',
'double_value_spec' : {
'min_value' : -5,
'max_value' : 5,
}
},
{
'parameter': 'p2',
'type': 'DOUBLE',
'double_value_spec': {
'min_value': -5,
'max_value': 5,
}
},
{
'parameter': 'p3',
'type': 'DOUBLE',
'double_value_spec': {
'min_value': -5,
'max_value': 5,
}
},
]
def optimizer_pipeline():
# Number of optimization stages and suggestions per stage.
# Note that these numbers cannot be parametrized, since they're used in compile-time python loops.
optimization_stages = 3
suggestions_per_stage = 5
# We launch several optimization stages sequentially.
# At each stage the optimizer suggests several parameter sets to explore based on the available measurements.
# Each stage depends on the measurements collected in the previous stages (new suggestions are based on all measurements gathered so far).
# Each optimization stage should result in better parameter set suggestions.
all_metrics_for_parameter_sets = []
for stage in range(optimization_stages):
parameter_sets = suggest_parameter_sets_from_measurements_op(
parameter_specs=parameter_specs,
metrics_for_parameter_sets=all_metrics_for_parameter_sets,
suggestion_count=suggestions_per_stage,
maximize=False,
).output
# Evaluate each suggested set of parameters.
# Loop over the suggested parameter sets.
# We collect the metrics for each suggestion in the `new_metrics_for_parameter_sets` list so that the next round of suggestions can take them into account.
# Cannot use dsl.ParallelFor here due to a bug in Argo https://github.com/argoproj/argo-workflows/issues/2660
# Without ParallelFor we have to use a Python loop
# and explicitly get each individual suggestion using the get_element_by_index_op component.
new_metrics_for_parameter_sets = []
for suggestion_index in range(suggestions_per_stage):
parameter_set = get_element_by_index_op(
json=parameter_sets,
index=suggestion_index,
).output
model_error = train_and_measure_model(
parameters=parameter_set,
).output
metric_for_parameter_set = build_dict_op(
key_1='parameters',
value_1=parameter_set,
key_2='metrics',
value_2={
'metric': model_error,
},
).output
new_metrics_for_parameter_sets.append(metric_for_parameter_set)
# Collecting metrics for the current stage
new_list_of_metrics_for_parameter_sets = build_list_op(*new_metrics_for_parameter_sets).output
# Collecting metrics for all stages
all_metrics_for_parameter_sets = combine_lists_op(all_metrics_for_parameter_sets, new_list_of_metrics_for_parameter_sets).output
if __name__ == '__main__':
kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(optimizer_pipeline, arguments={})


@@ -0,0 +1,133 @@
kfp_endpoint = None
import kfp
from kfp import components
optimizer_create_study_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/40e117cca61fd923a57a1e84cbd08c22dce4bf00/components/google-cloud/Optimizer/Create_study/component.yaml')
optimizer_suggest_trials_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/40e117cca61fd923a57a1e84cbd08c22dce4bf00/components/google-cloud/Optimizer/Suggest_trials/component.yaml')
optimizer_add_measurement_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/40e117cca61fd923a57a1e84cbd08c22dce4bf00/components/google-cloud/Optimizer/Add_measurement_for_trial/component.yaml')
get_element_by_index_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Get_element_by_index/component.yaml')
get_element_by_key_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Get_element_by_key/component.yaml')
query_json_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/55ef28a9d51edc4eeed2a5c6f44cc7457e8a41d8/components/json/Query/component.yaml')
# Component that builds a model given the [hyper]parameters and evaluates that model.
# In this case, the model is a polynomial model.
# The evaluation procedure compares the model with the real function that our model is trying to learn
# and calculates the mean squared error on a random sample of data points.
# In real-world cases this component would be replaced by a sequence of model trainer, predictor, and evaluator components.
@components.create_component_from_func
def evaluate_model(parameters: dict) -> float:
import random
def real_function(x):
p1 = 3
p2 = -1
p3 = 2
return p1 * x**2 + p2 * x + p3
def evaluate_model(parameters, x):
return parameters['p1'] * x**2 + parameters['p2'] * x + parameters['p3']
sum_squared_error = 0
num_samples = 100
for i in range(num_samples):
x = random.normalvariate(0, 1) * 5
real_y = real_function(x)
actual_y = evaluate_model(parameters, x)
error = abs(real_y - actual_y)
squared_error = error ** 2
sum_squared_error += squared_error
mean_squared_error = sum_squared_error / num_samples
return mean_squared_error
def optimizer_pipeline(
):
optimization_stages = 3
trials_per_stage = 5
study_name = optimizer_create_study_op(
study_id='Study4',
parameter_specs=[
{
'parameter': 'p1',
'type': 'DOUBLE',
'double_value_spec' : {
'min_value' : -5,
'max_value' : 5,
}
},
{
'parameter': 'p2',
'type': 'DOUBLE',
'double_value_spec': {
'min_value': -5,
'max_value': 5,
}
},
{
'parameter': 'p3',
'type': 'DOUBLE',
'double_value_spec': {
'min_value': -5,
'max_value': 5,
}
},
],
optimization_goal='MINIMIZE',
).outputs['study_name']
# We launch several optimization stages sequentially.
# Each stage depends on the completion of all trials in the previous stage (since only completed trials affect new trial suggestions).
# Each optimization stage should result in better parameter set suggestions.
trial_measurement_tasks = []
for stage in range(optimization_stages):
suggest_trials_task = optimizer_suggest_trials_op(
study_name=study_name,
suggestion_count=trials_per_stage,
)
suggest_trials_task.after(*trial_measurement_tasks)
trials = suggest_trials_task.output
# Evaluate each suggested set of parameters.
# Loop over the suggested trials.
# We need to collect the created tasks in the `trial_measurement_tasks` list so that the next round of suggestions can depend on their completion.
# Cannot use dsl.ParallelFor here due to a bug in Argo: https://github.com/argoproj/argo-workflows/issues/2660
# Without ParallelFor we have to use a Python loop
# and explicitly get individual suggestions using the get_element_by_index_op component,
# then extract the trial name and parameter sets using the get_element_by_key_op and query_json_op components.
trial_measurement_tasks = []
for trial_index in range(trials_per_stage):
trial = get_element_by_index_op(
json=trials,
index=trial_index,
).output
trial_name = get_element_by_key_op(
json=trial,
key='name',
).output
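# The jq query below turns the trial's suggested parameters (a list of
# {parameter, floatValue/intValue/stringValue} objects) into a single
# {parameter_name: value} dict, which is the format evaluate_model expects.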
trial_parameters = query_json_op(
json=trial,
query='.parameters | map( {(.parameter): (.floatValue // .intValue // .stringValue)} ) | add',
).output
model_error = evaluate_model(
parameters=trial_parameters,
).output
add_measurement_task = optimizer_add_measurement_op(
trial_name=trial_name,
metric_value=model_error,
)
trial_measurement_tasks.append(add_measurement_task)
if __name__ == '__main__':
kfp.Client(host=kfp_endpoint).create_run_from_pipeline_func(optimizer_pipeline, arguments={})

View File

@ -0,0 +1,182 @@
from typing import NamedTuple
from kfp.components import create_component_from_func, InputPath, OutputPath
def automl_create_tables_dataset_from_csv(
data_path: InputPath('CSV'),
target_column_name: str = None,
column_nullability: dict = {},
column_types: dict = {},
gcs_staging_uri: str = None, # Currently AutoML Tables only supports regional buckets in "us-central1".
gcp_project_id: str = None,
gcp_region: str = 'us-central1', # Currently "us-central1" is the only region supported by AutoML tables.
) -> NamedTuple('Outputs', [
('dataset_name', str),
('dataset_url', 'URI'),
]):
'''Creates Google Cloud AutoML Tables Dataset from CSV data.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_path: Data in CSV format that will be imported to the dataset.
target_column_name: Name of the target column for training.
column_nullability: Maps column name to boolean specifying whether the column should be marked as nullable.
column_types: Maps column name to column type. Supported types: FLOAT64, CATEGORY, STRING.
gcs_staging_uri: URI of the data staging location in Google Cloud Storage. The bucket must have the us-central1 region. If not specified, a new staging bucket will be created.
gcp_project_id: Google Cloud project ID. If not set, the default one will be used.
gcp_region: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: AutoML dataset name (fully-qualified)
'''
import logging
import random
import google.auth
from google.cloud import automl_v1beta1 as automl
from google.cloud import storage
logging.getLogger().setLevel(logging.INFO)
# Validating and inferring the arguments
if not gcp_project_id:
_, gcp_project_id = google.auth.default()
if not gcp_region:
gcp_region = 'us-central1'
if gcp_region != 'us-central1':
logging.warning('AutoML only supports the us-central1 region')
dataset_display_name = 'Dataset' # Allowed characters for displayName are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9
column_nullability = column_nullability or {}
for name, nullability in column_nullability.items():
assert isinstance(name, str)
assert isinstance(nullability, bool)
column_types = column_types or {}
for name, data_type in column_types.items():
assert isinstance(name, str)
if not hasattr(automl.TypeCode, data_type):
supported_types = [type_name for type_name in dir(automl.TypeCode) if type_name[0] != '_']
raise ValueError(f'Unknown column type "{data_type}". Supported types: {supported_types}')
# Generating execution ID for data staging
random_integer = random.SystemRandom().getrandbits(256)
execution_id = '{:064x}'.format(random_integer)
logging.info(f'Execution ID: {execution_id}')
logging.info('Uploading the data to storage')
# TODO: Split table into < 100MB chunks as required by AutoML Tables
storage_client = storage.Client()
if gcs_staging_uri:
if not gcs_staging_uri.startswith('gs://'):
raise ValueError(f"Invalid staging storage URI: {gcs_staging_uri}")
(bucket_name, blob_prefix) = gcs_staging_uri[5:].split('/', 1)
bucket = storage_client.get_bucket(bucket_name)
else:
bucket_name = gcp_project_id + '_staging_' + gcp_region
try:
bucket = storage_client.get_bucket(bucket_name)
except Exception as ex:
logging.info(f'Creating Storage bucket {bucket_name}')
bucket = storage_client.create_bucket(
bucket_or_name=bucket_name,
project=gcp_project_id,
location=gcp_region,
)
logging.info(f'Created Storage bucket {bucket.name}')
blob_prefix = 'google.cloud.automl_tmp'
# AutoML Tables import data requires that "the file name must have a (case-insensitive) '.CSV' file extension"
training_data_blob_name = blob_prefix.rstrip('/') + '/' + execution_id + '/' + 'training_data.csv'
training_data_blob_uri = f'gs://{bucket.name}/{training_data_blob_name}'
training_data_blob = bucket.blob(training_data_blob_name)
logging.info(f'Uploading training data to {training_data_blob_uri}')
training_data_blob.upload_from_filename(data_path)
logging.info(f'Creating AutoML Tables dataset.')
automl_client = automl.AutoMlClient()
project_location_path = f'projects/{gcp_project_id}/locations/{gcp_region}'
dataset = automl.Dataset(
display_name=dataset_display_name,
tables_dataset_metadata=automl.TablesDatasetMetadata(),
# labels={},
)
dataset = automl_client.create_dataset(
dataset=dataset,
parent=project_location_path,
)
dataset_id = dataset.name.split('/')[-1]
dataset_web_url = f'https://console.cloud.google.com/automl-tables/locations/{gcp_region}/datasets/{dataset_id}'
logging.info(f'Created dataset {dataset.name}. Link: {dataset_web_url}')
logging.info(f'Importing data to the dataset: {dataset.name}.')
import_data_input_config = automl.InputConfig(
gcs_source=automl.GcsSource(
input_uris=[training_data_blob_uri],
)
)
import_data_response = automl_client.import_data(
name=dataset.name,
input_config=import_data_input_config,
)
import_data_response.result()
dataset = automl_client.get_dataset(
name=dataset.name,
)
logging.info(f'Finished importing data.')
logging.info('Updating column specs')
target_column_spec = None
primary_table_spec_name = dataset.name + '/tableSpecs/' + dataset.tables_dataset_metadata.primary_table_spec_id
table_specs_list = list(automl_client.list_table_specs(
parent=dataset.name,
))
for table_spec in table_specs_list:
table_spec_id = table_spec.name.split('/')[-1]
column_specs_list = list(automl_client.list_column_specs(
parent=table_spec.name,
))
is_primary_table = table_spec.name == primary_table_spec_name
for column_spec in column_specs_list:
if column_spec.display_name == target_column_name and is_primary_table:
target_column_spec = column_spec
column_updated = False
if column_spec.display_name in column_nullability:
column_spec.data_type.nullable = column_nullability[column_spec.display_name]
column_updated = True
if column_spec.display_name in column_types:
new_column_type = column_types[column_spec.display_name]
column_spec.data_type.type_code = getattr(automl.TypeCode, new_column_type)
column_updated = True
if column_updated:
automl_client.update_column_spec(column_spec=column_spec)
if target_column_name:
logging.info('Setting target column')
if not target_column_spec:
raise ValueError(f'Primary table does not have column "{target_column_name}"')
target_column_spec_id = target_column_spec.name.split('/')[-1]
dataset.tables_dataset_metadata.target_column_spec_id = target_column_spec_id
dataset = automl_client.update_dataset(dataset=dataset)
return (dataset.name, dataset_web_url)
if __name__ == '__main__':
automl_create_tables_dataset_from_csv_op = create_component_from_func(
automl_create_tables_dataset_from_csv,
base_image='python:3.8',
packages_to_install=['google-cloud-automl==2.0.0', 'google-cloud-storage==1.31.2', 'google-auth==1.21.3'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/AutoML/Tables/Create_dataset/from_CSV/component.yaml",
},
)
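# A minimal usage sketch (not part of the component itself), assuming the
# component.yaml generated above and a hypothetical upstream `get_csv_data_op`
# that produces a CSV artifact:
#
#     from kfp import components
#
#     create_tables_dataset_op = components.load_component_from_file('component.yaml')
#
#     def automl_tables_dataset_pipeline():
#         csv_data = get_csv_data_op().outputs['data']
#         create_tables_dataset_op(
#             data=csv_data,
#             target_column_name='target',
#             column_types={'target': 'CATEGORY'},
#         )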

View File

@ -0,0 +1,274 @@
name: Automl create tables dataset from csv
description: Creates Google Cloud AutoML Tables Dataset from CSV data.
inputs:
- {name: data, type: CSV, description: Data in CSV format that will be imported to
the dataset.}
- {name: target_column_name, type: String, description: Name of the target column
for training., optional: true}
- {name: column_nullability, type: JsonObject, description: Maps column name to boolean
specifying whether the column should be marked as nullable., default: '{}', optional: true}
- {name: column_types, type: JsonObject, description: 'Maps column name to column
type. Supported types: FLOAT64, CATEGORY, STRING.', default: '{}', optional: true}
- {name: gcs_staging_uri, type: String, description: 'URI of the data staging location
in Google Cloud Storage. The bucket must have the us-central1 region. If not specified,
a new staging bucket will be created.', optional: true}
- {name: gcp_project_id, type: String, description: 'Google Cloud project ID. If not
set, the default one will be used.', optional: true}
- {name: gcp_region, type: String, description: Google Cloud region. AutoML Tables
only supports us-central1., default: us-central1, optional: true}
outputs:
- {name: dataset_name, type: String}
- {name: dataset_url, type: URI}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/AutoML/Tables/Create_dataset/from_CSV/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==2.0.0' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
|| PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==2.0.0' 'google-cloud-storage==1.31.2' 'google-auth==1.21.3'
--user) && "$0" "$@"
- python3
- -u
- -c
- |
def automl_create_tables_dataset_from_csv(
data_path,
target_column_name = None,
column_nullability = {},
column_types = {},
gcs_staging_uri = None, # Currently AutoML Tables only supports regional buckets in "us-central1".
gcp_project_id = None,
gcp_region = 'us-central1', # Currently "us-central1" is the only region supported by AutoML tables.
):
'''Creates Google Cloud AutoML Tables Dataset from CSV data.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_path: Data in CSV format that will be imported to the dataset.
target_column_name: Name of the target column for training.
column_nullability: Maps column name to boolean specifying whether the column should be marked as nullable.
column_types: Maps column name to column type. Supported types: FLOAT64, CATEGORY, STRING.
gcs_staging_uri: URI of the data staging location in Google Cloud Storage. The bucket must have the us-central1 region. If not specified, a new staging bucket will be created.
gcp_project_id: Google Cloud project ID. If not set, the default one will be used.
gcp_region: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: AutoML dataset name (fully-qualified)
'''
import logging
import random
import google.auth
from google.cloud import automl_v1beta1 as automl
from google.cloud import storage
logging.getLogger().setLevel(logging.INFO)
# Validating and inferring the arguments
if not gcp_project_id:
_, gcp_project_id = google.auth.default()
if not gcp_region:
gcp_region = 'us-central1'
if gcp_region != 'us-central1':
logging.warning('AutoML only supports the us-central1 region')
dataset_display_name = 'Dataset' # Allowed characters for displayName are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9
column_nullability = column_nullability or {}
for name, nullability in column_nullability.items():
assert isinstance(name, str)
assert isinstance(nullability, bool)
column_types = column_types or {}
for name, data_type in column_types.items():
assert isinstance(name, str)
if not hasattr(automl.TypeCode, data_type):
supported_types = [type_name for type_name in dir(automl.TypeCode) if type_name[0] != '_']
raise ValueError(f'Unknown column type "{data_type}". Supported types: {supported_types}')
# Generating execution ID for data staging
random_integer = random.SystemRandom().getrandbits(256)
execution_id = '{:064x}'.format(random_integer)
logging.info(f'Execution ID: {execution_id}')
logging.info('Uploading the data to storage')
# TODO: Split table into < 100MB chunks as required by AutoML Tables
storage_client = storage.Client()
if gcs_staging_uri:
if not gcs_staging_uri.startswith('gs://'):
raise ValueError(f"Invalid staging storage URI: {gcs_staging_uri}")
(bucket_name, blob_prefix) = gcs_staging_uri[5:].split('/', 1)
bucket = storage_client.get_bucket(bucket_name)
else:
bucket_name = gcp_project_id + '_staging_' + gcp_region
try:
bucket = storage_client.get_bucket(bucket_name)
except Exception as ex:
logging.info(f'Creating Storage bucket {bucket_name}')
bucket = storage_client.create_bucket(
bucket_or_name=bucket_name,
project=gcp_project_id,
location=gcp_region,
)
logging.info(f'Created Storage bucket {bucket.name}')
blob_prefix = 'google.cloud.automl_tmp'
# AutoML Tables import data requires that "the file name must have a (case-insensitive) '.CSV' file extension"
training_data_blob_name = blob_prefix.rstrip('/') + '/' + execution_id + '/' + 'training_data.csv'
training_data_blob_uri = f'gs://{bucket.name}/{training_data_blob_name}'
training_data_blob = bucket.blob(training_data_blob_name)
logging.info(f'Uploading training data to {training_data_blob_uri}')
training_data_blob.upload_from_filename(data_path)
logging.info(f'Creating AutoML Tables dataset.')
automl_client = automl.AutoMlClient()
project_location_path = f'projects/{gcp_project_id}/locations/{gcp_region}'
dataset = automl.Dataset(
display_name=dataset_display_name,
tables_dataset_metadata=automl.TablesDatasetMetadata(),
# labels={},
)
dataset = automl_client.create_dataset(
dataset=dataset,
parent=project_location_path,
)
dataset_id = dataset.name.split('/')[-1]
dataset_web_url = f'https://console.cloud.google.com/automl-tables/locations/{gcp_region}/datasets/{dataset_id}'
logging.info(f'Created dataset {dataset.name}. Link: {dataset_web_url}')
logging.info(f'Importing data to the dataset: {dataset.name}.')
import_data_input_config = automl.InputConfig(
gcs_source=automl.GcsSource(
input_uris=[training_data_blob_uri],
)
)
import_data_response = automl_client.import_data(
name=dataset.name,
input_config=import_data_input_config,
)
import_data_response.result()
dataset = automl_client.get_dataset(
name=dataset.name,
)
logging.info(f'Finished importing data.')
logging.info('Updating column specs')
target_column_spec = None
primary_table_spec_name = dataset.name + '/tableSpecs/' + dataset.tables_dataset_metadata.primary_table_spec_id
table_specs_list = list(automl_client.list_table_specs(
parent=dataset.name,
))
for table_spec in table_specs_list:
table_spec_id = table_spec.name.split('/')[-1]
column_specs_list = list(automl_client.list_column_specs(
parent=table_spec.name,
))
is_primary_table = table_spec.name == primary_table_spec_name
for column_spec in column_specs_list:
if column_spec.display_name == target_column_name and is_primary_table:
target_column_spec = column_spec
column_updated = False
if column_spec.display_name in column_nullability:
column_spec.data_type.nullable = column_nullability[column_spec.display_name]
column_updated = True
if column_spec.display_name in column_types:
new_column_type = column_types[column_spec.display_name]
column_spec.data_type.type_code = getattr(automl.TypeCode, new_column_type)
column_updated = True
if column_updated:
automl_client.update_column_spec(column_spec=column_spec)
if target_column_name:
logging.info('Setting target column')
if not target_column_spec:
raise ValueError(f'Primary table does not have column "{target_column_name}"')
target_column_spec_id = target_column_spec.name.split('/')[-1]
dataset.tables_dataset_metadata.target_column_spec_id = target_column_spec_id
dataset = automl_client.update_dataset(dataset=dataset)
return (dataset.name, dataset_web_url)
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import json
import argparse
_parser = argparse.ArgumentParser(prog='Automl create tables dataset from csv', description='Creates Google Cloud AutoML Tables Dataset from CSV data.')
_parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--column-nullability", dest="column_nullability", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--column-types", dest="column_types", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcs-staging-uri", dest="gcs_staging_uri", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_create_tables_dataset_from_csv(**_parsed_args)
_output_serializers = [
_serialize_str,
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --data
- {inputPath: data}
- if:
cond: {isPresent: target_column_name}
then:
- --target-column-name
- {inputValue: target_column_name}
- if:
cond: {isPresent: column_nullability}
then:
- --column-nullability
- {inputValue: column_nullability}
- if:
cond: {isPresent: column_types}
then:
- --column-types
- {inputValue: column_types}
- if:
cond: {isPresent: gcs_staging_uri}
then:
- --gcs-staging-uri
- {inputValue: gcs_staging_uri}
- if:
cond: {isPresent: gcp_project_id}
then:
- --gcp-project-id
- {inputValue: gcp_project_id}
- if:
cond: {isPresent: gcp_region}
then:
- --gcp-region
- {inputValue: gcp_region}
- '----output-paths'
- {outputPath: dataset_name}
- {outputPath: dataset_url}

View File

@ -0,0 +1,69 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_create_dataset_for_tables(
gcp_project_id: str,
gcp_region: str,
display_name: str,
description: str = None,
tables_dataset_metadata: dict = {},
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str), ('dataset_url', 'URI')]):
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
'''
import google
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
dataset_dict = {
'display_name': display_name,
'description': description,
'tables_dataset_metadata': tables_dataset_metadata,
}
dataset = client.create_dataset(
location_path,
dataset_dict,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
print(dataset)
dataset_id = dataset.name.rsplit('/', 1)[-1]
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
)
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_create_dataset_for_tables_op = create_component_from_func(
automl_create_dataset_for_tables,
output_component_file='component.yaml',
base_image='python:3.7',
packages_to_install=['google-cloud-automl==0.4.0'],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml",
},
)

View File

@ -0,0 +1,148 @@
name: Automl create dataset for tables
description: automl_create_dataset_for_tables creates an empty Dataset for AutoML
tables
inputs:
- {name: gcp_project_id, type: String}
- {name: gcp_region, type: String}
- {name: display_name, type: String}
- {name: description, type: String, optional: true}
- {name: tables_dataset_metadata, type: JsonObject, default: '{}', optional: true}
- {name: retry, optional: true}
- {name: timeout, type: Float, optional: true}
- {name: metadata, type: JsonObject, optional: true}
outputs:
- {name: dataset_path, type: String}
- {name: create_time, type: String}
- {name: dataset_id, type: String}
- {name: dataset_url, type: URI}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml'
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_create_dataset_for_tables(
gcp_project_id ,
gcp_region ,
display_name ,
description = None,
tables_dataset_metadata = {},
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata = None,
) :
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
'''
import google
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
dataset_dict = {
'display_name': display_name,
'description': description,
'tables_dataset_metadata': tables_dataset_metadata,
}
dataset = client.create_dataset(
location_path,
dataset_dict,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
print(dataset)
dataset_id = dataset.name.rsplit('/', 1)[-1]
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
)
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
import json
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import argparse
_parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables')
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--description", dest="description", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_create_dataset_for_tables(**_parsed_args)
_output_serializers = [
_serialize_str,
_serialize_str,
_serialize_str,
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --gcp-project-id
- {inputValue: gcp_project_id}
- --gcp-region
- {inputValue: gcp_region}
- --display-name
- {inputValue: display_name}
- if:
cond: {isPresent: description}
then:
- --description
- {inputValue: description}
- if:
cond: {isPresent: tables_dataset_metadata}
then:
- --tables-dataset-metadata
- {inputValue: tables_dataset_metadata}
- if:
cond: {isPresent: retry}
then:
- --retry
- {inputValue: retry}
- if:
cond: {isPresent: timeout}
then:
- --timeout
- {inputValue: timeout}
- if:
cond: {isPresent: metadata}
then:
- --metadata
- {inputValue: metadata}
- '----output-paths'
- {outputPath: dataset_path}
- {outputPath: create_time}
- {outputPath: dataset_id}
- {outputPath: dataset_url}

View File

@ -0,0 +1,71 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_create_model_for_tables(
gcp_project_id: str,
gcp_region: str,
display_name: str,
dataset_id: str,
target_column_path: str = None,
input_feature_column_paths: list = None,
optimization_objective: str = 'MAXIMIZE_AU_PRC',
train_budget_milli_node_hours: int = 1000,
) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str), ('model_page_url', 'URI'),]):
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
model_dict = {
'display_name': display_name,
'dataset_id': dataset_id,
'tables_model_metadata': {
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
'optimization_objective': optimization_objective,
'train_budget_milli_node_hours': train_budget_milli_node_hours,
},
}
create_model_response = client.create_model(location_path, model_dict)
print('Create model operation: {}'.format(create_model_response.operation))
result = create_model_response.result()
print(result)
model_name = result.name
model_id = model_name.rsplit('/', 1)[-1]
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
model_id=model_id,
)
return (model_name, model_id, model_url)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_create_model_for_tables_op = create_component_from_func(
automl_create_model_for_tables,
output_component_file='component.yaml',
base_image='python:3.7',
packages_to_install=['google-cloud-automl==0.4.0'],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml",
},
)
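# A minimal wiring sketch (an assumption, not part of this module): in a
# pipeline, the dataset_id input of this component typically comes from the
# create-dataset component defined alongside it; the display names below are placeholders.
#
#     def automl_tables_training_pipeline(gcp_project_id: str, gcp_region: str = 'us-central1'):
#         create_dataset_task = automl_create_dataset_for_tables_op(
#             gcp_project_id=gcp_project_id,
#             gcp_region=gcp_region,
#             display_name='my_dataset',
#         )
#         automl_create_model_for_tables_op(
#             gcp_project_id=gcp_project_id,
#             gcp_region=gcp_region,
#             display_name='my_model',
#             dataset_id=create_dataset_task.outputs['dataset_id'],
#             train_budget_milli_node_hours=1000,
#         )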

View File

@ -0,0 +1,142 @@
name: Automl create model for tables
inputs:
- {name: gcp_project_id, type: String}
- {name: gcp_region, type: String}
- {name: display_name, type: String}
- {name: dataset_id, type: String}
- {name: target_column_path, type: String, optional: true}
- {name: input_feature_column_paths, type: JsonArray, optional: true}
- {name: optimization_objective, type: String, default: MAXIMIZE_AU_PRC, optional: true}
- {name: train_budget_milli_node_hours, type: Integer, default: '1000', optional: true}
outputs:
- {name: model_path, type: String}
- {name: model_id, type: String}
- {name: model_page_url, type: URI}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml'
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_create_model_for_tables(
gcp_project_id ,
gcp_region ,
display_name ,
dataset_id ,
target_column_path = None,
input_feature_column_paths = None,
optimization_objective = 'MAXIMIZE_AU_PRC',
train_budget_milli_node_hours = 1000,
) :
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
model_dict = {
'display_name': display_name,
'dataset_id': dataset_id,
'tables_model_metadata': {
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
'optimization_objective': optimization_objective,
'train_budget_milli_node_hours': train_budget_milli_node_hours,
},
}
create_model_response = client.create_model(location_path, model_dict)
print('Create model operation: {}'.format(create_model_response.operation))
result = create_model_response.result()
print(result)
model_name = result.name
model_id = model_name.rsplit('/', 1)[-1]
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
model_id=model_id,
)
return (model_name, model_id, model_url)
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import json
import argparse
_parser = argparse.ArgumentParser(prog='Automl create model for tables', description='')
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_create_model_for_tables(**_parsed_args)
_output_serializers = [
_serialize_str,
_serialize_str,
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --gcp-project-id
- {inputValue: gcp_project_id}
- --gcp-region
- {inputValue: gcp_region}
- --display-name
- {inputValue: display_name}
- --dataset-id
- {inputValue: dataset_id}
- if:
cond: {isPresent: target_column_path}
then:
- --target-column-path
- {inputValue: target_column_path}
- if:
cond: {isPresent: input_feature_column_paths}
then:
- --input-feature-column-paths
- {inputValue: input_feature_column_paths}
- if:
cond: {isPresent: optimization_objective}
then:
- --optimization-objective
- {inputValue: optimization_objective}
- if:
cond: {isPresent: train_budget_milli_node_hours}
then:
- --train-budget-milli-node-hours
- {inputValue: train_budget_milli_node_hours}
- '----output-paths'
- {outputPath: model_path}
- {outputPath: model_id}
- {outputPath: model_page_url}

View File

@ -0,0 +1,44 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def automl_deploy_model(
model_path: str,
) -> NamedTuple('Outputs', [
('model_path', str),
]):
"""Deploys a trained model.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.deploy_model(
name=model_path,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (model_path, )
if __name__ == '__main__':
automl_deploy_model_op = create_component_from_func(
automl_deploy_model,
output_component_file='component.yaml',
base_image='python:3.8',
packages_to_install=[
'google-cloud-automl==2.0.0',
],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml",
},
)
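# Typical wiring sketch (an assumption): the model_path input usually comes
# from the model-creation component's 'model_path' output, where
# create_model_task is a hypothetical upstream task:
#
#     automl_deploy_model_op(
#         model_path=create_model_task.outputs['model_path'],
#     )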

View File

@ -0,0 +1,87 @@
name: Automl deploy model
description: |-
Deploys a trained model.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: model_path, type: String}
outputs:
- {name: model_path, type: String}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_deploy_model(
model_path,
):
"""Deploys a trained model.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.deploy_model(
name=model_path,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (model_path, )
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import argparse
_parser = argparse.ArgumentParser(prog='Automl deploy model', description="Deploys a trained model.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_deploy_model(**_parsed_args)
_output_serializers = [
_serialize_str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --model-path
- {inputValue: model_path}
- '----output-paths'
- {outputPath: model_path}

View File

@ -0,0 +1,61 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_export_data_to_gcs(
dataset_path: str,
gcs_output_uri_prefix: str = None,
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = {},
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
"""Exports dataset data to GCS."""
import sys
import subprocess
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
response = client.export_data(
name=dataset_path,
output_config=output_config,
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata=metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (gcs_output_uri_prefix, )
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_export_data_to_gcs_op = create_component_from_func(
automl_export_data_to_gcs,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml",
},
)
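# Usage sketch (an assumption): exporting dataset data back to a Cloud Storage
# prefix; the bucket URI and the upstream create_dataset_task are placeholders.
#
#     automl_export_data_to_gcs_op(
#         dataset_path=create_dataset_task.outputs['dataset_path'],
#         gcs_output_uri_prefix='gs://my-bucket/automl/exports/',
#     )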

View File

@ -0,0 +1,117 @@
name: Automl export data to gcs
description: |
Exports dataset data to GCS.
inputs:
- name: dataset_path
type: String
- name: gcs_output_uri_prefix
optional: true
type: String
- name: timeout
optional: true
type: Float
- default: '{}'
name: metadata
optional: true
type: JsonObject
outputs:
- name: gcs_output_uri_prefix
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_export_data_to_gcs(
dataset_path: str,
gcs_output_uri_prefix: str = None,
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = {},
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
"""Exports dataset data to GCS."""
import sys
import subprocess
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
response = client.export_data(
name=dataset_path,
output_config=output_config,
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata=metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (gcs_output_uri_prefix, )
import json
import argparse
_parser = argparse.ArgumentParser(prog='Automl export data to gcs', description='Exports dataset data to GCS.\n')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_export_data_to_gcs(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
_output_serializers = [
str
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- if:
cond:
isPresent: gcs_output_uri_prefix
then:
- --gcs-output-uri-prefix
- inputValue: gcs_output_uri_prefix
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: gcs_output_uri_prefix

View File

@ -0,0 +1,56 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def automl_export_model_to_gcs(
model_path: str,
gcs_output_uri_prefix: str,
model_format: str = 'tf_saved_model',
) -> NamedTuple('Outputs', [
('model_directory', 'Uri'),
]):
"""Exports a trained model to a user specified Google Cloud Storage location.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.export_model(
name=model_path,
output_config=automl.ModelExportOutputConfig(
model_format=model_format,
gcs_destination=automl.GcsDestination(
output_uri_prefix=gcs_output_uri_prefix,
),
),
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (metadata.export_model_details.output_info.gcs_output_directory, )
if __name__ == '__main__':
automl_export_model_to_gcs_op = create_component_from_func(
automl_export_model_to_gcs,
output_component_file='component.yaml',
base_image='python:3.8',
packages_to_install=[
'google-cloud-automl==2.0.0',
],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml",
},
)
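# Usage sketch (an assumption): exporting a trained model to a Cloud Storage
# prefix in us-central1; the bucket URI and create_model_task are placeholders.
#
#     automl_export_model_to_gcs_op(
#         model_path=create_model_task.outputs['model_path'],
#         gcs_output_uri_prefix='gs://my-bucket/automl/exported_models/',
#         model_format='tf_saved_model',
#     )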

View File

@ -0,0 +1,107 @@
name: Automl export model to gcs
description: |-
Exports a trained model to a user specified Google Cloud Storage location.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: model_path, type: String}
- {name: gcs_output_uri_prefix, type: String}
- {name: model_format, type: String, default: tf_saved_model, optional: true}
outputs:
- {name: model_directory, type: Uri}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_export_model_to_gcs(
model_path,
gcs_output_uri_prefix,
model_format = 'tf_saved_model',
):
"""Exports a trained model to a user specified Google Cloud Storage location.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.export_model(
name=model_path,
output_config=automl.ModelExportOutputConfig(
model_format=model_format,
gcs_destination=automl.GcsDestination(
output_uri_prefix=gcs_output_uri_prefix,
),
),
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (metadata.export_model_details.output_info.gcs_output_directory, )
import argparse
_parser = argparse.ArgumentParser(prog='Automl export model to gcs', description="Exports a trained model to a user specified Google Cloud Storage location.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.\n model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--model-format", dest="model_format", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_export_model_to_gcs(**_parsed_args)
_output_serializers = [
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --model-path
- {inputValue: model_path}
- --gcs-output-uri-prefix
- {inputValue: gcs_output_uri_prefix}
- if:
cond: {isPresent: model_format}
then:
- --model-format
- {inputValue: model_format}
- '----output-paths'
- {outputPath: model_directory}

View File

@ -0,0 +1,61 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_import_data_from_bigquery(
dataset_path,
input_uri: str,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'bigquery_source': {
'input_uri': input_uri,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path, )
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_import_data_from_bigquery_op = create_component_from_func(
automl_import_data_from_bigquery,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml",
},
)

View File

@ -0,0 +1,112 @@
name: Automl import data from bigquery
inputs:
- name: dataset_path
- name: input_uri
type: String
- name: retry
optional: true
- name: timeout
optional: true
- name: metadata
type: JsonObject
optional: true
outputs:
- name: dataset_path
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_import_data_from_bigquery(
dataset_path,
input_uri: str,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'bigquery_source': {
'input_uri': input_uri,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
import json
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl import data from bigquery', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--input-uri", dest="input_uri", type=str, required=True, default=_missing_arg)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_import_data_from_bigquery(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- --input-uri
- inputValue: input_uri
- if:
cond:
isPresent: retry
then:
- --retry
- inputValue: retry
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: dataset_path

View File

@ -0,0 +1,62 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_import_data_from_gcs(
dataset_path: str,
input_uris: list,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'gcs_source': {
'input_uris': input_uris,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_import_data_from_gcs_op = create_component_from_func(
automl_import_data_from_gcs,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml",
},
)

View File

@ -0,0 +1,113 @@
name: Automl import data from gcs
inputs:
- name: dataset_path
type: String
- name: input_uris
type: JsonArray
- name: retry
optional: true
- name: timeout
optional: true
- name: metadata
type: JsonObject
optional: true
outputs:
- name: dataset_path
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_import_data_from_gcs(
dataset_path: str,
input_uris: list,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'gcs_source': {
'input_uris': input_uris,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
import json
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl import data from gcs', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--input-uris", dest="input_uris", type=json.loads, required=True, default=_missing_arg)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_import_data_from_gcs(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- --input-uris
- inputValue: input_uris
- if:
cond:
isPresent: retry
then:
- --retry
- inputValue: retry
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: dataset_path

View File

@ -0,0 +1,78 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_prediction_service_batch_predict(
model_path,
gcs_input_uris: list = None,
gcs_output_uri_prefix: str = None,
bq_input_uri: str = None,
bq_output_uri: str = None,
params=None,
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
input_config = {}
if gcs_input_uris:
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
if bq_input_uri:
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
output_config = {}
if gcs_output_uri_prefix:
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
if bq_output_uri:
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
from google.cloud import automl
client = automl.PredictionServiceClient()
response = client.batch_predict(
model_path,
input_config,
output_config,
params,
retry,
timeout,
metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
output_info = metadata.batch_predict_details.output_info
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_prediction_service_batch_predict_op = create_component_from_func(
automl_prediction_service_batch_predict,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml",
},
)
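
A minimal, hypothetical sketch of using this component in a pipeline; the `component.yaml` path, model resource name, and Cloud Storage locations are placeholders rather than values from this repository.

```python
import kfp.dsl as dsl
from kfp import components

# Placeholder path: the component.yaml generated by the block above.
automl_batch_predict_op = components.load_component_from_file('component.yaml')

@dsl.pipeline(name='AutoML batch predict (sketch)')
def batch_predict_pipeline(
    model_path='projects/<project>/locations/us-central1/models/<model-id>',  # placeholder
):
    predict_task = automl_batch_predict_op(
        model_path=model_path,
        gcs_input_uris=['gs://my-bucket/batch-inputs.csv'],         # placeholder
        gcs_output_uri_prefix='gs://my-bucket/batch-predictions/',  # placeholder
    )
    # Both outputs can be wired into downstream steps; '-' is returned when an
    # output is not applicable (see the Argo workaround in the code above).
    gcs_output_dir = predict_task.outputs['gcs_output_directory']
    bq_output_dataset = predict_task.outputs['bigquery_output_dataset']
```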

View File

@ -0,0 +1,175 @@
name: Automl prediction service batch predict
inputs:
- name: model_path
- name: gcs_input_uris
type: JsonArray
optional: true
- name: gcs_output_uri_prefix
type: String
optional: true
- name: bq_input_uri
type: String
optional: true
- name: bq_output_uri
type: String
optional: true
- name: params
optional: true
- name: retry
optional: true
- name: timeout
optional: true
- name: metadata
type: JsonObject
optional: true
outputs:
- name: gcs_output_directory
type: String
- name: bigquery_output_dataset
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_prediction_service_batch_predict(
model_path,
gcs_input_uris: list = None,
gcs_output_uri_prefix: str = None,
bq_input_uri: str = None,
bq_output_uri: str = None,
params=None,
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
input_config = {}
if gcs_input_uris:
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
if bq_input_uri:
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
output_config = {}
if gcs_output_uri_prefix:
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
if bq_output_uri:
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
from google.cloud import automl
client = automl.PredictionServiceClient()
response = client.batch_predict(
model_path,
input_config,
output_config,
params,
retry,
timeout,
metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
output_info = metadata.batch_predict_details.output_info
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
import json
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl prediction service batch predict', description='')
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--gcs-input-uris", dest="gcs_input_uris", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=_missing_arg)
_parser.add_argument("--bq-input-uri", dest="bq_input_uri", type=str, required=False, default=_missing_arg)
_parser.add_argument("--bq-output-uri", dest="bq_output_uri", type=str, required=False, default=_missing_arg)
_parser.add_argument("--params", dest="params", type=str, required=False, default=_missing_arg)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_prediction_service_batch_predict(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --model-path
- inputValue: model_path
- if:
cond:
isPresent: gcs_input_uris
then:
- --gcs-input-uris
- inputValue: gcs_input_uris
- if:
cond:
isPresent: gcs_output_uri_prefix
then:
- --gcs-output-uri-prefix
- inputValue: gcs_output_uri_prefix
- if:
cond:
isPresent: bq_input_uri
then:
- --bq-input-uri
- inputValue: bq_input_uri
- if:
cond:
isPresent: bq_output_uri
then:
- --bq-output-uri
- inputValue: bq_output_uri
- if:
cond:
isPresent: params
then:
- --params
- inputValue: params
- if:
cond:
isPresent: retry
then:
- --retry
- inputValue: retry
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: gcs_output_directory
- outputPath: bigquery_output_dataset

View File

@ -0,0 +1,59 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_split_dataset_table_column_names(
dataset_path: str,
target_column_name: str,
table_index: int = 0,
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
from google.cloud import automl
client = automl.AutoMlClient()
list_table_specs_response = client.list_table_specs(dataset_path)
table_specs = [s for s in list_table_specs_response]
print('table_specs=')
print(table_specs)
table_spec_name = table_specs[table_index].name
list_column_specs_response = client.list_column_specs(table_spec_name)
column_specs = [s for s in list_column_specs_response]
print('column_specs=')
print(column_specs)
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
feature_column_names = [s.name for s in feature_column_specs]
import json
return (target_column_spec.name, json.dumps(feature_column_names))
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_split_dataset_table_column_names_op = create_component_from_func(
automl_split_dataset_table_column_names,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml",
},
)
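
A minimal, hypothetical sketch of wiring this component's two outputs into a later step; the dataset path and target column are placeholders, and the downstream `automl_create_model_for_tables_op` is an assumed example, not something defined in this file.

```python
import kfp.dsl as dsl
from kfp import components

# Placeholder path: the component.yaml generated by the block above.
automl_split_columns_op = components.load_component_from_file('component.yaml')

@dsl.pipeline(name='AutoML column split (sketch)')
def split_columns_pipeline(
    dataset_path='projects/<project>/locations/us-central1/datasets/<dataset-id>',  # placeholder
    target_column_name='label',  # placeholder
):
    split_task = automl_split_columns_op(
        dataset_path=dataset_path,
        target_column_name=target_column_name,
    )
    # Hypothetical downstream training step consuming both outputs:
    # automl_create_model_for_tables_op(
    #     target_column_path=split_task.outputs['target_column_path'],
    #     input_feature_column_paths=split_task.outputs['feature_column_paths'],
    # )
```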

View File

@ -0,0 +1,95 @@
name: Automl split dataset table column names
inputs:
- name: dataset_path
type: String
- name: target_column_name
type: String
- name: table_index
type: Integer
default: '0'
optional: true
outputs:
- name: target_column_path
type: String
- name: feature_column_paths
type: JsonArray
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_split_dataset_table_column_names(
dataset_path: str,
target_column_name: str,
table_index: int = 0,
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
from google.cloud import automl
client = automl.AutoMlClient()
list_table_specs_response = client.list_table_specs(dataset_path)
table_specs = [s for s in list_table_specs_response]
print('table_specs=')
print(table_specs)
table_spec_name = table_specs[table_index].name
list_column_specs_response = client.list_column_specs(table_spec_name)
column_specs = [s for s in list_column_specs_response]
print('column_specs=')
print(column_specs)
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
feature_column_names = [s.name for s in feature_column_specs]
import json
return (target_column_spec.name, json.dumps(feature_column_names))
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
_parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_split_dataset_table_column_names(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- --target-column-name
- inputValue: target_column_name
- if:
cond:
isPresent: table_index
then:
- --table-index
- inputValue: table_index
- '----output-paths'
- outputPath: target_column_path
- outputPath: feature_column_paths

View File

@ -0,0 +1,176 @@
# Name
Component: Data processing by creating a cluster in Cloud Dataproc
# Label
Cloud Dataproc, Kubeflow
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Other
Technique:
Other
Input data type:
Tabular
ML workflow:
Data preparation
# Summary
A Kubeflow pipeline component to create a cluster in Cloud Dataproc.
# Details
## Intended use
Use this component at the start of a Kubeflow pipeline to create a temporary Cloud Dataproc cluster to run Cloud Dataproc jobs as steps in the pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|----------|-------------|----------|-----------|-----------------|---------|
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |
| region | The Cloud Dataproc region to create the cluster in. | No | GCPRegion | | |
| name | The name of the cluster. Cluster names within a project must be unique. You can reuse the names of deleted clusters. | Yes | String | | None |
| name_prefix | The prefix of the cluster name. | Yes | String | | None |
| initialization_actions | A list of Cloud Storage URIs identifying executables to run on each node after the configuration is completed. By default, executables are run on the master and all the worker nodes. | Yes | List | | None |
| config_bucket | The Cloud Storage bucket to use to stage the job dependencies, the configuration files, and the job driver console output. | Yes | GCSPath | | None |
| image_version | The version of the software inside the cluster. | Yes | String | | None |
| cluster | The full [cluster configuration](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster); see the sketch after this table. | Yes | Dict | | None |
| wait_interval | The number of seconds to pause before polling the operation. | Yes | Integer | | 30 |
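The following is a minimal, hypothetical sketch of passing a full cluster configuration through the `cluster` argument; the machine types and worker count are placeholders.
```python
import json

# Placeholder cluster configuration passed through the `cluster` argument.
# Field names follow the Dataproc Cluster REST resource linked above.
cluster_config = json.dumps({
    'config': {
        'masterConfig': {'numInstances': 1, 'machineTypeUri': 'n1-standard-4'},
        'workerConfig': {'numInstances': 2, 'machineTypeUri': 'n1-standard-4'},
    }
})

# Inside a pipeline function (see the example pipeline later in this document):
# dataproc_create_cluster_op(project_id=PROJECT_ID, region='us-central1', cluster=cluster_config)
```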
## Output
Name | Description | Type
:--- | :---------- | :---
cluster_name | The name of the cluster. | String
Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).
## Cautions & requirements
To use the component, you must:
* Set up the GCP project by following these [steps](https://cloud.google.com/dataproc/docs/guides/setup-project).
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the following types of access to the Kubeflow user service account:
* Read access to the Cloud Storage buckets which contain the initialization action files.
* The role, `roles/dataproc.editor`, on the project.
## Detailed description
This component creates a new Dataproc cluster by using the [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_create_cluster_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')
help(dataproc_create_cluster_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Set sample parameters
```python
# Required parameters
PROJECT_ID = '<Put your project ID here>'
# Optional parameters
EXPERIMENT_NAME = 'Dataproc - Create Cluster'
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc create cluster pipeline',
description='Dataproc create cluster pipeline'
)
def dataproc_create_cluster_pipeline(
project_id = PROJECT_ID,
region = 'us-central1',
name='',
name_prefix='',
initialization_actions='',
config_bucket='',
image_version='',
cluster='',
wait_interval='30'
):
dataproc_create_cluster_op(
project_id=project_id,
region=region,
name=name,
name_prefix=name_prefix,
initialization_actions=initialization_actions,
config_bucket=config_bucket,
image_version=image_version,
cluster=cluster,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
#Compile the pipeline
pipeline_func = dataproc_create_cluster_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)
* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,90 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_create_cluster
description: |
Creates a DataProc cluster under a project.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster belongs to.
type: GCPProjectID
- name: region
description: 'Required. The Cloud Dataproc region in which to handle the request.'
type: GCPRegion
- name: name
description: >-
Optional. The cluster name. Cluster names within a project must be unique. Names of
deleted clusters can be reused
default: ''
type: String
- name: name_prefix
description: 'Optional. The prefix of the cluster name.'
default: ''
type: String
- name: initialization_actions
description: >-
Optional. List of GCS URIs of executables to execute on each node after config
is completed. By default, executables are run on master and all worker nodes.
default: ''
type: List
- name: config_bucket
description: >-
Optional. A Google Cloud Storage bucket used to stage job dependencies, config
files, and job driver console output.
default: ''
type: GCSPath
- name: image_version
description: 'Optional. The version of software inside the cluster.'
default: ''
type: String
- name: cluster
description: >-
Optional. The full cluster config. See
[full details](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster)
default: ''
type: Dict
- name: wait_interval
default: '30'
description: 'Optional. The wait seconds between polling the operation. Defaults to 30.'
type: Integer
outputs:
- name: cluster_name
description: 'The cluster name of the created cluster.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, create_cluster,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--name, {inputValue: name},
--name_prefix, {inputValue: name_prefix},
--initialization_actions, {inputValue: initialization_actions},
--config_bucket, {inputValue: config_bucket},
--image_version, {inputValue: image_version},
--cluster, {inputValue: cluster},
--wait_interval, {inputValue: wait_interval},
--cluster_name_output_path, {outputPath: cluster_name},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,245 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Data processing by creating a cluster in Cloud Dataproc\n",
"\n",
"\n",
"# Label\n",
"Cloud Dataproc, cluster, GCP, Cloud Storage, KubeFlow, Pipeline\n",
"\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to create a cluster in Cloud Dataproc.\n",
"\n",
"# Details\n",
"## Intended use\n",
"\n",
"Use this component at the start of a Kubeflow Pipeline to create a temporary Cloud Dataproc cluster to run Cloud Dataproc jobs as steps in the pipeline.\n",
"\n",
"## Runtime arguments\n",
"\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------|-------------|----------|-----------|-----------------|---------|\n",
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |\n",
"| region | The Cloud Dataproc region to create the cluster in. | No | GCPRegion | | |\n",
"| name | The name of the cluster. Cluster names within a project must be unique. You can reuse the names of deleted clusters. | Yes | String | | None |\n",
"| name_prefix | The prefix of the cluster name. | Yes | String | | None |\n",
"| initialization_actions | A list of Cloud Storage URIs identifying executables to execute on each node after the configuration is completed. By default, executables are run on the master and all the worker nodes. | Yes | List | | None |\n",
"| config_bucket | The Cloud Storage bucket to use to stage the job dependencies, the configuration files, and the job driver consoles output. | Yes | GCSPath | | None |\n",
"| image_version | The version of the software inside the cluster. | Yes | String | | None |\n",
"| cluster | The full [cluster configuration](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster). | Yes | Dict | | None |\n",
"| wait_interval | The number of seconds to pause before polling the operation. | Yes | Integer | | 30 |\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"cluster_name | The name of the cluster. | String\n",
"\n",
"Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).\n",
"\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"* Set up the GCP project by following these [steps](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the following types of access to the Kubeflow user service account:\n",
" * Read access to the Cloud Storage buckets which contains initialization action files.\n",
" * The role, `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed description\n",
"\n",
"This component creates a new Dataproc cluster by using the [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create). \n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_create_cluster_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')\n",
"help(dataproc_create_cluster_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Required Parameters\n",
"PROJECT_ID = '<Please put your project ID here>'\n",
"\n",
"# Optional Parameters\n",
"EXPERIMENT_NAME = 'Dataproc - Create Cluster'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc create cluster pipeline',\n",
" description='Dataproc create cluster pipeline'\n",
")\n",
"def dataproc_create_cluster_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = 'us-central1', \n",
" name='', \n",
" name_prefix='',\n",
" initialization_actions='', \n",
" config_bucket='', \n",
" image_version='', \n",
" cluster='', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_create_cluster_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" name=name, \n",
" name_prefix=name_prefix, \n",
" initialization_actions=initialization_actions, \n",
" config_bucket=config_bucket, \n",
" image_version=image_version, \n",
" cluster=cluster, \n",
" wait_interval=wait_interval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_create_cluster_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)\n",
"* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,148 @@
# Name
Component: Data preparation by deleting a cluster in Cloud Dataproc
# Label
Cloud Dataproc, Kubeflow
# Summary
A Kubeflow pipeline component to delete a cluster in Cloud Dataproc.
## Intended use
Use this component in a Kubeflow pipeline to delete a temporary Cloud Dataproc cluster that was created to run Cloud Dataproc jobs as steps in the pipeline. This component is usually used with an [exit handler](https://github.com/kubeflow/pipelines/blob/master/samples/core/exit_handler/exit_handler.py) so that it runs at the end of the pipeline, as in the sketch below.
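The sketch below is a minimal, hypothetical example of that pattern. It loads the create-cluster and delete-cluster components from the URLs used elsewhere in this document; the cluster name is a placeholder, and the Dataproc job steps are omitted.
```python
import kfp.dsl as dsl
import kfp.components as comp

dataproc_create_cluster_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/create_cluster/component.yaml')
dataproc_delete_cluster_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/delete_cluster/component.yaml')

@dsl.pipeline(name='Ephemeral Dataproc cluster (sketch)')
def ephemeral_cluster_pipeline(project_id, region='us-central1', name='kfp-temp-cluster'):
    # The exit op runs after the steps inside the ExitHandler block finish,
    # whether they succeed or fail.
    delete_cluster_task = dataproc_delete_cluster_op(
        project_id=project_id, region=region, name=name)
    with dsl.ExitHandler(delete_cluster_task):
        dataproc_create_cluster_op(
            project_id=project_id, region=region, name=name)
        # ... add Dataproc job components here, after the create step ...
```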
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Technique:
Input data type:
ML workflow:
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|:----------|:-------------|:----------|:-----------|:-----------------|:---------|
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | - | - |
| region | The Cloud Dataproc region in which to handle the request. | No | GCPRegion | - | - |
| name | The name of the cluster to delete. | No | String | - | - |
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role, `roles/dataproc.editor`, on the project.
## Detailed description
This component deletes a Dataproc cluster by using [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_delete_cluster_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/delete_cluster/component.yaml')
help(dataproc_delete_cluster_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Prerequisites
[Create a Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) before running the sample code.
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
REGION = 'us-central1'
EXPERIMENT_NAME = 'Dataproc - Delete Cluster'
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc delete cluster pipeline',
description='Dataproc delete cluster pipeline'
)
def dataproc_delete_cluster_pipeline(
project_id = PROJECT_ID,
region = REGION,
name = CLUSTER_NAME
):
dataproc_delete_cluster_op(
project_id=project_id,
region=region,
name=name)
```
#### Compile the pipeline
```python
pipeline_func = dataproc_delete_cluster_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)
* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,49 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_delete_cluster
description: |
Deletes a DataProc cluster.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: name
description: 'Required. The cluster name to delete.'
type: String
- name: wait_interval
default: '30'
description: 'Optional. The wait seconds between polling the operation. Defaults to 30.'
type: Integer
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
kfp_component.google.dataproc, delete_cluster,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--name, {inputValue: name},
--wait_interval, {inputValue: wait_interval}
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"\n",
"Data preparation by deleting a cluster in Cloud Dataproc\n",
"\n",
"# Label\n",
"Cloud Dataproc, cluster, GCP, Cloud Storage, Kubeflow, Pipeline\n",
"\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to delete a cluster in Cloud Dataproc.\n",
"\n",
"## Intended use\n",
"Use this component at the start of a Kubeflow Pipeline to delete a temporary Cloud Dataproc \n",
"cluster to run Cloud Dataproc jobs as steps in the pipeline. This component is usually \n",
"used with an [exit handler](https://github.com/kubeflow/pipelines/blob/master/samples/core/exit_handler/exit_handler.py) to run at the end of a pipeline.\n",
"\n",
"\n",
"## Runtime arguments\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------|-------------|----------|-----------|-----------------|---------|\n",
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |\n",
"| region | The Cloud Dataproc region in which to handle the request. | No | GCPRegion | | |\n",
"| name | The name of the cluster to delete. | No | String | | |\n",
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
"\n",
"\n",
"## Cautions & requirements\n",
"To use the component, you must:\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed description\n",
"This component deletes a Dataproc cluster by using [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"1. Install the Kubeflow Pipeline SDK:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_delete_cluster_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/delete_cluster/component.yaml')\n",
"help(dataproc_delete_cluster_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"#### Prerequisites\n",
"\n",
"[Create a Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) before running the sample code.\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"\n",
"REGION = 'us-central1'\n",
"EXPERIMENT_NAME = 'Dataproc - Delete Cluster'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc delete cluster pipeline',\n",
" description='Dataproc delete cluster pipeline'\n",
")\n",
"def dataproc_delete_cluster_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" name = CLUSTER_NAME\n",
"):\n",
" dataproc_delete_cluster_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" name=name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_delete_cluster_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)\n",
"* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)\n",
"\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,205 @@
# Name
Component: Data preparation using Hadoop MapReduce on YARN with Cloud Dataproc
# Labels
Cloud Dataproc, Hadoop, YARN, Apache, MapReduce
# Summary
A Kubeflow pipeline component to prepare data by submitting an Apache Hadoop MapReduce job on Apache Hadoop YARN to Cloud Dataproc.
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Technique:
Input data type:
ML workflow:
# Details
## Intended use
Use the component to run an Apache Hadoop MapReduce job as one preprocessing step in a Kubeflow pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|----------|-------------|----------|-----------|-----------------|---------|
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | - | - |
| region | The Dataproc region to handle the request. | No | GCPRegion | - | - |
| cluster_name | The name of the cluster to run the job. | No | String | - | - |
| main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file containing the main class to execute. | No | GCSPath | - | - |
| main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `hadoop_job.jarFileUris`. | No | String |- | - |
| args | The arguments to pass to the driver. Do not include arguments, such as -libjars or -Dfoo=bar, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | - | None |
| hadoop_job | The payload of a [HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob). | Yes | Dict | - | None |
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | -| None |
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
Note:
`main_jar_file_uri`: The examples for the files are:
- `gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar`
- `hdfs:/tmp/test-samples/custom-wordcount.jar`
- `file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`
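As a minimal, hypothetical illustration, the sketch below submits a custom JAR from Cloud Storage instead of a class preinstalled on the cluster (the approach used in the sample further down). The project, cluster, bucket, and JAR path are placeholders, and the component is loaded from the URL used later in this document.
```python
import json
import kfp.dsl as dsl
import kfp.components as comp

dataproc_submit_hadoop_job_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hadoop_job/component.yaml')

@dsl.pipeline(name='Custom JAR Hadoop job (sketch)')
def custom_jar_pipeline(project_id, region='us-central1', cluster_name='my-cluster'):
    dataproc_submit_hadoop_job_op(
        project_id=project_id,
        region=region,
        cluster_name=cluster_name,
        main_jar_file_uri='gs://my-bucket/jars/wordcount.jar',  # placeholder
        args=json.dumps(['gs://my-bucket/input/', 'gs://my-bucket/output/']),  # placeholders
        wait_interval='30')
```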
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created job. | String
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role, `roles/dataproc.editor`, on the project.
## Detailed description
This component creates a Hadoop job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_submit_hadoop_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hadoop_job/component.yaml')
help(dataproc_submit_hadoop_job_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Set up a Dataproc cluster
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
#### Prepare a Hadoop job
Upload your Hadoop JAR file to a Cloud Storage bucket. The sample uses a JAR file that is preinstalled on the cluster, so you don't have to provide the argument `main_jar_file_uri`.
To package a self-contained Hadoop MapReduce application from the [WordCount example source code](https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordCount.java), follow the [MapReduce Tutorial](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html).
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
OUTPUT_GCS_PATH = '<Put your output GCS path here>'
REGION = 'us-central1'
MAIN_CLASS = 'org.apache.hadoop.examples.WordCount'
INPUT_GCS_PATH = 'gs://ml-pipeline-playground/shakespeare1.txt'
EXPERIMENT_NAME = 'Dataproc - Submit Hadoop Job'
```
#### Inspect the input data
The input file is a simple text file:
```python
!gsutil cat $INPUT_GCS_PATH
```
#### Clean up the existing output files (optional)
This step is needed because the sample code requires the output folder to be empty. To continue to run the sample, make sure that the service account of the notebook server has access to `OUTPUT_GCS_PATH`.
Caution: This will remove all blob files under `OUTPUT_GCS_PATH`.
```python
!gsutil rm $OUTPUT_GCS_PATH/**
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc submit Hadoop job pipeline',
description='Dataproc submit Hadoop job pipeline'
)
def dataproc_submit_hadoop_job_pipeline(
project_id = PROJECT_ID,
region = REGION,
cluster_name = CLUSTER_NAME,
main_jar_file_uri = '',
main_class = MAIN_CLASS,
args = json.dumps([
INPUT_GCS_PATH,
OUTPUT_GCS_PATH
]),
hadoop_job='',
job='{}',
wait_interval='30'
):
dataproc_submit_hadoop_job_op(
project_id=project_id,
region=region,
cluster_name=cluster_name,
main_jar_file_uri=main_jar_file_uri,
main_class=main_class,
args=args,
hadoop_job=hadoop_job,
job=job,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = dataproc_submit_hadoop_job_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
#### Inspect the output
The sample in the notebook will count the words in the input text and save them in sharded files. The command to inspect the output is:
```python
!gsutil cat $OUTPUT_GCS_PATH/*
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)
* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)
# License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,100 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_hadoop_job
description: >-
Submits a Cloud Dataproc job for running Apache Hadoop MapReduce jobs on
Apache Hadoop YARN.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster
belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: cluster_name
description: 'Required. The cluster to run the job.'
type: String
- name: main_jar_file_uri
default: ''
description: >-
The HCFS URI of the jar file containing the main class. Examples:
`gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar`
`hdfs:/tmp/test-samples/custom-wordcount.jar`
`file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`
type: GCSPath
- name: main_class
default: ''
description: >-
The name of the driver's main class. The jar file
containing the class must be in the default CLASSPATH or specified
in `jarFileUris`.
type: String
- name: args
default: ''
description: >-
Optional. The arguments to pass to the driver. Do not include
arguments, such as -libjars or -Dfoo=bar, that can be set as job properties,
since a collision may occur that causes an incorrect job submission.
type: List
- name: hadoop_job
default: ''
description: >-
Optional. The full payload of a
[hadoop job](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob).
type: Dict
- name: job
default: ''
description: >-
Optional. The full payload of a
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
type: Dict
- name: wait_interval
default: '30'
description: >-
    Optional. The number of seconds to wait between polling the operation.
Defaults to 30.
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, submit_hadoop_job,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--cluster_name, {inputValue: cluster_name},
--main_jar_file_uri, {inputValue: main_jar_file_uri},
--main_class, {inputValue: main_class},
--args, {inputValue: args},
--hadoop_job, {inputValue: hadoop_job},
--job, {inputValue: job},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,313 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Data preparation using Hadoop MapReduce on YARN with Cloud Dataproc\n",
"\n",
"# Label\n",
"Cloud Dataproc, GCP, Cloud Storage, Hadoop, YARN, Apache, MapReduce\n",
"\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to prepare data by submitting an Apache Hadoop MapReduce job on Apache Hadoop YARN to Cloud Dataproc.\n",
"\n",
"# Details\n",
"## Intended use\n",
"Use the component to run an Apache Hadoop MapReduce job as one preprocessing step in a Kubeflow Pipeline. \n",
"\n",
"## Runtime arguments\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------|-------------|----------|-----------|-----------------|---------|\n",
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |\n",
"| region | The Dataproc region to handle the request. | No | GCPRegion | | |\n",
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
"| main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file containing the main class to execute. | No | List | | |\n",
"| main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `hadoop_job.jarFileUris`. | No | String | | |\n",
"| args | The arguments to pass to the driver. Do not include arguments, such as -libjars or -Dfoo=bar, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | | None |\n",
"| hadoop_job | The payload of a [HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob). | Yes | Dict | | None |\n",
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
"\n",
"Note: \n",
"`main_jar_file_uri`: The examples for the files are : \n",
"- `gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar` \n",
"- `hdfs:/tmp/test-samples/custom-wordcount.jarfile:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`\n",
"\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created job. | String\n",
"\n",
"## Cautions & requirements\n",
"To use the component, you must:\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed description\n",
"\n",
"This component creates a Hadoop job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_submit_hadoop_job_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hadoop_job/component.yaml')\n",
"help(dataproc_submit_hadoop_job_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sample\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"\n",
"### Setup a Dataproc cluster\n",
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
"\n",
"\n",
"### Prepare a Hadoop job\n",
"Upload your Hadoop JAR file to a Cloud Storage bucket. In the sample, we will use a JAR file that is preinstalled in the main cluster, so there is no need to provide `main_jar_file_uri`. \n",
"\n",
"Here is the [WordCount example source code](https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordCount.java).\n",
"\n",
"To package a self-contained Hadoop MapReduce application from the source code, follow the [MapReduce Tutorial](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html).\n",
"\n",
"\n",
"### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"OUTPUT_GCS_PATH = '<Please put your output GCS path here>'\n",
"REGION = 'us-central1'\n",
"MAIN_CLASS = 'org.apache.hadoop.examples.WordCount'\n",
"INTPUT_GCS_PATH = 'gs://ml-pipeline-playground/shakespeare1.txt'\n",
"EXPERIMENT_NAME = 'Dataproc - Submit Hadoop Job'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Insepct Input Data\n",
"The input file is a simple text file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil cat $INTPUT_GCS_PATH"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clean up the existing output files (optional)\n",
"This is needed because the sample code requires the output folder to be a clean folder. To continue to run the sample, make sure that the service account of the notebook server has access to the `OUTPUT_GCS_PATH`.\n",
"\n",
"CAUTION: This will remove all blob files under `OUTPUT_GCS_PATH`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil rm $OUTPUT_GCS_PATH/**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc submit Hadoop job pipeline',\n",
" description='Dataproc submit Hadoop job pipeline'\n",
")\n",
"def dataproc_submit_hadoop_job_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" cluster_name = CLUSTER_NAME,\n",
" main_jar_file_uri = '',\n",
" main_class = MAIN_CLASS,\n",
" args = json.dumps([\n",
" INTPUT_GCS_PATH,\n",
" OUTPUT_GCS_PATH\n",
" ]), \n",
" hadoop_job='', \n",
" job='{}', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_submit_hadoop_job_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" cluster_name=cluster_name, \n",
" main_jar_file_uri=main_jar_file_uri, \n",
" main_class=main_class,\n",
" args=args, \n",
" hadoop_job=hadoop_job, \n",
" job=job, \n",
" wait_interval=wait_interval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_submit_hadoop_job_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Inspect the output\n",
"The sample in the notebook will count the words in the input text and save them in sharded files. The command to inspect the output is:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil cat $OUTPUT_GCS_PATH/*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)\n",
"* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,189 @@
# Name
Component: Data preparation using Apache Hive on YARN with Cloud Dataproc
# Label
Cloud Dataproc, YARN, Apache Hive
# Summary
A Kubeflow pipeline component to prepare data by submitting an Apache Hive job on YARN to Cloud Dataproc.
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Technique:
Input data type:
ML workflow:
# Details
## Intended use
Use the component to run an Apache Hive job as one preprocessing step in a Kubeflow pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|----------|-------------|----------|-----------|-----------------|---------|
| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectID | | |
| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |
| cluster_name | The name of the cluster to run the job. | No | String | | |
| queries | The queries to execute in the Hive job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None |
| query_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the script that contains the Hive queries. | Yes | GCSPath | | None |
| script_variables | Mapping of the query's variable names to their values (equivalent to the Hive command: SET name="value";). | Yes | Dict | | None |
| hive_job | The payload of a [Hive job](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob) | Yes | Dict | | None |
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created job. | String
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.
## Detailed description
This component creates a Hive job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_submit_hive_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hive_job/component.yaml')
help(dataproc_submit_hive_job_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Setup a Dataproc cluster
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
#### Prepare a Hive query
You can put your Hive queries in the `queries` list, or you can use `query_file_uri` (see the sketch after the sample parameters below). In this sample, we will use a hard-coded query in the `queries` list to select data from a public CSV file in Cloud Storage.
For more details, see the [Hive language manual.](https://cwiki.apache.org/confluence/display/Hive/LanguageManual)
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
REGION = 'us-central1'
QUERY = '''
DROP TABLE IF EXISTS natality_csv;
CREATE EXTERNAL TABLE natality_csv (
source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,
state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,
plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,
mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,
gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,
mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,
alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,
born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,
ever_born BIGINT, father_race BIGINT, father_age BIGINT,
record_weight BIGINT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'gs://public-datasets/natality/csv';
SELECT * FROM natality_csv LIMIT 10;'''
EXPERIMENT_NAME = 'Dataproc - Submit Hive Job'
```
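As an alternative to the inline `queries` list, the query can be stored in Cloud Storage and referenced through `query_file_uri`. The snippet below is only a sketch; the bucket path is hypothetical, and you would pass `QUERY_FILE_URI` to the pipeline's `query_file_uri` parameter while leaving `queries` empty.
```python
# Hypothetical query file location; replace with a bucket you own.
QUERY_FILE_URI = 'gs://<your-bucket>/queries/natality.hql'

# Write the query to a local file and copy it to Cloud Storage.
with open('natality.hql', 'w') as f:
    f.write(QUERY)

!gsutil cp natality.hql $QUERY_FILE_URI
```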
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc submit Hive job pipeline',
description='Dataproc submit Hive job pipeline'
)
def dataproc_submit_hive_job_pipeline(
project_id = PROJECT_ID,
region = REGION,
cluster_name = CLUSTER_NAME,
queries = json.dumps([QUERY]),
query_file_uri = '',
script_variables = '',
hive_job='',
job='',
wait_interval='30'
):
dataproc_submit_hive_job_op(
project_id=project_id,
region=region,
cluster_name=cluster_name,
queries=queries,
query_file_uri=query_file_uri,
script_variables=script_variables,
hive_job=hive_job,
job=job,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = dataproc_submit_hive_job_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)
* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,95 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_hive_job
description: >-
Submits a Cloud Dataproc job for running Apache Hive queries on YARN.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster
belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: cluster_name
description: 'Required. The cluster to run the job.'
type: String
- name: queries
default: ''
description: >-
Required. The queries to execute. You do not need to
terminate a query with a semicolon. Multiple queries can be specified
in one string by separating each with a semicolon.
type: List
- name: query_file_uri
default: ''
description: >-
The HCFS URI of the script that contains Hive queries.
type: GCSPath
- name: script_variables
default: ''
description: >-
Optional. Mapping of query variable names to
values (equivalent to the Hive command: SET name="value";).
type: Dict
- name: hive_job
default: ''
description: >-
Optional. The full payload of a
[HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob).
type: Dict
- name: job
default: ''
description: >-
Optional. The full payload of a
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
type: Dict
- name: wait_interval
default: '30'
description: >-
    Optional. The number of seconds to wait between polling the operation.
Defaults to 30.
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, submit_hive_job,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--cluster_name, {inputValue: cluster_name},
--queries, {inputValue: queries},
--query_file_uri, {inputValue: query_file_uri},
--script_variables, {inputValue: script_variables},
--hive_job, {inputValue: hive_job},
--job, {inputValue: job},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,264 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Data preparation using Apache Hive on YARN with Cloud Dataproc\n",
"\n",
"# Label\n",
"Cloud Dataproc, GCP, Cloud Storage, YARN, Hive, Apache\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to prepare data by submitting an Apache Hive job on YARN to Cloud Dataproc.\n",
"\n",
"# Details\n",
"## Intended use\n",
"Use the component to run an Apache Hive job as one preprocessing step in a Kubeflow Pipeline.\n",
"\n",
"## Runtime arguments\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------|-------------|----------|-----------|-----------------|---------|\n",
"| project_id | The Google Cloud Platform (GCP) project ID that the cluster belongs to. | No | GCPProjectId | | |\n",
"| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |\n",
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
"| queries | The queries to execute the Hive job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None |\n",
"| query_file_uri | The HCFS URI of the script that contains the Hive queries. | Yes | GCSPath | | None |\n",
"| script_variables | Mapping of the querys variable names to their values (equivalent to the Hive command: SET name=\"value\";). | Yes | Dict | | None |\n",
"| hive_job | The payload of a [HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob) | Yes | Dict | | None |\n",
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created job. | String\n",
"\n",
"## Cautions & requirements\n",
"To use the component, you must:\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed description\n",
"This component creates a Hive job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"1. Install the Kubeflow Pipeline SDK:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_submit_hive_job_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_hive_job/component.yaml')\n",
"help(dataproc_submit_hive_job_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"\n",
"#### Setup a Dataproc cluster\n",
"\n",
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
"\n",
"#### Prepare a Hive query\n",
"\n",
"Put your Hive queries in the queries list, or upload your Hive queries into a file saved in a Cloud Storage bucket and then enter the Cloud Storage buckets path in `query_file_uri.` In this sample, we will use a hard coded query in the queries list to select data from a public CSV file from Cloud Storage.\n",
"\n",
"For more details, see the [Hive language manual.](https://cwiki.apache.org/confluence/display/Hive/LanguageManual)\n",
"\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"REGION = 'us-central1'\n",
"QUERY = '''\n",
"DROP TABLE IF EXISTS natality_csv;\n",
"CREATE EXTERNAL TABLE natality_csv (\n",
" source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,\n",
" state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,\n",
" plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,\n",
" mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,\n",
" gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,\n",
" mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,\n",
" alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,\n",
" born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,\n",
" ever_born BIGINT, father_race BIGINT, father_age BIGINT,\n",
" record_weight BIGINT\n",
")\n",
"ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n",
"LOCATION 'gs://public-datasets/natality/csv';\n",
"\n",
"SELECT * FROM natality_csv LIMIT 10;'''\n",
"EXPERIMENT_NAME = 'Dataproc - Submit Hive Job'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc submit Hive job pipeline',\n",
" description='Dataproc submit Hive job pipeline'\n",
")\n",
"def dataproc_submit_hive_job_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" cluster_name = CLUSTER_NAME,\n",
" queries = json.dumps([QUERY]),\n",
" query_file_uri = '',\n",
" script_variables = '', \n",
" hive_job='', \n",
" job='', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_submit_hive_job_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" cluster_name=cluster_name, \n",
" queries=queries, \n",
" query_file_uri=query_file_uri,\n",
" script_variables=script_variables, \n",
" hive_job=hive_job, \n",
" job=job, \n",
" wait_interval=wait_interval)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_submit_hive_job_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)\n",
"* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,185 @@
# Name
Component: Data preparation using Apache Pig on YARN with Cloud Dataproc
# Labels
Cloud Dataproc, YARN, Apache Pig, Kubeflow
# Summary
A Kubeflow pipeline component to prepare data by submitting an Apache Pig job on YARN to Cloud Dataproc.
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Other
Technique:
Other
Input data type:
Tabular
ML workflow:
Data preparation
# Details
## Intended use
Use this component to run an Apache Pig job as one preprocessing step in a Kubeflow pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|:----------|:-------------|:----------|:-----------|:-----------------|:---------|
| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID |- | -|
| region | The Cloud Dataproc region that handles the request. | No | GCPRegion | - |- |
| cluster_name | The name of the cluster that runs the job. | No | String | - | - |
| queries | The queries to execute in the Pig job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | - | None |
| query_file_uri | The Cloud Storage bucket path pointing to a file that contains the Pig queries. | Yes | GCSPath | - | None |
| script_variables | Mapping of the query's variable names to their values (equivalent to the Pig command: SET name="value";). | Yes | Dict | - | None |
| pig_job | The payload of a [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob). | Yes | Dict | - | None |
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created job. | String
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role, `roles/dataproc.editor`, on the project.
## Detailed description
This component creates a Pig job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_submit_pig_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pig_job/component.yaml')
help(dataproc_submit_pig_job_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Setup a Dataproc cluster
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
#### Prepare a Pig query
You can put your Pig queries in the `queries` list, or you can use `query_file_uri`. In this sample, we will use a hard-coded query in the `queries` list to select data from a public CSV file in Cloud Storage.
For more details on Apache Pig, see the [Pig documentation.](http://pig.apache.org/docs/latest/)
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
REGION = 'us-central1'
QUERY = '''
natality_csv = load 'gs://public-datasets/natality/csv' using PigStorage(':');
top_natality_csv = LIMIT natality_csv 10;
dump top_natality_csv;'''
EXPERIMENT_NAME = 'Dataproc - Submit Pig Job'
```
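If your Pig script relies on `SET` statements, they can be supplied through the `script_variables` argument as a JSON-serialized dictionary instead of being embedded in the query. The property below is only an illustration; pass `SCRIPT_VARIABLES` to the pipeline's `script_variables` parameter if you use it.
```python
import json

# Illustrative only: equivalent to `SET job.name 'natality-sample';` in the Pig script.
SCRIPT_VARIABLES = json.dumps({'job.name': 'natality-sample'})
```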
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc submit Pig job pipeline',
description='Dataproc submit Pig job pipeline'
)
def dataproc_submit_pig_job_pipeline(
project_id = PROJECT_ID,
region = REGION,
cluster_name = CLUSTER_NAME,
queries = json.dumps([QUERY]),
query_file_uri = '',
script_variables = '',
pig_job='',
job='',
wait_interval='30'
):
dataproc_submit_pig_job_op(
project_id=project_id,
region=region,
cluster_name=cluster_name,
queries=queries,
query_file_uri=query_file_uri,
script_variables=script_variables,
pig_job=pig_job,
job=job,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = dataproc_submit_pig_job_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster)
* [Pig documentation](http://pig.apache.org/docs/latest/)
* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)
* [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,95 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_pig_job
description: >-
Submits a Cloud Dataproc job for running Apache Pig queries on YARN.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster
belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: cluster_name
description: 'Required. The cluster to run the job.'
type: String
- name: queries
default: ''
description: >-
Required. The queries to execute. You do not need to
terminate a query with a semicolon. Multiple queries can be specified
in one string by separating each with a semicolon.
type: List
- name: query_file_uri
default: ''
description: >-
The HCFS URI of the script that contains Pig queries.
type: GCSPath
- name: script_variables
default: ''
description: >-
Optional. Mapping of query variable names to
values (equivalent to the Pig command: SET name="value";).
type: Dict
- name: pig_job
default: ''
description: >-
Optional. The full payload of a
[PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob).
type: Dict
- name: job
default: ''
description: >-
Optional. The full payload of a
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
type: Dict
- name: wait_interval
default: '30'
description: >-
    Optional. The number of seconds to wait between polling the operation.
Defaults to 30.
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, submit_pig_job,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--cluster_name, {inputValue: cluster_name},
--queries, {inputValue: queries},
--query_file_uri, {inputValue: query_file_uri},
--script_variables, {inputValue: script_variables},
--pig_job, {inputValue: pig_job},
--job, {inputValue: job},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Data preparation using Apache Pig on YARN with Cloud Dataproc\n",
"\n",
"# Label\n",
"Cloud Dataproc, GCP, Cloud Storage, YARN, Pig, Apache, Kubeflow, pipelines, components\n",
"\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to prepare data by submitting an Apache Pig job on YARN to Cloud Dataproc.\n",
"\n",
"\n",
"# Details\n",
"## Intended use\n",
"Use the component to run an Apache Pig job as one preprocessing step in a Kubeflow Pipeline.\n",
"\n",
"## Runtime arguments\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------|-------------|----------|-----------|-----------------|---------|\n",
"| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID | | |\n",
"| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |\n",
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
"| queries | The queries to execute the Pig job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None |\n",
"| query_file_uri | The HCFS URI of the script that contains the Pig queries. | Yes | GCSPath | | None |\n",
"| script_variables | Mapping of the querys variable names to their values (equivalent to the Pig command: SET name=\"value\";). | Yes | Dict | | None |\n",
"| pig_job | The payload of a [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob). | Yes | Dict | | None |\n",
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
"| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | | 30 |\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created job. | String\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed description\n",
"This component creates a Pig job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"1. Install the Kubeflow Pipeline SDK:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_submit_pig_job_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pig_job/component.yaml')\n",
"help(dataproc_submit_pig_job_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"\n",
"#### Setup a Dataproc cluster\n",
"\n",
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
"\n",
"\n",
"#### Prepare a Pig query\n",
"\n",
"Either put your Pig queries in the `queries` list, or upload your Pig queries into a file to a Cloud Storage bucket and then enter the Cloud Storage buckets path in `query_file_uri`. In this sample, we will use a hard coded query in the `queries` list to select data from a local `passwd` file.\n",
"\n",
"For more details on Apache Pig, see the [Pig documentation.](http://pig.apache.org/docs/latest/)\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"\n",
"REGION = 'us-central1'\n",
"QUERY = '''\n",
"natality_csv = load 'gs://public-datasets/natality/csv' using PigStorage(':');\n",
"top_natality_csv = LIMIT natality_csv 10; \n",
"dump natality_csv;'''\n",
"EXPERIMENT_NAME = 'Dataproc - Submit Pig Job'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc submit Pig job pipeline',\n",
" description='Dataproc submit Pig job pipeline'\n",
")\n",
"def dataproc_submit_pig_job_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" cluster_name = CLUSTER_NAME,\n",
" queries = json.dumps([QUERY]),\n",
" query_file_uri = '',\n",
" script_variables = '', \n",
" pig_job='', \n",
" job='', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_submit_pig_job_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" cluster_name=cluster_name, \n",
" queries=queries, \n",
" query_file_uri=query_file_uri,\n",
" script_variables=script_variables, \n",
" pig_job=pig_job, \n",
" job=job, \n",
" wait_interval=wait_interval)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_submit_pig_job_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) \n",
"* [Pig documentation](http://pig.apache.org/docs/latest/)\n",
"* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)\n",
"* [PigJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,178 @@
# Name
Component: Data preparation using PySpark on Cloud Dataproc
# Labels
Cloud Dataproc, PySpark, Kubeflow
# Summary
A Kubeflow Pipeline component to prepare data by submitting a PySpark job to Cloud Dataproc.
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Technique:
Input data type:
ML workflow:
# Details
## Intended use
Use this component to run an Apache PySpark job as one preprocessing step in a Kubeflow pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|:----------------------|:------------|:----------|:--------------|:-----------------|:---------|
| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID | - | - |
| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | - | - |
| cluster_name | The name of the cluster to run the job. | No | String | - | - |
| main_python_file_uri | The HCFS URI of the Python file to use as the driver. This must be a .py file. | No | GCSPath | - | - |
| args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | - | None |
| pyspark_job | The payload of a [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob). | Yes | Dict | - | None |
| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | - | None |
| wait_interval | The number of seconds to pause between polling the operation. | Yes | Integer | - | 30 |
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created job. | String
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
* Ensure that the component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.
## Detailed description
This component creates a PySpark job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_submit_pyspark_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pyspark_job/component.yaml')
help(dataproc_submit_pyspark_job_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Setup a Dataproc cluster
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
#### Prepare a PySpark job
Upload your PySpark code file to a Cloud Storage bucket. For example, this is a publicly accessible `hello-world.py` in Cloud Storage:
```python
!gsutil cat gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py
```
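For reference, a minimal PySpark driver looks roughly like the sketch below; the actual contents of `hello-world.py` may differ.
```python
# Sketch of a minimal PySpark driver (not necessarily identical to hello-world.py).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('hello-world').getOrCreate()
rdd = spark.sparkContext.parallelize(['Hello,', 'world!'])
print(' '.join(rdd.collect()))
spark.stop()
```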
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
REGION = 'us-central1'
PYSPARK_FILE_URI = 'gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py'
ARGS = ''
EXPERIMENT_NAME = 'Dataproc - Submit PySpark Job'
```
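In this sample the driver takes no arguments, so `ARGS` is left empty. If your own driver accepted arguments, they would be passed as a JSON-serialized list; the variable name and flags below are purely hypothetical.
```python
import json

# Hypothetical driver flags; this sample's hello-world.py does not read any arguments.
# Pass a value like this as the pipeline's `args` parameter when your driver expects flags.
EXAMPLE_ARGS = json.dumps(['--input', 'gs://<your-bucket>/input.csv'])
```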
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc submit PySpark job pipeline',
description='Dataproc submit PySpark job pipeline'
)
def dataproc_submit_pyspark_job_pipeline(
project_id = PROJECT_ID,
region = REGION,
cluster_name = CLUSTER_NAME,
main_python_file_uri = PYSPARK_FILE_URI,
args = ARGS,
pyspark_job='{}',
job='{}',
wait_interval='30'
):
dataproc_submit_pyspark_job_op(
project_id=project_id,
region=region,
cluster_name=cluster_name,
main_python_file_uri=main_python_file_uri,
args=args,
pyspark_job=pyspark_job,
job=job,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = dataproc_submit_pyspark_job_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster)
* [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob)
* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,88 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_pyspark_job
description: >-
Submits a Cloud Dataproc job for running Apache PySpark applications on YARN.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster
belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: cluster_name
description: 'Required. The cluster to run the job.'
type: String
- name: main_python_file_uri
description: >-
Required. The HCFS URI of the main Python file to
use as the driver. Must be a .py file.
type: GCSPath
- name: args
default: ''
description: >-
Optional. The arguments to pass to the driver. Do not include
arguments, such as --conf, that can be set as job properties, since a
collision may occur that causes an incorrect job submission.
type: List
- name: pyspark_job
default: ''
description: >-
Optional. The full payload of a
[PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob).
type: Dict
- name: job
default: ''
description: >-
Optional. The full payload of a
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
type: Dict
- name: wait_interval
default: '30'
description: >-
Optional. The wait seconds between polling the operation.
Defaults to 30.
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, submit_pyspark_job,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--cluster_name, {inputValue: cluster_name},
--main_python_file_uri, {inputValue: main_python_file_uri},
--args, {inputValue: args},
--pyspark_job, {inputValue: pyspark_job},
--job, {inputValue: job},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,263 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Data preparation using PySpark on Cloud Dataproc\n",
"\n",
"\n",
"# Label\n",
"Cloud Dataproc, GCP, Cloud Storage,PySpark, Kubeflow, pipelines, components\n",
"\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to prepare data by submitting a PySpark job to Cloud Dataproc.\n",
"\n",
"\n",
"# Details\n",
"## Intended use\n",
"Use the component to run an Apache PySpark job as one preprocessing step in a Kubeflow Pipeline.\n",
"\n",
"\n",
"## Runtime arguments\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------------------|------------|----------|--------------|-----------------|---------|\n",
"| project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No | GCPProjectID | | |\n",
"| region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |\n",
"| cluster_name | The name of the cluster to run the job. | No | String | | |\n",
"| main_python_file_uri | The HCFS URI of the Python file to use as the driver. This must be a .py file. | No | GCSPath | | |\n",
"| args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission. | Yes | List | | None |\n",
"| pyspark_job | The payload of a [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob). | Yes | Dict | | None |\n",
"| job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created job. | String\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed description\n",
"\n",
"This component creates a PySpark job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_submit_pyspark_job_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_pyspark_job/component.yaml')\n",
"help(dataproc_submit_pyspark_job_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"\n",
"#### Setup a Dataproc cluster\n",
"\n",
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
"\n",
"\n",
"#### Prepare a PySpark job\n",
"\n",
"Upload your PySpark code file to a Cloud Storage bucket. For example, this is a publicly accessible `hello-world.py` in Cloud Storage:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil cat gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"REGION = 'us-central1'\n",
"PYSPARK_FILE_URI = 'gs://dataproc-examples-2f10d78d114f6aaec76462e3c310f31f/src/pyspark/hello-world/hello-world.py'\n",
"ARGS = ''\n",
"EXPERIMENT_NAME = 'Dataproc - Submit PySpark Job'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc submit PySpark job pipeline',\n",
" description='Dataproc submit PySpark job pipeline'\n",
")\n",
"def dataproc_submit_pyspark_job_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" cluster_name = CLUSTER_NAME,\n",
" main_python_file_uri = PYSPARK_FILE_URI, \n",
" args = ARGS, \n",
" pyspark_job='{}', \n",
" job='{}', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_submit_pyspark_job_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" cluster_name=cluster_name, \n",
" main_python_file_uri=main_python_file_uri, \n",
" args=args, \n",
" pyspark_job=pyspark_job, \n",
" job=job, \n",
" wait_interval=wait_interval)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_submit_pyspark_job_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"* [Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) \n",
"* [PySparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob)\n",
"* [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,200 @@
# Name
Component: Data preparation using Spark on YARN with Cloud Dataproc
# Labels
Spark, Kubeflow, YARN
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Other
Technique:
Other
Input data type:
Tabular
ML workflow:
Data preparation
# Summary
A Kubeflow pipeline component to prepare data by submitting a Spark job on YARN to Cloud Dataproc.
# Details
## Intended use
Use the component to run an Apache Spark job as one preprocessing step in a Kubeflow pipeline.
## Runtime arguments
Argument | Description | Optional | Data type | Accepted values | Default |
:--- | :---------- | :--- | :------- | :------| :------|
project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to.|No | GCPProjectID | | |
region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | |
cluster_name | The name of the cluster to run the job. | No | String | | |
main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file that contains the main class. | No | GCSPath | | |
main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `spark_job.jarFileUris`.| No | String | | |
args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission.| Yes | List | | |
spark_job | The payload of a [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).| Yes | Dict | | |
job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | |
wait_interval | The number of seconds to wait between polling the operation. | Yes | Integer | | 30 |
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created job. | String
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project (a hedged example of doing this follows this list).
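A minimal sketch of granting that role with the `gcloud` CLI from a notebook cell; both values below are placeholders that you must replace with your own project ID and Kubeflow user service account email.
```python
#Placeholders: replace with your project ID and Kubeflow user service account email
PROJECT_ID = '<Put your project ID here>'
KF_USER_SA = '<Put your Kubeflow user service account email here>'
!gcloud projects add-iam-policy-binding {PROJECT_ID} --member=serviceAccount:{KF_USER_SA} --role=roles/dataproc.editor
```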
## Detailed description
This component creates a Spark job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_submit_spark_job_op = comp.load_component_from_url(
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_spark_job/component.yaml')
help(dataproc_submit_spark_job_op)
```
### Sample
Note: The following sample code works in an IPython notebook or directly in Python code.
#### Set up a Dataproc cluster
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
#### Prepare a Spark job
Upload your Spark JAR file to a Cloud Storage bucket. In the sample, we use a JAR file that is preinstalled in the main cluster: `file:///usr/lib/spark/examples/jars/spark-examples.jar`.
Here is the [source code of the sample](https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java).
To package a self-contained Spark application, follow these [instructions](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications).
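If you build your own application JAR instead of using the preinstalled example, upload it to Cloud Storage first and reference it from `main_jar_file_uri` or `spark_job['jarFileUris']`. A minimal sketch with placeholder paths:
```python
#Placeholders: replace with your locally built JAR and your own bucket
LOCAL_JAR = 'target/my-spark-app.jar'
GCS_JAR_URI = 'gs://<your-bucket>/jars/my-spark-app.jar'
!gsutil cp {LOCAL_JAR} {GCS_JAR_URI}
```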
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
REGION = 'us-central1'
SPARK_FILE_URI = 'file:///usr/lib/spark/examples/jars/spark-examples.jar'
MAIN_CLASS = 'org.apache.spark.examples.SparkPi'
ARGS = ['1000']
EXPERIMENT_NAME = 'Dataproc - Submit Spark Job'
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc submit Spark job pipeline',
description='Dataproc submit Spark job pipeline'
)
def dataproc_submit_spark_job_pipeline(
project_id = PROJECT_ID,
region = REGION,
cluster_name = CLUSTER_NAME,
main_jar_file_uri = '',
main_class = MAIN_CLASS,
args = json.dumps(ARGS),
spark_job=json.dumps({ 'jarFileUris': [ SPARK_FILE_URI ] }),
job='{}',
wait_interval='30'
):
dataproc_submit_spark_job_op(
project_id=project_id,
region=region,
cluster_name=cluster_name,
main_jar_file_uri=main_jar_file_uri,
main_class=main_class,
args=args,
spark_job=spark_job,
job=job,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
#Compile the pipeline
pipeline_func = dataproc_submit_spark_job_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)
* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,96 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_spark_job
description: >-
Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster
belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: cluster_name
description: 'Required. The cluster to run the job.'
type: String
- name: main_jar_file_uri
default: ''
description: >-
The HCFS URI of the jar file that contains the main class.
type: GCSPath
- name: main_class
default: ''
description: >-
The name of the driver's main class. The jar file that
contains the class must be in the default CLASSPATH or specified in
jarFileUris.
type: String
- name: args
default: ''
description: >-
Optional. The arguments to pass to the driver. Do not include
arguments, such as --conf, that can be set as job properties, since a
collision may occur that causes an incorrect job submission.
type: List
- name: spark_job
default: ''
description: >-
Optional. The full payload of a
[SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
type: Dict
- name: job
default: ''
description: >-
Optional. The full payload of a
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
type: Dict
- name: wait_interval
default: '30'
description: >-
Optional. The wait seconds between polling the operation.
Defaults to 30.
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, submit_spark_job,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--cluster_name, {inputValue: cluster_name},
--main_jar_file_uri, {inputValue: main_jar_file_uri},
--main_class, {inputValue: main_class},
--args, {inputValue: args},
--spark_job, {inputValue: spark_job},
--job, {inputValue: job},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,266 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"\n",
"Data preparation using Spark on YARN with Cloud Dataproc\n",
"\n",
"\n",
"# Label\n",
"\n",
"Cloud Dataproc, GCP, Cloud Storage, Spark, Kubeflow, pipelines, components, YARN\n",
"\n",
"\n",
"# Summary\n",
"\n",
"A Kubeflow Pipeline component to prepare data by submitting a Spark job on YARN to Cloud Dataproc.\n",
"\n",
"# Details\n",
"\n",
"## Intended use\n",
"\n",
"Use the component to run an Apache Spark job as one preprocessing step in a Kubeflow Pipeline.\n",
"\n",
"## Runtime arguments\n",
"Argument | Description | Optional | Data type | Accepted values | Default |\n",
":--- | :---------- | :--- | :------- | :------| :------| \n",
"project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to.|No | GCPProjectID | | |\n",
"region | The Cloud Dataproc region to handle the request. | No | GCPRegion | | | \n",
"cluster_name | The name of the cluster to run the job. | No | String | | |\n",
"main_jar_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the JAR file that contains the main class. | No | GCSPath | | |\n",
"main_class | The name of the driver's main class. The JAR file that contains the class must be either in the default CLASSPATH or specified in `spark_job.jarFileUris`.| No | | | | \n",
"args | The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission.| Yes | | | |\n",
"spark_job | The payload of a [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).| Yes | | | |\n",
"job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | | | |\n",
"wait_interval | The number of seconds to wait between polling the operation. | Yes | | | 30 |\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created job. | String\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"\n",
"\n",
"\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"\n",
"## Detailed description\n",
"\n",
"This component creates a Spark job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"\n",
"\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_submit_spark_job_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_spark_job/component.yaml')\n",
"help(dataproc_submit_spark_job_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"Note: The following sample code works in an IPython notebook or directly in Python code.\n",
"\n",
"\n",
"#### Set up a Dataproc cluster\n",
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
"\n",
"\n",
"#### Prepare a Spark job\n",
"Upload your Spark JAR file to a Cloud Storage bucket. In the sample, we use a JAR file that is preinstalled in the main cluster: `file:///usr/lib/spark/examples/jars/spark-examples.jar`.\n",
"\n",
"Here is the [source code of the sample](https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java).\n",
"\n",
"To package a self-contained Spark application, follow these [instructions](https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications).\n",
"\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"REGION = 'us-central1'\n",
"SPARK_FILE_URI = 'file:///usr/lib/spark/examples/jars/spark-examples.jar'\n",
"MAIN_CLASS = 'org.apache.spark.examples.SparkPi'\n",
"ARGS = ['1000']\n",
"EXPERIMENT_NAME = 'Dataproc - Submit Spark Job'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc submit Spark job pipeline',\n",
" description='Dataproc submit Spark job pipeline'\n",
")\n",
"def dataproc_submit_spark_job_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" cluster_name = CLUSTER_NAME,\n",
" main_jar_file_uri = '',\n",
" main_class = MAIN_CLASS,\n",
" args = json.dumps(ARGS), \n",
" spark_job=json.dumps({ 'jarFileUris': [ SPARK_FILE_URI ] }), \n",
" job='{}', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_submit_spark_job_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" cluster_name=cluster_name, \n",
" main_jar_file_uri=main_jar_file_uri, \n",
" main_class=main_class,\n",
" args=args, \n",
" spark_job=spark_job, \n",
" job=job, \n",
" wait_interval=wait_interval)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_submit_spark_job_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)\n",
"* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,184 @@
# Name
Component: Data preparation using SparkSQL on YARN with Cloud Dataproc
# Label
Cloud Dataproc, YARN, SparkSQL, Kubeflow
# Summary
A Kubeflow pipeline component to prepare data by submitting a SparkSql job on YARN to Cloud Dataproc.
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Technique:
Input data type:
ML workflow:
# Details
## Intended use
Use the component to run an Apache SparkSql job as one preprocessing step in a Kubeflow pipeline.
## Runtime arguments
Argument| Description | Optional | Data type| Accepted values| Default |
:--- | :---------- | :--- | :------- | :------ | :------
project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No| GCPProjectID | - | -|
region | The Cloud Dataproc region to handle the request. | No | GCPRegion|-|-
cluster_name | The name of the cluster to run the job. | No | String| -| -|
queries | The queries for the SparkSQL job to execute. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | - | None |
query_file_uri | The Hadoop Compatible Filesystem (HCFS) URI of the script that contains the SparkSQL queries. The script is stored in a Cloud Storage bucket. | Yes | GCSPath | - | None |
script_variables | Mapping of the query's variable names to their values (equivalent to the SparkSQL command: SET name="value";). | Yes | Dict | - | None |
sparksql_job | The payload of a [SparkSql job](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob). | Yes | Dict | - | None |
job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | - | None |
wait_interval | The number of seconds to pause between polling the operation. | Yes |Integer | - | 30 |
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created job. | String
## Cautions & requirements
To use the component, you must:
* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).
* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the Kubeflow user service account the role, `roles/dataproc.editor`, on the project.
## Detailed description
This component creates a SparkSql job from the [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
dataproc_submit_sparksql_job_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_sparksql_job/component.yaml')
help(dataproc_submit_sparksql_job_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.
#### Set up a Dataproc cluster
[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.
#### Prepare a SparkSQL job
You can put your SparkSQL queries in the `queries` list, or you can use `query_file_uri`. In this sample, we will use a hard coded query in the `queries` list to select data from a public CSV file in Cloud Storage.
For more details about Spark SQL, see [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html).
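If you prefer `query_file_uri`, upload a file that contains the same semicolon-separated queries to Cloud Storage and pass its `gs://` URI to the pipeline instead of the `queries` list. A minimal sketch with placeholder paths:
```python
#Placeholders: replace with your query file and your own bucket
LOCAL_QUERY_FILE = 'my_queries.sql'
QUERY_FILE_URI = 'gs://<your-bucket>/sparksql/my_queries.sql'
!gsutil cp {LOCAL_QUERY_FILE} {QUERY_FILE_URI}
```
In the pipeline below you would then set `queries=''` and pass `QUERY_FILE_URI` as `query_file_uri`.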
#### Set sample parameters
```python
PROJECT_ID = '<Put your project ID here>'
CLUSTER_NAME = '<Put your existing cluster name here>'
REGION = 'us-central1'
QUERY = '''
DROP TABLE IF EXISTS natality_csv;
CREATE EXTERNAL TABLE natality_csv (
source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,
state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,
plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,
mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,
gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,
mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,
alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,
born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,
ever_born BIGINT, father_race BIGINT, father_age BIGINT,
record_weight BIGINT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'gs://public-datasets/natality/csv';
SELECT * FROM natality_csv LIMIT 10;'''
EXPERIMENT_NAME = 'Dataproc - Submit SparkSQL Job'
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Dataproc submit SparkSQL job pipeline',
description='Dataproc submit SparkSQL job pipeline'
)
def dataproc_submit_sparksql_job_pipeline(
project_id = PROJECT_ID,
region = REGION,
cluster_name = CLUSTER_NAME,
queries = json.dumps([QUERY]),
query_file_uri = '',
script_variables = '',
sparksql_job='',
job='',
wait_interval='30'
):
dataproc_submit_sparksql_job_op(
project_id=project_id,
region=region,
cluster_name=cluster_name,
queries=queries,
query_file_uri=query_file_uri,
script_variables=script_variables,
sparksql_job=sparksql_job,
job=job,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = dataproc_submit_sparksql_job_pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)
* [SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob)
* [Cloud Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,95 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_sparksql_job
description: >-
Submits a Cloud Dataproc job for running Apache Spark SQL queries.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: >-
Required. The ID of the Google Cloud Platform project that the cluster
belongs to.
type: GCPProjectID
- name: region
description: >-
Required. The Cloud Dataproc region in which to handle the request.
type: GCPRegion
- name: cluster_name
description: 'Required. The cluster to run the job.'
type: String
- name: queries
default: ''
description: >-
Required. The queries to execute. You do not need to
terminate a query with a semicolon. Multiple queries can be specified
in one string by separating each with a semicolon.
type: List
- name: query_file_uri
default: ''
description: >-
The HCFS URI of the script that contains SQL queries.
type: GCSPath
- name: script_variables
default: ''
description: >-
Optional. Mapping of query variable names to
values (equivalent to the Spark SQL command: SET name="value";).
type: Dict
- name: sparksql_job
default: ''
description: >-
Optional. The full payload of a
[SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob).
type: Dict
- name: job
default: ''
description: >-
Optional. The full payload of a
[Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
type: Dict
- name: wait_interval
default: '30'
description: >-
Optional. The wait seconds between polling the operation.
Defaults to 30.
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.dataproc, submit_sparksql_job,
--project_id, {inputValue: project_id},
--region, {inputValue: region},
--cluster_name, {inputValue: cluster_name},
--queries, {inputValue: queries},
--query_file_uri, {inputValue: query_file_uri},
--script_variables, {inputValue: script_variables},
--sparksql_job, {inputValue: sparksql_job},
--job, {inputValue: job},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Data preparation using SparkSQL on YARN with Cloud Dataproc\n",
"\n",
"# Label\n",
"Cloud Dataproc, GCP, Cloud Storage, YARN, SparkSQL, Kubeflow, pipelines, components \n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to prepare data by submitting a SparkSql job on YARN to Cloud Dataproc.\n",
"\n",
"# Details\n",
"\n",
"## Intended use\n",
"Use the component to run an Apache SparkSql job as one preprocessing step in a Kubeflow Pipeline.\n",
"\n",
"## Runtime arguments\n",
"Argument| Description | Optional | Data type| Accepted values| Default |\n",
":--- | :---------- | :--- | :------- | :------ | :------\n",
"project_id | The ID of the Google Cloud Platform (GCP) project that the cluster belongs to. | No| GCPProjectID | | |\n",
"region | The Cloud Dataproc region to handle the request. | No | GCPRegion|\n",
"cluster_name | The name of the cluster to run the job. | No | String| | |\n",
"queries | The queries to execute the SparkSQL job. Specify multiple queries in one string by separating them with semicolons. You do not need to terminate queries with semicolons. | Yes | List | | None | \n",
"query_file_uri | The HCFS URI of the script that contains the SparkSQL queries.| Yes | GCSPath | | None |\n",
"script_variables | Mapping of the querys variable names to their values (equivalent to the SparkSQL command: SET name=\"value\";).| Yes| Dict | | None |\n",
"sparksql_job | The payload of a [SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob). | Yes | Dict | | None |\n",
"job | The payload of a [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs). | Yes | Dict | | None |\n",
"wait_interval | The number of seconds to pause between polling the operation. | Yes |Integer | | 30 |\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created job. | String\n",
"\n",
"## Cautions & requirements\n",
"To use the component, you must:\n",
"* Set up a GCP project by following this [guide](https://cloud.google.com/dataproc/docs/guides/setup-project).\n",
"* [Create a new cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the Kubeflow user service account the role `roles/dataproc.editor` on the project.\n",
"\n",
"## Detailed Description\n",
"This component creates a Pig job from [Dataproc submit job REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs/submit).\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"1. Install the Kubeflow Pipeline SDK:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"dataproc_submit_sparksql_job_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/dataproc/submit_sparksql_job/component.yaml')\n",
"help(dataproc_submit_sparksql_job_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. See the sample code below to learn how to execute the template.\n",
"\n",
"#### Setup a Dataproc cluster\n",
"[Create a new Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) (or reuse an existing one) before running the sample code.\n",
"\n",
"#### Prepare a SparkSQL job\n",
"Either put your SparkSQL queries in the `queires` list, or upload your SparkSQL queries into a file to a Cloud Storage bucket and then enter the Cloud Storage buckets path in `query_file_uri`. In this sample, we will use a hard coded query in the `queries` list to select data from a public CSV file from Cloud Storage.\n",
"\n",
"For more details about Spark SQL, see [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_ID = '<Please put your project ID here>'\n",
"CLUSTER_NAME = '<Please put your existing cluster name here>'\n",
"REGION = 'us-central1'\n",
"QUERY = '''\n",
"DROP TABLE IF EXISTS natality_csv;\n",
"CREATE EXTERNAL TABLE natality_csv (\n",
" source_year BIGINT, year BIGINT, month BIGINT, day BIGINT, wday BIGINT,\n",
" state STRING, is_male BOOLEAN, child_race BIGINT, weight_pounds FLOAT,\n",
" plurality BIGINT, apgar_1min BIGINT, apgar_5min BIGINT,\n",
" mother_residence_state STRING, mother_race BIGINT, mother_age BIGINT,\n",
" gestation_weeks BIGINT, lmp STRING, mother_married BOOLEAN,\n",
" mother_birth_state STRING, cigarette_use BOOLEAN, cigarettes_per_day BIGINT,\n",
" alcohol_use BOOLEAN, drinks_per_week BIGINT, weight_gain_pounds BIGINT,\n",
" born_alive_alive BIGINT, born_alive_dead BIGINT, born_dead BIGINT,\n",
" ever_born BIGINT, father_race BIGINT, father_age BIGINT,\n",
" record_weight BIGINT\n",
")\n",
"ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n",
"LOCATION 'gs://public-datasets/natality/csv';\n",
"\n",
"SELECT * FROM natality_csv LIMIT 10;'''\n",
"EXPERIMENT_NAME = 'Dataproc - Submit SparkSQL Job'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Dataproc submit SparkSQL job pipeline',\n",
" description='Dataproc submit SparkSQL job pipeline'\n",
")\n",
"def dataproc_submit_sparksql_job_pipeline(\n",
" project_id = PROJECT_ID, \n",
" region = REGION,\n",
" cluster_name = CLUSTER_NAME,\n",
" queries = json.dumps([QUERY]),\n",
" query_file_uri = '',\n",
" script_variables = '', \n",
" sparksql_job='', \n",
" job='', \n",
" wait_interval='30'\n",
"):\n",
" dataproc_submit_sparksql_job_op(\n",
" project_id=project_id, \n",
" region=region, \n",
" cluster_name=cluster_name, \n",
" queries=queries, \n",
" query_file_uri=query_file_uri,\n",
" script_variables=script_variables, \n",
" sparksql_job=sparksql_job, \n",
" job=job, \n",
" wait_interval=wait_interval)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = dataproc_submit_sparksql_job_pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html)\n",
"* [SparkSqlJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob)\n",
"* [Cloud Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs)\n",
"\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,200 @@
# Name
Batch prediction using Cloud Machine Learning Engine
# Label
Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline, Component
# Summary
A Kubeflow Pipeline component to submit a batch prediction job against a deployed model on Cloud ML Engine.
# Details
## Intended use
Use the component to run a batch prediction job against a deployed model on Cloud ML Engine. The prediction output is stored in a Cloud Storage bucket.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|
| project_id | The ID of the Google Cloud Platform (GCP) project of the job. | No | GCPProjectID | | |
| model_path | The path to the model. It can be one of the following:<br/> <ul> <li>projects/[PROJECT_ID]/models/[MODEL_ID]</li> <li>projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]</li> <li>The path to a Cloud Storage location containing a model file.</li> </ul> | No | GCSPath | | |
| input_paths | The path to the Cloud Storage location containing the input data files. It can contain wildcards, for example, `gs://foo/*.csv` | No | List | GCSPath | |
| input_data_format | The format of the input data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | No | String | DataFormat | |
| output_path | The path to the Cloud Storage location for the output data. | No | GCSPath | | |
| region | The Compute Engine region where the prediction job is run. | No | GCPRegion | | |
| output_data_format | The format of the output data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | Yes | String | DataFormat | JSON |
| prediction_input | The JSON input parameters to create a prediction job. See [PredictionInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#PredictionInput) for more information. | Yes | Dict | | None |
| job_id_prefix | The prefix of the generated job id. | Yes | String | | None |
| wait_interval | The number of seconds to wait between calls to get the status of the job. | Yes | Integer | | 30 |
## Input data schema
The component accepts the following as input:
* A trained model: It can be a model file in Cloud Storage, a deployed model, or a version in Cloud ML Engine. Specify the path to the model in the `model_path` runtime argument.
* Input data: The data used to make predictions against the trained model. The data can be in [multiple formats](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat). The data path is specified by `input_paths` and the format is specified by `input_data_format` (an illustrative JSON record follows this list).
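For the JSON input data format, each line of an input file is a single JSON-serialized instance whose fields must match what the deployed model expects. A purely illustrative sketch of building such a file (the field names are not the schema of the census sample used later, and the Cloud Storage path is a placeholder):
```python
#Illustrative only: field names depend on your model
instances = [
    '{"feature_a": 1.0, "feature_b": "some-category"}',
    '{"feature_a": 2.5, "feature_b": "another-category"}',
]
with open('batch_input.json', 'w') as f:
    f.write('\n'.join(instances) + '\n')
#Then upload it, for example: !gsutil cp batch_input.json gs://<your-bucket>/batch_predict/input/
```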
## Output
Name | Description | Type
:--- | :---------- | :---
job_id | The ID of the created batch job. | String
## Cautions & requirements
To use the component, you must:
* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the following types of access to the Kubeflow user service account (a hedged example of doing this follows this list):
    * Read access to the Cloud Storage bucket that contains the input data.
* Write access to the Cloud Storage bucket of the output directory.
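A minimal sketch of granting bucket-level access with `gsutil iam ch`; the service account email and bucket names below are placeholders.
```python
#Placeholders: replace with your Kubeflow user service account and your buckets
KF_USER_SA = '<Put your Kubeflow user service account email here>'
INPUT_BUCKET = 'gs://<your-input-bucket>'
OUTPUT_BUCKET = 'gs://<your-output-bucket>'
!gsutil iam ch serviceAccount:{KF_USER_SA}:roles/storage.objectViewer {INPUT_BUCKET}
!gsutil iam ch serviceAccount:{KF_USER_SA}:roles/storage.objectAdmin {OUTPUT_BUCKET}
```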
## Detailed description
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipeline SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using KFP SDK
```python
import kfp.components as comp
mlengine_batch_predict_op = comp.load_component_from_url(
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/batch_predict/component.yaml')
help(mlengine_batch_predict_op)
```
### Sample Code
Note: The following sample code works in an IPython notebook or directly in Python code.
In this sample, you batch predict against a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` and use the test data from `gs://ml-pipeline-playground/samples/ml_engine/census/test.json`.
#### Inspect the test data
```python
!gsutil cat gs://ml-pipeline-playground/samples/ml_engine/census/test.json
```
#### Set sample parameters
```python
# Required Parameters
PROJECT_ID = '<Please put your project ID here>'
GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash
```
```python
# Optional Parameters
EXPERIMENT_NAME = 'CLOUDML - Batch Predict'
OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/batch_predict/output/'
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='CloudML batch predict pipeline',
description='CloudML batch predict pipeline'
)
def pipeline(
project_id = PROJECT_ID,
model_path = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/',
input_paths = '["gs://ml-pipeline-playground/samples/ml_engine/census/test.json"]',
input_data_format = 'JSON',
output_path = OUTPUT_GCS_PATH,
region = 'us-central1',
output_data_format='',
prediction_input = json.dumps({
'runtimeVersion': '1.10'
}),
job_id_prefix='',
wait_interval='30'):
mlengine_batch_predict_op(
project_id=project_id,
model_path=model_path,
input_paths=input_paths,
input_data_format=input_data_format,
output_path=output_path,
region=region,
output_data_format=output_data_format,
prediction_input=prediction_input,
job_id_prefix=job_id_prefix,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify pipeline argument values
arguments = {}
#Get or create an experiment and submit a pipeline run
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
#### Inspect prediction results
```python
OUTPUT_FILES_PATTERN = OUTPUT_GCS_PATH + '*'
!gsutil cat $OUTPUT_FILES_PATTERN
```
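You can also read the output programmatically. A minimal sketch using the `google-cloud-storage` client library (assumed to be installed), reusing `PROJECT_ID` and `OUTPUT_GCS_PATH` from the cells above:
```python
#Preview the batch prediction output files under OUTPUT_GCS_PATH
from google.cloud import storage

bucket_name, _, prefix = OUTPUT_GCS_PATH[len('gs://'):].partition('/')
client = storage.Client(project=PROJECT_ID)
for blob in client.list_blobs(bucket_name, prefix=prefix):
    print(blob.name)
    #Each output file contains one JSON prediction per line; print a short preview
    print(blob.download_as_bytes().decode('utf-8')[:500])
```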
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)
* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,90 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Batch predict against a model with Cloud ML Engine
description: |
Creates a MLEngine batch prediction job.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: 'Required. The ID of the parent project of the job.'
type: GCPProjectID
- name: model_path
description: >-
The path to the model. It can be either: `projects/[PROJECT_ID]/models/[MODEL_ID]`
or `projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]` or a GCS path
of a model file.
type: String
- name: input_paths
description: >-
Required. The Google Cloud Storage location of the input data files. May contain
wildcards.
type: List
- name: input_data_format
description: >-
Required. The format of the input data files. See
https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat.
type: String
- name: output_path
description: 'Required. The output Google Cloud Storage location.'
type: GCSPath
- name: region
description: >-
Required. The Google Compute Engine region to run the prediction job in.
type: GCPRegion
- name: output_data_format
description: 'Optional. Format of the output data files, defaults to JSON.'
default: ''
type: String
- name: prediction_input
description: 'Input parameters to create a prediction job.'
default: ''
type: Dict
- name: job_id_prefix
description: 'The prefix of the generated job id.'
default: ''
type: String
- name: wait_interval
description: 'Optional wait interval between calls to get job status. Defaults to 30.'
default: '30'
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.ml_engine, batch_predict,
--project_id, {inputValue: project_id},
--model_path, {inputValue: model_path},
--input_paths, {inputValue: input_paths},
--input_data_format, {inputValue: input_data_format},
--output_path, {inputValue: output_path},
--region, {inputValue: region},
--output_data_format, {inputValue: output_data_format},
--prediction_input, {inputValue: prediction_input},
--job_id_prefix, {inputValue: job_id_prefix},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,310 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"\n",
"Batch prediction using Cloud Machine Learning Engine\n",
"\n",
"\n",
"# Label\n",
"\n",
"Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline, Component\n",
"\n",
"\n",
"# Summary\n",
"\n",
"A Kubeflow Pipeline component to submit a batch prediction job against a deployed model on Cloud ML Engine.\n",
"\n",
"\n",
"# Details\n",
"\n",
"\n",
"## Intended use\n",
"\n",
"Use the component to run a batch prediction job against a deployed model on Cloud ML Engine. The prediction output is stored in a Cloud Storage bucket.\n",
"\n",
"\n",
"## Runtime arguments\n",
"\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|\n",
"| project_id | The ID of the Google Cloud Platform (GCP) project of the job. | No | GCPProjectID | | |\n",
"| model_path | The path to the model. It can be one of the following:<br/> <ul> <li>projects/[PROJECT_ID]/models/[MODEL_ID]</li> <li>projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]</li> <li>The path to a Cloud Storage location containing a model file.</li> </ul> | No | GCSPath | | |\n",
"| input_paths | The path to the Cloud Storage location containing the input data files. It can contain wildcards, for example, `gs://foo/*.csv` | No | List | GCSPath | |\n",
"| input_data_format | The format of the input data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | No | String | DataFormat | |\n",
"| output_path | The path to the Cloud Storage location for the output data. | No | GCSPath | | |\n",
"| region | The Compute Engine region where the prediction job is run. | No | GCPRegion | | |\n",
"| output_data_format | The format of the output data files. See [REST Resource: projects.jobs](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat) for more details. | Yes | String | DataFormat | JSON |\n",
"| prediction_input | The JSON input parameters to create a prediction job. See [PredictionInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#PredictionInput) for more information. | Yes | Dict | | None |\n",
"| job_id_prefix | The prefix of the generated job id. | Yes | String | | None |\n",
"| wait_interval | The number of seconds to wait in case the operation has a long run time. | Yes | | | 30 |\n",
"\n",
"\n",
"## Input data schema\n",
"\n",
"The component accepts the following as input:\n",
"\n",
"* A trained model: It can be a model file in Cloud Storage, a deployed model, or a version in Cloud ML Engine. Specify the path to the model in the `model_path `runtime argument.\n",
"* Input data: The data used to make predictions against the trained model. The data can be in [multiple formats](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat). The data path is specified by `input_paths` and the format is specified by `input_data_format`.\n",
"\n",
"## Output\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"job_id | The ID of the created batch job. | String\n",
"output_path | The output path of the batch prediction job | GCSPath\n",
"\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"\n",
"* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the following types of access to the Kubeflow user service account:\n",
" * Read access to the Cloud Storage buckets which contains the input data.\n",
" * Write access to the Cloud Storage bucket of the output directory.\n",
"\n",
"\n",
"## Detailed description\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"\n",
"\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"mlengine_batch_predict_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/batch_predict/component.yaml')\n",
"help(mlengine_batch_predict_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### Sample Code\n",
"Note: The following sample code works in an IPython notebook or directly in Python code. \n",
"\n",
"In this sample, you batch predict against a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` and use the test data from `gs://ml-pipeline-playground/samples/ml_engine/census/test.json`.\n",
"\n",
"#### Inspect the test data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil cat gs://ml-pipeline-playground/samples/ml_engine/census/test.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Required Parameters\n",
"PROJECT_ID = '<Please put your project ID here>'\n",
"GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional Parameters\n",
"EXPERIMENT_NAME = 'CLOUDML - Batch Predict'\n",
"OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/batch_predict/output/'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='CloudML batch predict pipeline',\n",
" description='CloudML batch predict pipeline'\n",
")\n",
"def pipeline(\n",
" project_id = PROJECT_ID, \n",
" model_path = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/', \n",
" input_paths = '[\"gs://ml-pipeline-playground/samples/ml_engine/census/test.json\"]', \n",
" input_data_format = 'JSON', \n",
" output_path = OUTPUT_GCS_PATH, \n",
" region = 'us-central1', \n",
" output_data_format='', \n",
" prediction_input = json.dumps({\n",
" 'runtimeVersion': '1.10'\n",
" }), \n",
" job_id_prefix='',\n",
" wait_interval='30'):\n",
" mlengine_batch_predict_op(\n",
" project_id=project_id, \n",
" model_path=model_path, \n",
" input_paths=input_paths, \n",
" input_data_format=input_data_format, \n",
" output_path=output_path, \n",
" region=region, \n",
" output_data_format=output_data_format, \n",
" prediction_input=prediction_input, \n",
" job_id_prefix=job_id_prefix,\n",
" wait_interval=wait_interval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Inspect prediction results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"OUTPUT_FILES_PATTERN = OUTPUT_GCS_PATH + '*'\n",
"!gsutil cat $OUTPUT_FILES_PATTERN"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)\n",
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,196 @@
# Name
Deploying a trained model to Cloud Machine Learning Engine
# Label
Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline
# Summary
A Kubeflow Pipeline component to deploy a trained model from a Cloud Storage location to Cloud ML Engine.
# Details
## Intended use
Use the component to deploy a trained model to Cloud ML Engine. The deployed model can serve online or batch predictions in a Kubeflow Pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|
| model_uri | The URI of a Cloud Storage directory that contains a trained model file.<br/> Or <br/> An [Estimator export base directory](https://www.tensorflow.org/guide/saved_model#perform_the_export) that contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file. | No | GCSPath | | |
| project_id | The ID of the Google Cloud Platform (GCP) project of the serving model. | No | GCPProjectID | | |
| model_id | The name of the trained model. | Yes | String | | None |
| version_id | The name of the version of the model. If it is not provided, the operation uses a random name. | Yes | String | | None |
| runtime_version | The Cloud ML Engine runtime version to use for this deployment. If it is not provided, the default stable version, 1.0, is used. | Yes | String | | None |
| python_version | The version of Python used in the prediction. If it is not provided, version 2.7 is used. You can use Python 3.5 if runtime_version is set to 1.4 or above. Python 2.7 works with all supported runtime versions. | Yes | String | | 2.7 |
| model | The JSON payload of the new [model](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models). | Yes | Dict | | None |
| version | The new [version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions) of the trained model. | Yes | Dict | | None |
| replace_existing_version | Indicates whether to replace the existing version in case of a conflict (if the same version number is found.) | Yes | Boolean | | FALSE |
| set_default | Indicates whether to set the new version as the default version in the model. | Yes | Boolean | | FALSE |
| wait_interval | The number of seconds to wait in case the operation has a long run time. | Yes | Integer | | 30 |
## Input data schema
The component looks for a trained model in the location specified by the `model_uri` runtime argument. The accepted trained models are:
* [Tensorflow SavedModel](https://cloud.google.com/ml-engine/docs/tensorflow/exporting-for-prediction)
* [Scikit-learn & XGBoost model](https://cloud.google.com/ml-engine/docs/scikit/exporting-for-prediction)
The accepted file formats are:
* *.pb
* *.pbtext
* model.bst
* model.joblib
* model.pkl
`model_uri` can also be an [Estimator export base directory](https://www.tensorflow.org/guide/saved_model#perform_the_export), which contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file.
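For example (illustrative paths only, not taken from this sample), an export base directory might look like this when listed:
```python
# Illustrative layout of an Estimator export base directory; the bucket and
# timestamps below are placeholders.
!gsutil ls gs://your-bucket/census/export/exporter/
# gs://your-bucket/census/export/exporter/1547669674/
# gs://your-bucket/census/export/exporter/1547767752/   <- latest timestamp is the one loaded
```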
## Output
Name | Description | Type
:--- | :---------- | :---
| model_uri | The Cloud Storage URI of the trained model. | GCSPath |
| model_name | The name of the deployed model. | String |
| version_name | The name of the deployed version. | String |
## Cautions & requirements
To use the component, you must:
* [Set up the cloud environment](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant read access to the Cloud Storage bucket that contains the trained model to the Kubeflow user service account.
## Detailed description
Use the component to:
* Locate the trained model at the Cloud Storage location you specify.
* Create a new model if the model you provide doesn't exist.
* Delete the existing model version if `replace_existing_version` is enabled.
* Create a new version of the model from the trained model (see the example `version` payload after this list).
* Set the new version as the default version of the model if `set_default` is enabled.
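The `version` argument accepts a JSON payload for the new [Version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions). The snippet below is an illustrative sketch only: the keys follow the public Version schema and the values are placeholders.
```python
# Illustrative `version` payload (placeholder values). Fields such as
# deploymentUri, runtimeVersion, and pythonVersion are normally supplied
# through the dedicated component arguments instead.
VERSION_PAYLOAD = {
    'description': 'Census model deployed from a Kubeflow pipeline',
    'labels': {'team': 'kfp-samples'},
}
```
You could pass `version=VERSION_PAYLOAD` to the component instead of the empty dict used in the sample pipeline below.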
Follow these steps to use the component in a pipeline:
1. Install the Kubeflow Pipeline SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the KFP SDK
```python
import kfp.components as comp
mlengine_deploy_op = comp.load_component_from_url(
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/deploy/component.yaml')
help(mlengine_deploy_op)
```
### Sample
Note: The following sample code works in an IPython notebook or directly in Python code.
In this sample, you deploy a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` to Cloud ML Engine. The deployed model is `kfp_sample_model`. A new version is created every time the sample is run, and the latest version is set as the default version of the deployed model.
#### Set sample parameters
```python
# Required Parameters
PROJECT_ID = '<Please put your project ID here>'
# Optional Parameters
EXPERIMENT_NAME = 'CLOUDML - Deploy'
TRAINED_MODEL_PATH = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/'
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='CloudML deploy pipeline',
description='CloudML deploy pipeline'
)
def pipeline(
model_uri = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/',
project_id = PROJECT_ID,
model_id = 'kfp_sample_model',
version_id = '',
runtime_version = '1.10',
python_version = '',
version = {},
replace_existing_version = 'False',
set_default = 'True',
wait_interval = '30'):
task = mlengine_deploy_op(
model_uri=model_uri,
project_id=project_id,
model_id=model_id,
version_id=version_id,
runtime_version=runtime_version,
python_version=python_version,
version=version,
replace_existing_version=replace_existing_version,
set_default=set_default,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify pipeline argument values
arguments = {}
#Get or create an experiment and submit a pipeline run
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)
* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)
* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,119 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Deploying a trained model to Cloud Machine Learning Engine
description: |
A Kubeflow Pipeline component to deploy a trained model from a Cloud Storage
path to a Cloud Machine Learning Engine service.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: model_uri
description: >-
Required. The Cloud Storage URI which contains a model file. Commonly
used TF model search paths (export/exporter) will be used if they exist.
type: GCSPath
- name: project_id
description: 'Required. The ID of the parent project of the serving model.'
type: GCPProjectID
- name: model_id
description: >-
Optional. The user-specified name of the model. If it is not provided,
the operation uses a random name.
default: ''
type: String
- name: version_id
description: >-
Optional. The user-specified name of the version. If it is not provided,
the operation uses a random name.
default: ''
type: String
- name: runtime_version
description: >-
Optional. The [Cloud ML Engine runtime version](https://cloud.google.com/ml-engine/docs/tensorflow/runtime-version-list) to use for
this deployment. If it is not set, the Cloud ML Engine uses the default
stable version, 1.0.
default: ''
type: String
- name: python_version
description: >-
Optional. The version of Python used in the prediction. If it is not set,
the default version is `2.7`. Python `3.5` is available when the
runtime_version is set to `1.4` and above. Python `2.7` works with all
supported runtime versions.
default: ''
type: String
- name: model
description: >-
Optional. The JSON payload of the new
[Model](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models), if it does not exist.
default: ''
type: Dict
- name: version
description: >-
Optional. The JSON payload of the new
[Version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions).
default: ''
type: Dict
- name: replace_existing_version
description: >-
A Boolean flag that indicates whether to replace the existing version in case of a conflict.
default: 'False'
type: Bool
- name: set_default
description: >-
A Boolean flag that indicates whether to set the new version as the default version of the model.
default: 'False'
type: Bool
- name: wait_interval
description: 'The number of seconds to wait in case the operation has a long run time.'
default: '30'
type: Integer
outputs:
- name: model_uri
description: 'The Cloud Storage URI of the trained model.'
type: GCSPath
- name: model_name
description: 'The name of the deployed model.'
type: String
- name: version_name
description: 'The name of the deployed version.'
type: String
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.ml_engine, deploy,
--model_uri, {inputValue: model_uri},
--project_id, {inputValue: project_id},
--model_id, {inputValue: model_id},
--version_id, {inputValue: version_id},
--runtime_version, {inputValue: runtime_version},
--python_version, {inputValue: python_version},
--model, {inputValue: model},
--version, {inputValue: version},
--replace_existing_version, {inputValue: replace_existing_version},
--set_default, {inputValue: set_default},
--wait_interval, {inputValue: wait_interval},
--model_uri_output_path, {outputPath: model_uri},
--model_name_output_path, {outputPath: model_name},
--version_name_output_path, {outputPath: version_name},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,282 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"\n",
"Deploying a trained model to Cloud Machine Learning Engine \n",
"\n",
"\n",
"# Label\n",
"\n",
"Cloud Storage, Cloud ML Engine, Kubeflow, Pipeline\n",
"\n",
"\n",
"# Summary\n",
"\n",
"A Kubeflow Pipeline component to deploy a trained model from a Cloud Storage location to Cloud ML Engine.\n",
"\n",
"\n",
"# Details\n",
"\n",
"\n",
"## Intended use\n",
"\n",
"Use the component to deploy a trained model to Cloud ML Engine. The deployed model can serve online or batch predictions in a Kubeflow Pipeline.\n",
"\n",
"\n",
"## Runtime arguments\n",
"\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|--------------|-----------------|---------|\n",
"| model_uri | The URI of a Cloud Storage directory that contains a trained model file.<br/> Or <br/> An [Estimator export base directory](https://www.tensorflow.org/guide/saved_model#perform_the_export) that contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file. | No | GCSPath | | |\n",
"| project_id | The ID of the Google Cloud Platform (GCP) project of the serving model. | No | GCPProjectID | | |\n",
"| model_id | The name of the trained model. | Yes | String | | None |\n",
"| version_id | The name of the version of the model. If it is not provided, the operation uses a random name. | Yes | String | | None |\n",
"| runtime_version | The Cloud ML Engine runtime version to use for this deployment. If it is not provided, the default stable version, 1.0, is used. | Yes | String | | None |\n",
"| python_version | The version of Python used in the prediction. If it is not provided, version 2.7 is used. You can use Python 3.5 if runtime_version is set to 1.4 or above. Python 2.7 works with all supported runtime versions. | Yes | String | | 2.7 |\n",
"| model | The JSON payload of the new [model](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models). | Yes | Dict | | None |\n",
"| version | The new [version](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions) of the trained model. | Yes | Dict | | None |\n",
"| replace_existing_version | Indicates whether to replace the existing version in case of a conflict (if the same version number is found.) | Yes | Boolean | | FALSE |\n",
"| set_default | Indicates whether to set the new version as the default version in the model. | Yes | Boolean | | FALSE |\n",
"| wait_interval | The number of seconds to wait in case the operation has a long run time. | Yes | Integer | | 30 |\n",
"\n",
"\n",
"\n",
"## Input data schema\n",
"\n",
"The component looks for a trained model in the location specified by the `model_uri` runtime argument. The accepted trained models are:\n",
"\n",
"\n",
"* [Tensorflow SavedModel](https://cloud.google.com/ml-engine/docs/tensorflow/exporting-for-prediction) \n",
"* [Scikit-learn & XGBoost model](https://cloud.google.com/ml-engine/docs/scikit/exporting-for-prediction)\n",
"\n",
"The accepted file formats are:\n",
"\n",
"* *.pb\n",
"* *.pbtext\n",
"* model.bst\n",
"* model.joblib\n",
"* model.pkl\n",
"\n",
"`model_uri` can also be an [Estimator export base directory, ](https://www.tensorflow.org/guide/saved_model#perform_the_export)which contains a list of subdirectories named by timestamp. The directory with the latest timestamp is used to load the trained model file.\n",
"\n",
"## Output\n",
"| Name | Description | Type |\n",
"|:------- |:---- | :--- |\n",
"| job_id | The ID of the created job. | String |\n",
"| job_dir | The Cloud Storage path that contains the trained model output files. | GCSPath |\n",
"\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"\n",
"* [Set up the cloud environment](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant read access to the Cloud Storage bucket that contains the trained model to the Kubeflow user service account.\n",
"\n",
"## Detailed description\n",
"\n",
"Use the component to: \n",
"* Locate the trained model at the Cloud Storage location you specify.\n",
"* Create a new model if a model provided by you doesnt exist.\n",
"* Delete the existing model version if `replace_existing_version` is enabled.\n",
"* Create a new version of the model from the trained model.\n",
"* Set the new version as the default version of the model if `set_default` is enabled.\n",
"\n",
"Follow these steps to use the component in a pipeline:\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"mlengine_deploy_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/deploy/component.yaml')\n",
"help(mlengine_deploy_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"Note: The following sample code works in IPython notebook or directly in Python code.\n",
"\n",
"In this sample, you deploy a pre-built trained model from `gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/` to Cloud ML Engine. The deployed model is `kfp_sample_model`. A new version is created every time the sample is run, and the latest version is set as the default version of the deployed model.\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Required Parameters\n",
"PROJECT_ID = '<Please put your project ID here>'\n",
"\n",
"# Optional Parameters\n",
"EXPERIMENT_NAME = 'CLOUDML - Deploy'\n",
"TRAINED_MODEL_PATH = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='CloudML deploy pipeline',\n",
" description='CloudML deploy pipeline'\n",
")\n",
"def pipeline(\n",
" model_uri = 'gs://ml-pipeline-playground/samples/ml_engine/census/trained_model/',\n",
" project_id = PROJECT_ID,\n",
" model_id = 'kfp_sample_model',\n",
" version_id = '',\n",
" runtime_version = '1.10',\n",
" python_version = '',\n",
" version = {},\n",
" replace_existing_version = 'False',\n",
" set_default = 'True',\n",
" wait_interval = '30'):\n",
" task = mlengine_deploy_op(\n",
" model_uri=model_uri, \n",
" project_id=project_id, \n",
" model_id=model_id, \n",
" version_id=version_id, \n",
" runtime_version=runtime_version, \n",
" python_version=python_version,\n",
" version=version, \n",
" replace_existing_version=replace_existing_version, \n",
" set_default=set_default, \n",
" wait_interval=wait_interval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)\n",
"* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)\n",
"* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,241 @@
# Name
Component: Submitting an AI Platform training job as a pipeline step
# Label
AI Platform, Kubeflow
# Summary
A Kubeflow pipeline component to submit an AI Platform training job as a step in a pipeline.
# Facets
<!--Make sure the asset has data for the following facets:
Use case
Technique
Input data type
ML workflow
The data must map to the acceptable values for these facets, as documented on the “taxonomy” sheet of go/aihub-facets
https://gitlab.aihub-content-external.com/aihubbot/kfp-components/commit/fe387ab46181b5d4c7425dcb8032cb43e70411c1
--->
Use case:
Other
Technique:
Other
Input data type:
Tabular
ML workflow:
Training
# Details
## Intended use
Use this component to submit a training job to AI Platform from a Kubeflow pipeline.
## Runtime arguments
| Argument | Description | Optional | Data type | Accepted values | Default |
|:------------------|:------------------|:----------|:--------------|:-----------------|:-------------|
| project_id | The Google Cloud Platform (GCP) project ID of the job. | No | GCPProjectID | - | - |
| python_module | The name of the Python module to run after installing the training program. | Yes | String | - | None |
| package_uris | The Cloud Storage location of the packages that contain the training program and any additional dependencies. The maximum number of package URIs is 100. | Yes | List | -| None |
| region | The Compute Engine region in which the training job is run. | Yes | GCPRegion | -| us-central1 |
| args | The command line arguments to pass to the training program. | Yes | List | - | None |
| job_dir | A Cloud Storage path in which to store the training outputs and other data needed for training. This path is passed to your TensorFlow program as the command-line argument, `job-dir`. The benefit of specifying this field is that Cloud ML validates the path for use in training. | Yes | GCSPath | - | None |
| python_version | The version of Python used in training. If it is not set, the default version is 2.7. Python 3.5 is available when the runtime version is set to 1.4 and above. | Yes | String | - | None |
| runtime_version | The runtime version of AI Platform to use for training. If it is not set, AI Platform uses the default. | Yes | String | - | 1 |
| master_image_uri | The Docker image to run on the master replica. This image must be in Container Registry. | Yes | GCRPath | - | None |
| worker_image_uri | The Docker image to run on the worker replica. This image must be in Container Registry. | Yes | GCRPath |- | None |
| training_input | The input parameters to create a training job. | Yes | Dict | [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) | None |
| job_id_prefix | The prefix of the job ID that is generated. | Yes | String | - | None |
| job_id | The ID of the job to create; if set, it takes precedence over the generated job ID. | Yes | String | - | None |
| wait_interval | The number of seconds to wait between API calls to get the status of the job. | Yes | Integer | - | 30 |
## Input data schema
The component accepts two types of inputs:
* A list of Python packages from Cloud Storage.
* You can manually build a Python package and upload it to Cloud Storage by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/packaging-trainer#manual-build).
* A Docker container from Container Registry (see the sketch after this list).
* Follow this [guide](https://cloud.google.com/ml-engine/docs/using-containers) to publish and use a Docker container with this component.
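For the custom-container path, the training step inside a pipeline function might look like the sketch below. The image URI is a placeholder, `mlengine_train_op` is the component loaded in the steps that follow, and `PROJECT_ID`/`OUTPUT_GCS_PATH` are the sample parameters defined later; `package_uris` and `python_module` are left unset because the training code is baked into the image.
```python
import json
# Hedged sketch of a custom-container training step (placeholder image URI).
task = mlengine_train_op(
    project_id=PROJECT_ID,
    region='us-central1',
    master_image_uri='gcr.io/my-project/census-trainer:latest',
    args=json.dumps(['--epochs', '5']),
    job_dir=OUTPUT_GCS_PATH)
```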
## Output
| Name | Description | Type |
|:------- |:---- | :--- |
| job_id | The ID of the created job. | String |
| job_dir | The Cloud Storage path that contains the output files with the trained model. | GCSPath |
## Cautions & requirements
To use the component, you must:
* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* Grant the following access to the Kubeflow user service account:
* Read access to the Cloud Storage buckets which contain the input data, packages, or Docker images.
* Write access to the Cloud Storage bucket of the output directory.
## Detailed description
The component builds the [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) payload and submits a job via the [AI Platform REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs).
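For illustration, a `training_input` payload that requests a custom cluster might look like the sketch below; the machine types and counts are placeholders, and the full field list is in the [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) reference.
```python
import json
# Illustrative TrainingInput payload (placeholder values). Inputs such as
# package_uris, python_module, region, and job_dir have dedicated component
# arguments, so they are usually not repeated here.
TRAINING_INPUT = json.dumps({
    'scaleTier': 'CUSTOM',
    'masterType': 'n1-standard-8',
    'workerType': 'n1-standard-4',
    'workerCount': '2',
})
```
In the sample pipeline below, you could pass `training_input=TRAINING_INPUT` instead of the empty string.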
The steps to use the component in a pipeline are:
1. Install the Kubeflow Pipelines SDK:
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using the Kubeflow Pipelines SDK:
```python
import kfp.components as comp
mlengine_train_op = comp.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/train/component.yaml')
help(mlengine_train_op)
```
### Sample
The following sample code works in an IPython notebook or directly in Python code.
In this sample, you use the code from the [census estimator sample](https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/estimator) to train a model on AI Platform. To upload the code to AI Platform, package the Python code and upload it to a Cloud Storage bucket.
Note: You must have read and write permissions on the bucket that you use as the working directory.
#### Set sample parameters
```python
# Required parameters
PROJECT_ID = '<Put your project ID here>'
GCS_WORKING_DIR = 'gs://<Put your GCS path here>' # No ending slash
```
```python
# Optional parameters
EXPERIMENT_NAME = 'CLOUDML - Train'
TRAINER_GCS_PATH = GCS_WORKING_DIR + '/train/trainer.tar.gz'
OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/train/output/'
```
#### Clean up the working directory
```python
%%capture --no-stderr
!gsutil rm -r $GCS_WORKING_DIR
```
#### Download the sample trainer code to a local directory
```python
%%capture --no-stderr
!wget https://github.com/GoogleCloudPlatform/cloudml-samples/archive/master.zip
!unzip master.zip
```
#### Package code and upload the package to Cloud Storage
```python
%%capture --no-stderr
%%bash -s "$TRAINER_GCS_PATH"
pushd ./cloudml-samples-master/census/estimator/
python setup.py sdist
gsutil cp dist/preprocessing-1.0.tar.gz $1
popd
rm -fr ./cloudml-samples-master/ ./master.zip ./dist
```
#### Example pipeline that uses the component
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='CloudML training pipeline',
description='CloudML training pipeline'
)
def pipeline(
project_id = PROJECT_ID,
python_module = 'trainer.task',
package_uris = json.dumps([TRAINER_GCS_PATH]),
region = 'us-central1',
args = json.dumps([
'--train-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.data.csv',
'--eval-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.test.csv',
'--train-steps', '1000',
'--eval-steps', '100',
'--verbosity', 'DEBUG'
]),
job_dir = OUTPUT_GCS_PATH,
python_version = '',
runtime_version = '1.10',
master_image_uri = '',
worker_image_uri = '',
training_input = '',
job_id_prefix = '',
job_id = '',
wait_interval = '30'):
task = mlengine_train_op(
project_id=project_id,
python_module=python_module,
package_uris=package_uris,
region=region,
args=args,
job_dir=job_dir,
python_version=python_version,
runtime_version=runtime_version,
master_image_uri=master_image_uri,
worker_image_uri=worker_image_uri,
training_input=training_input,
job_id_prefix=job_id_prefix,
job_id=job_id,
wait_interval=wait_interval)
```
#### Compile the pipeline
```python
pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify values for the pipeline's arguments
arguments = {}
#Get or create an experiment
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
#### Inspect the results
Use the following command to inspect the contents in the output directory:
```python
!gsutil ls $OUTPUT_GCS_PATH
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)
* [AI Platform REST API - Resource: Job](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -0,0 +1,135 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Submitting a Cloud ML training job as a pipeline step
description: |
A Kubeflow Pipeline component to submit a Cloud Machine Learning (Cloud ML)
Engine training job as a step in a pipeline.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: project_id
description: 'Required. The ID of the parent project of the job.'
type: GCPProjectID
- name: python_module
description: 'The Python module name to run after installing the packages.'
default: ''
type: String
- name: package_uris
description: >-
The Cloud Storage location of the packages (that contain the training program
and any additional dependencies). The maximum number of package URIs is 100.
default: ''
type: List
- name: region
description: 'The Compute Engine region in which the training job is run.'
default: ''
type: GCPRegion
- name: args
description: 'The command line arguments to pass to the program.'
default: ''
type: List
- name: job_dir
description: >-
A Cloud Storage path in which to store the training outputs and other data
needed for training. This path is passed to your TensorFlow program as the
`job-dir` command-line argument. The benefit of specifying this field is
that Cloud ML validates the path for use in training.
default: ''
type: GCSPath
- name: python_version
description: >-
The version of Python used in training. If not set, the default
version is `2.7`. Python `3.5` is available when runtimeVersion is set to `1.4`
and above.
default: ''
type: String
- name: runtime_version
description: >-
The Cloud ML Engine runtime version to use for training. If not set,
Cloud ML Engine uses the default stable version, 1.0.
default: ''
type: String
- name: master_image_uri
description: >-
The Docker image to run on the master replica. This image must be in
Container Registry.
default: ''
type: GCRPath
- name: worker_image_uri
description: >-
The Docker image to run on the worker replica. This image must be in
Container Registry.
default: ''
type: GCRPath
- name: training_input
description: >-
The input parameters to create a training job. It is the JSON payload
of a [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput)
default: ''
type: Dict
- name: job_id_prefix
description: 'The prefix of the generated job id.'
default: ''
type: String
- name: job_id
description: >-
The ID of the job to create. If set, it takes precedence over the
generated job ID.
default: ''
type: String
- name: wait_interval
description: >-
Optional. The number of seconds to wait between calls to get the job status.
Defaults to 30.
default: '30'
type: Integer
outputs:
- name: job_id
description: 'The ID of the created job.'
type: String
- name: job_dir
description: >-
The output path in Cloud Storage of the training job, which contains
the trained model files.
type: GCSPath
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.ml_engine, train,
--project_id, {inputValue: project_id},
--python_module, {inputValue: python_module},
--package_uris, {inputValue: package_uris},
--region, {inputValue: region},
--args, {inputValue: args},
--job_dir, {inputValue: job_dir},
--python_version, {inputValue: python_version},
--runtime_version, {inputValue: runtime_version},
--master_image_uri, {inputValue: master_image_uri},
--worker_image_uri, {inputValue: worker_image_uri},
--training_input, {inputValue: training_input},
--job_id_prefix, {inputValue: job_id_prefix},
--job_id, {inputValue: job_id},
--wait_interval, {inputValue: wait_interval},
--job_id_output_path, {outputPath: job_id},
--job_dir_output_path, {outputPath: job_dir},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -0,0 +1,359 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"Submitting a Cloud Machine Learning Engine training job as a pipeline step\n",
"\n",
"# Label\n",
"GCP, Cloud ML Engine, Machine Learning, pipeline, component, Kubeflow, Kubeflow Pipeline\n",
"\n",
"# Summary\n",
"A Kubeflow Pipeline component to submit a Cloud ML Engine training job as a step in a pipeline.\n",
"\n",
"# Details\n",
"## Intended use\n",
"Use this component to submit a training job to Cloud ML Engine from a Kubeflow Pipeline. \n",
"\n",
"## Runtime arguments\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|:------------------|:------------------|:----------|:--------------|:-----------------|:-------------|\n",
"| project_id | The ID of the Google Cloud Platform (GCP) project of the job. | No | GCPProjectID | | |\n",
"| python_module | The name of the Python module to run after installing the training program. | Yes | String | | None |\n",
"| package_uris | The Cloud Storage location of the packages that contain the training program and any additional dependencies. The maximum number of package URIs is 100. | Yes | List | | None |\n",
"| region | The Compute Engine region in which the training job is run. | Yes | GCPRegion | | us-central1 |\n",
"| args | The command line arguments to pass to the training program. | Yes | List | | None |\n",
"| job_dir | A Cloud Storage path in which to store the training outputs and other data needed for training. This path is passed to your TensorFlow program as the `job-dir` command-line argument. The benefit of specifying this field is that Cloud ML validates the path for use in training. | Yes | GCSPath | | None |\n",
"| python_version | The version of Python used in training. If it is not set, the default version is 2.7. Python 3.5 is available when the runtime version is set to 1.4 and above. | Yes | String | | None |\n",
"| runtime_version | The runtime version of Cloud ML Engine to use for training. If it is not set, Cloud ML Engine uses the default. | Yes | String | | 1 |\n",
"| master_image_uri | The Docker image to run on the master replica. This image must be in Container Registry. | Yes | GCRPath | | None |\n",
"| worker_image_uri | The Docker image to run on the worker replica. This image must be in Container Registry. | Yes | GCRPath | | None |\n",
"| training_input | The input parameters to create a training job. | Yes | Dict | [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) | None |\n",
"| job_id_prefix | The prefix of the job ID that is generated. | Yes | String | | None |\n",
"| job_id | The ID of the job to create, takes precedence over generated job id if set. | Yes | String | - | None |\n",
"| wait_interval | The number of seconds to wait between API calls to get the status of the job. | Yes | Integer | | 30 |\n",
"\n",
"\n",
"\n",
"## Input data schema\n",
"\n",
"The component accepts two types of inputs:\n",
"* A list of Python packages from Cloud Storage.\n",
" * You can manually build a Python package and upload it to Cloud Storage by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/packaging-trainer#manual-build).\n",
"* A Docker container from Container Registry. \n",
" * Follow this [guide](https://cloud.google.com/ml-engine/docs/using-containers) to publish and use a Docker container with this component.\n",
"\n",
"## Output\n",
"| Name | Description | Type |\n",
"|:------- |:---- | :--- |\n",
"| job_id | The ID of the created job. | String |\n",
"| job_dir | The Cloud Storage path that contains the trained model output files. | GCSPath |\n",
"\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, you must:\n",
"\n",
"* Set up a cloud environment by following this [guide](https://cloud.google.com/ml-engine/docs/tensorflow/getting-started-training-prediction#setup).\n",
"* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* Grant the following access to the Kubeflow user service account: \n",
" * Read access to the Cloud Storage buckets which contain the input data, packages, or Docker images.\n",
" * Write access to the Cloud Storage bucket of the output directory.\n",
"\n",
"## Detailed description\n",
"\n",
"The component builds the [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput) payload and submits a job via the [Cloud ML Engine REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs).\n",
"\n",
"The steps to use the component in a pipeline are:\n",
"\n",
"\n",
"1. Install the Kubeflow Pipeline SDK:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"mlengine_train_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/ml_engine/train/component.yaml')\n",
"help(mlengine_train_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"Note: The following sample code works in an IPython notebook or directly in Python code.\n",
"\n",
"In this sample, you use the code from the [census estimator sample](https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/estimator) to train a model in Cloud ML Engine. To upload the code to Cloud ML Engine, package the Python code and upload it to a Cloud Storage bucket. \n",
"\n",
"Note: You must have read and write permissions on the bucket that you use as the working directory.\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Required Parameters\n",
"PROJECT_ID = '<Please put your project ID here>'\n",
"GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional Parameters\n",
"EXPERIMENT_NAME = 'CLOUDML - Train'\n",
"TRAINER_GCS_PATH = GCS_WORKING_DIR + '/train/trainer.tar.gz'\n",
"OUTPUT_GCS_PATH = GCS_WORKING_DIR + '/train/output/'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Clean up the working directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"!gsutil rm -r $GCS_WORKING_DIR"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Download the sample trainer code to local"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"!wget https://github.com/GoogleCloudPlatform/cloudml-samples/archive/master.zip\n",
"!unzip master.zip"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Package code and upload the package to Cloud Storage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"%%bash -s \"$TRAINER_GCS_PATH\"\n",
"pushd ./cloudml-samples-master/census/estimator/\n",
"python setup.py sdist\n",
"gsutil cp dist/preprocessing-1.0.tar.gz $1\n",
"popd\n",
"rm -fr ./cloudml-samples-master/ ./master.zip ./dist"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Example pipeline that uses the component"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='CloudML training pipeline',\n",
" description='CloudML training pipeline'\n",
")\n",
"def pipeline(\n",
" project_id = PROJECT_ID,\n",
" python_module = 'trainer.task',\n",
" package_uris = json.dumps([TRAINER_GCS_PATH]),\n",
" region = 'us-central1',\n",
" args = json.dumps([\n",
" '--train-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.data.csv',\n",
" '--eval-files', 'gs://cloud-samples-data/ml-engine/census/data/adult.test.csv',\n",
" '--train-steps', '1000',\n",
" '--eval-steps', '100',\n",
" '--verbosity', 'DEBUG'\n",
" ]),\n",
" job_dir = OUTPUT_GCS_PATH,\n",
" python_version = '',\n",
" runtime_version = '1.10',\n",
" master_image_uri = '',\n",
" worker_image_uri = '',\n",
" training_input = '',\n",
" job_id_prefix = '',\n",
" job_id = '',\n",
" wait_interval = '30'):\n",
" task = mlengine_train_op(\n",
" project_id=project_id, \n",
" python_module=python_module, \n",
" package_uris=package_uris, \n",
" region=region, \n",
" args=args, \n",
" job_dir=job_dir, \n",
" python_version=python_version,\n",
" runtime_version=runtime_version, \n",
" master_image_uri=master_image_uri, \n",
" worker_image_uri=worker_image_uri, \n",
" training_input=training_input, \n",
" job_id_prefix=job_id_prefix,\n",
" job_id=job_id,\n",
" wait_interval=wait_interval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Inspect the results\n",
"\n",
"Use the following command to inspect the contents in the output directory:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil ls $OUTPUT_GCS_PATH"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)\n",
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,35 @@
name: Download from GCS
inputs:
- {name: GCS path, type: String}
outputs:
- {name: Data}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/download/component.yaml'
implementation:
container:
image: google/cloud-sdk
command:
- bash # Pattern comparison only works in Bash
- -ex
- -c
- |
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
fi
uri="$0"
output_path="$1"
# Checking whether the URI points to a single blob, a directory or a URI pattern
# URI points to a blob when that URI does not end with slash and listing that URI only yields the same URI
if [[ "$uri" != */ ]] && (gsutil ls "$uri" | grep --fixed-strings --line-regexp "$uri"); then
mkdir -p "$(dirname "$output_path")"
gsutil -m cp -r "$uri" "$output_path"
else
mkdir -p "$output_path" # When source path is a directory, gsutil requires the destination to also be a directory
gsutil -m rsync -r "$uri" "$output_path" # gsutil cp has different path handling than Linux cp. It always puts the source directory (name) inside the destination directory. gsutil rsync does not have that problem.
fi
- inputValue: GCS path
- outputPath: Data
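
A minimal usage sketch for the component above, assuming its YAML is saved locally as "download_from_gcs.yaml" and that the bucket path below is a placeholder (neither name comes from this repository):

# Hedged sketch: load the "Download from GCS" component and use it in a pipeline.
import kfp
from kfp.components import load_component_from_file

download_from_gcs_op = load_component_from_file('download_from_gcs.yaml')  # hypothetical local path

@kfp.dsl.pipeline(name='Download from GCS example')
def download_pipeline(gcs_path: str = 'gs://my-bucket/my-dir/'):  # placeholder bucket path
    download_task = download_from_gcs_op(gcs_path=gcs_path)
    # download_task.output refers to the single "Data" output; downstream components
    # can consume it as an input file or directory.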

@ -0,0 +1,24 @@
name: Download from GCS
inputs:
- {name: GCS path, type: String}
outputs:
- {name: Data}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/download_blob/component.yaml'
implementation:
container:
image: google/cloud-sdk
command:
- sh
- -ex
- -c
- |
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
fi
mkdir -p "$(dirname "$1")"
gsutil -m cp -r "$0" "$1"
- inputValue: GCS path
- outputPath: Data

@ -0,0 +1,24 @@
name: Download from GCS
inputs:
- {name: GCS path, type: String}
outputs:
- {name: Data}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/download_dir/component.yaml'
implementation:
container:
image: google/cloud-sdk
command:
- sh
- -ex
- -c
- |
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
fi
mkdir -p "$1"
gsutil -m cp -r "$0" "$1"
- inputValue: GCS path
- outputPath: Data

@ -0,0 +1,25 @@
name: List blobs
inputs:
- {name: GCS path, type: String, description: 'GCS path for listing. For recursive listing, use the "gs://bucket/path/**" syntax.'}
outputs:
- {name: Paths}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/list/component.yaml'
volatile_component: 'true'
implementation:
container:
image: google/cloud-sdk
command:
- sh
- -ex
- -c
- |
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
fi
mkdir -p "$(dirname "$1")"
gsutil ls "$0" > "$1"
- inputValue: GCS path
- outputPath: Paths
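
The "Paths" output is the raw gsutil listing, one URI per line. A hedged sketch of a downstream function-based component that reads that listing as a file (the component name here is illustrative, not defined in this repository):

# Hedged sketch: consume the "Paths" output of the List blobs component.
from kfp.components import create_component_from_func, InputPath

def count_listed_blobs(paths_path: InputPath()) -> int:
    """Counts the URIs in the newline-separated listing written by List blobs."""
    with open(paths_path) as f:
        return sum(1 for line in f if line.strip())

count_listed_blobs_op = create_component_from_func(count_listed_blobs)
# In a pipeline: count_listed_blobs_op(paths=list_blobs_task.output)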

@ -0,0 +1,27 @@
name: Upload to GCS
inputs:
- {name: Data}
- {name: GCS path, type: String}
outputs:
- {name: GCS path, type: String}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/upload_to_explicit_uri/component.yaml'
implementation:
container:
image: google/cloud-sdk
command:
- sh
- -ex
- -c
- |
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
fi
gsutil cp -r "$0" "$1"
mkdir -p "$(dirname "$2")"
echo "$1" > "$2"
- inputPath: Data
- inputValue: GCS path
- outputPath: GCS path

@ -0,0 +1,28 @@
name: Upload to GCS
description: Upload to GCS with unique URI suffix
inputs:
- {name: Data}
- {name: GCS path prefix, type: String}
outputs:
- {name: GCS path, type: String}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/storage/upload_to_unique_uri/component.yaml'
implementation:
container:
image: google/cloud-sdk
command:
- sh
- -ex
- -c
- |
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
fi
gsutil cp -r "$0" "$1"
mkdir -p "$(dirname "$2")"
echo "$1" > "$2"
- inputPath: Data
- concat: [{inputValue: GCS path prefix}, '{{workflow.uid}}_{{pod.name}}']
- outputPath: GCS path
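
A hedged usage sketch combining the download component and the unique-URI upload component above, assuming their YAMLs are saved locally as "download.yaml" and "upload_to_unique_uri.yaml" and that the bucket paths are placeholders (none of these names come from this repository):

# Hedged sketch: copy data to a GCS destination that is unique per run.
import kfp
from kfp.components import load_component_from_file

download_op = load_component_from_file('download.yaml')                  # hypothetical local path
upload_op = load_component_from_file('upload_to_unique_uri.yaml')        # hypothetical local path

@kfp.dsl.pipeline(name='Copy to a unique GCS destination')
def copy_pipeline(source_path: str = 'gs://my-bucket/source-dir/',       # placeholder
                  destination_prefix: str = 'gs://my-bucket/staging/'):  # placeholder
    download_task = download_op(gcs_path=source_path)
    # The component concatenates {{workflow.uid}}_{{pod.name}} onto the prefix,
    # so every run (and every pod) writes to a distinct GCS location.
    upload_task = upload_op(
        data=download_task.output,
        gcs_path_prefix=destination_prefix,
    )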

@ -0,0 +1,3 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/automl](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/automl). This directory will be removed by the end of 2021.

@ -0,0 +1,3 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/dataproc](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/dataproc). This directory will be removed by the end of 2021.

@ -0,0 +1,3 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/ml_engine](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/ml_engine). This directory will be removed by the end of 2021.

@ -0,0 +1,3 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/automl](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/automl). This directory will be removed by the end of September 2021.

@ -0,0 +1,3 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/Optimizer](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/Optimizer). This directory will be removed by the end of September 2021.

@ -0,0 +1,3 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/storage](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/storage). This directory will be removed by the end of September 2021.