chore(components): clean up deprecated GCP components (#8685)

* delete deprecated GCP components

* remove build and test scripts and references to the code/folder from master
Chen Sun 2023-02-14 14:42:33 -08:00 committed by GitHub
parent 8552226c41
commit 5fe67919db
181 changed files with 68 additions and 16734 deletions

View File

@ -32,21 +32,6 @@ steps:
# id: 'copyPythonSDKToLatest'
# waitFor: ['preparePythonSDK']
# # Build the Python Component SDK
# - name: 'python:2-alpine'
# entrypoint: '/bin/sh'
# args: ['-c', 'cd /workspace/components/gcp/container/component_sdk/python;python setup.py sdist --format=gztar; cp dist/*.tar.gz /workspace/kfp-component.tar.gz']
# id: 'preparePythonComponentSDK'
# waitFor: ["-"]
# - name: 'gcr.io/cloud-builders/gsutil'
# args: ['cp', '/workspace/kfp-component.tar.gz', 'gs://$PROJECT_ID/builds/$COMMIT_SHA/kfp-component.tar.gz']
# id: 'copyPythonComponentSDK'
# waitFor: ['preparePythonComponentSDK']
# - name: 'gcr.io/cloud-builders/gsutil'
# args: ['cp', '/workspace/kfp-component.tar.gz', 'gs://$PROJECT_ID/builds/latest/kfp-component.tar.gz']
# id: 'copyPythonComponentSDKToLatest'
# waitFor: ['preparePythonComponentSDK']
# Build the pipeline system images
- name: 'gcr.io/cloud-builders/docker'
entrypoint: /bin/bash
@ -147,13 +132,6 @@ steps:
id: 'buildGpuTrainer'
waitFor: ["-"]
# Build the Generic GCP component image
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'
args: ['-c', 'cd /workspace/components/gcp/container/ && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
id: 'buildGcpGenericComponent'
waitFor: ["-"]
# Build the local pipeline component images
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'

View File

@ -55,7 +55,7 @@ Name | Description | Type
:--- | :---------- | :---
cluster_name | The name of the cluster. | String
Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).
Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/release-1.7/components/gcp/dataproc/delete_cluster).
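To make the recycle note concrete, the sketch below pairs the create and delete cluster components with a KFP v1 exit handler so the cluster is torn down even when a downstream step fails. This is only a sketch: the `component.yaml` paths are assumed to exist under the release-1.7 tag, and the `project_id` / `region` / `name` input names are assumptions based on the component docs, not verified against the specs.

```python
# Hedged sketch (KFP v1 SDK): create a Dataproc cluster and guarantee deletion
# via an exit handler. The component.yaml URLs and input names are assumptions.
import kfp
from kfp import dsl, components

CREATE_CLUSTER_SPEC = ('https://raw.githubusercontent.com/kubeflow/pipelines/'
                       'release-1.7/components/gcp/dataproc/create_cluster/component.yaml')
DELETE_CLUSTER_SPEC = ('https://raw.githubusercontent.com/kubeflow/pipelines/'
                       'release-1.7/components/gcp/dataproc/delete_cluster/component.yaml')

create_cluster_op = components.load_component_from_url(CREATE_CLUSTER_SPEC)
delete_cluster_op = components.load_component_from_url(DELETE_CLUSTER_SPEC)

@dsl.pipeline(name='dataproc-create-then-recycle')
def recycle_cluster_pipeline(project_id: str, region: str = 'us-central1'):
    # The exit task runs whether or not the steps inside the handler succeed.
    delete_task = delete_cluster_op(project_id=project_id, region=region,
                                    name='kfp-temp-cluster')
    with dsl.ExitHandler(delete_task):
        create_cluster_op(project_id=project_id, region=region,
                          name='kfp-temp-cluster')

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(recycle_cluster_pipeline, 'recycle_cluster.yaml')
```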
## Cautions & requirements
@ -167,9 +167,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
## References
* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/create_cluster/sample.ipynb)
* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)
## License

View File

@ -39,7 +39,7 @@
":--- | :---------- | :---\n",
"cluster_name | The name of the cluster. | String\n",
"\n",
"Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).\n",
"Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/release-1.7/components/gcp/dataproc/delete_cluster).\n",
"\n",
"\n",
"## Cautions & requirements\n",
@ -211,9 +211,9 @@
"source": [
"## References\n",
"* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/create_cluster/sample.ipynb)\n",
"* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)\n",
"\n",
"## License\n",
@ -242,4 +242,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -138,9 +138,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/delete_cluster/sample.ipynb)
* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)

View File

@ -187,9 +187,9 @@
"source": [
"## References\n",
"\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/delete_cluster/sample.ipynb)\n",
"* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)\n",
"\n",
"\n",
@ -228,4 +228,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -196,9 +196,9 @@ The sample in the notebook will count the words in the input text and save them
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)
* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)
# License

View File

@ -279,9 +279,9 @@
"metadata": {},
"source": [
"## References\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)\n",
"* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)\n",
"\n",
"## License\n",
@ -310,4 +310,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -180,9 +180,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hive_job/sample.ipynb)
* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)
## License

View File

@ -230,9 +230,9 @@
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hive_job/sample.ipynb)\n",
"* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)\n",
"\n",
"## License\n",
@ -261,4 +261,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -191,9 +191,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_spark_job/sample.ipynb)
* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)
## License

View File

@ -232,9 +232,9 @@
"source": [
"## References\n",
"\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)\n",
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)\n",
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_spark_job/sample.ipynb)\n",
"* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)\n",
"\n",
"## License\n",
@ -263,4 +263,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -191,9 +191,9 @@ OUTPUT_FILES_PATTERN = OUTPUT_GCS_PATH + '*'
```
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)
* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/batch_predict/sample.ipynb)
* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
## License

View File

@ -276,9 +276,9 @@
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/batch_predict/sample.ipynb)\n",
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
"\n",
"## License\n",
@ -307,4 +307,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -186,9 +186,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
```
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)
* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/deploy/sample.ipynb)
* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)
* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)

View File

@ -247,9 +247,9 @@
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/deploy/sample.ipynb)\n",
"* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)\n",
"* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)\n",
"\n",

View File

@ -232,9 +232,9 @@ Use the following command to inspect the contents in the output directory:
```
## References
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/train/sample.ipynb)
* [AI Platform REST API - Resource: Job](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
## License

View File

@ -325,9 +325,9 @@
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/train/sample.ipynb)\n",
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
"\n",
"## License\n",

View File

@ -1,3 +0,0 @@
# Deprecation Warning
The components in this directory have been moved to [components/contrib/google-cloud/automl](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/automl). This directory will be removed by the end of 2021.
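
For pipelines that still reference these components, a minimal sketch of loading one of them from the new contrib location follows; the exact subdirectory and file name under `components/contrib/google-cloud/automl` are assumptions, so check the tree linked above for the real layout.

```python
# Minimal sketch, assuming the relocated component keeps the familiar
# <component_name>/component.yaml layout under the new contrib path.
from kfp import components

automl_create_dataset_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/'
    'components/contrib/google-cloud/automl/create_dataset_for_tables/component.yaml'
)
```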

View File

@ -1,69 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_create_dataset_for_tables(
gcp_project_id: str,
gcp_region: str,
display_name: str,
description: str = None,
tables_dataset_metadata: dict = {},
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str), ('dataset_url', 'URI')]):
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
'''
import google
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
dataset_dict = {
'display_name': display_name,
'description': description,
'tables_dataset_metadata': tables_dataset_metadata,
}
dataset = client.create_dataset(
location_path,
dataset_dict,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
print(dataset)
dataset_id = dataset.name.rsplit('/', 1)[-1]
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
)
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_create_dataset_for_tables_op = create_component_from_func(
automl_create_dataset_for_tables,
output_component_file='component.yaml',
base_image='python:3.7',
packages_to_install=['google-cloud-automl==0.4.0'],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml",
},
)
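
As a brief aside on how this file was consumed: running it as a script regenerates `component.yaml` via `create_component_from_func`, and that generated spec is what pipelines actually load. A minimal sketch, assuming the deleted module was saved as `component.py` next to its spec (both names are assumptions):

```python
# Minimal sketch, assuming the module above lived at
# create_dataset_for_tables/component.py and was run once to (re)write its spec:
#
#   python3 component.py        # writes component.yaml next to the script
#
from kfp import components

create_dataset_op = components.load_component_from_file('component.yaml')
print(create_dataset_op.component_spec.name)   # "Automl create dataset for tables"
```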

View File

@ -1,148 +0,0 @@
name: Automl create dataset for tables
description: automl_create_dataset_for_tables creates an empty Dataset for AutoML
tables
inputs:
- {name: gcp_project_id, type: String}
- {name: gcp_region, type: String}
- {name: display_name, type: String}
- {name: description, type: String, optional: true}
- {name: tables_dataset_metadata, type: JsonObject, default: '{}', optional: true}
- {name: retry, optional: true}
- {name: timeout, type: Float, optional: true}
- {name: metadata, type: JsonObject, optional: true}
outputs:
- {name: dataset_path, type: String}
- {name: create_time, type: String}
- {name: dataset_id, type: String}
- {name: dataset_url, type: URI}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml'
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_create_dataset_for_tables(
gcp_project_id ,
gcp_region ,
display_name ,
description = None,
tables_dataset_metadata = {},
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata = None,
) :
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
'''
import google
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
dataset_dict = {
'display_name': display_name,
'description': description,
'tables_dataset_metadata': tables_dataset_metadata,
}
dataset = client.create_dataset(
location_path,
dataset_dict,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
print(dataset)
dataset_id = dataset.name.rsplit('/', 1)[-1]
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
)
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
import json
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import argparse
_parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables')
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--description", dest="description", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_create_dataset_for_tables(**_parsed_args)
_output_serializers = [
_serialize_str,
_serialize_str,
_serialize_str,
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --gcp-project-id
- {inputValue: gcp_project_id}
- --gcp-region
- {inputValue: gcp_region}
- --display-name
- {inputValue: display_name}
- if:
cond: {isPresent: description}
then:
- --description
- {inputValue: description}
- if:
cond: {isPresent: tables_dataset_metadata}
then:
- --tables-dataset-metadata
- {inputValue: tables_dataset_metadata}
- if:
cond: {isPresent: retry}
then:
- --retry
- {inputValue: retry}
- if:
cond: {isPresent: timeout}
then:
- --timeout
- {inputValue: timeout}
- if:
cond: {isPresent: metadata}
then:
- --metadata
- {inputValue: metadata}
- '----output-paths'
- {outputPath: dataset_path}
- {outputPath: create_time}
- {outputPath: dataset_id}
- {outputPath: dataset_url}
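
For readers unfamiliar with the v1 component spec, the `if: {isPresent: ...}` entries above mean an optional flag is only appended to the container command when the corresponding input was actually supplied. The list below is a hypothetical illustration of the resolved arguments for a run that sets only the required inputs plus `timeout`; the concrete values and output paths are placeholders.

```python
# Hypothetical resolved argv for the component above (placeholder values only).
resolved_args = [
    '--gcp-project-id', 'my-project',
    '--gcp-region', 'us-central1',
    '--display-name', 'my_tables_dataset',
    '--timeout', '300.0',            # present, so the flag is emitted
    # --description, --tables-dataset-metadata, --retry and --metadata are
    # omitted because their `isPresent` conditions are false
    '----output-paths',
    '/tmp/outputs/dataset_path/data',
    '/tmp/outputs/create_time/data',
    '/tmp/outputs/dataset_id/data',
    '/tmp/outputs/dataset_url/data',
]
```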

View File

@ -1,71 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_create_model_for_tables(
gcp_project_id: str,
gcp_region: str,
display_name: str,
dataset_id: str,
target_column_path: str = None,
input_feature_column_paths: list = None,
optimization_objective: str = 'MAXIMIZE_AU_PRC',
train_budget_milli_node_hours: int = 1000,
) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str), ('model_page_url', 'URI'),]):
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
model_dict = {
'display_name': display_name,
'dataset_id': dataset_id,
'tables_model_metadata': {
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
'optimization_objective': optimization_objective,
'train_budget_milli_node_hours': train_budget_milli_node_hours,
},
}
create_model_response = client.create_model(location_path, model_dict)
print('Create model operation: {}'.format(create_model_response.operation))
result = create_model_response.result()
print(result)
model_name = result.name
model_id = model_name.rsplit('/', 1)[-1]
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
model_id=model_id,
)
return (model_name, model_id, model_url)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_create_model_for_tables_op = create_component_from_func(
automl_create_model_for_tables,
output_component_file='component.yaml',
base_image='python:3.7',
packages_to_install=['google-cloud-automl==0.4.0'],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml",
},
)
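
Since this model component takes a `dataset_id` rather than a dataset path, the natural wiring is to feed it the `dataset_id` output of the dataset component shown earlier. A hedged sketch of that two-step pipeline follows; the spec file locations and parameter values are placeholders rather than recommendations.

```python
# Hedged sketch (KFP v1 SDK): chain dataset creation into model training by
# passing the dataset_id output between the two components. The component.yaml
# paths and parameter values are placeholders.
import kfp
from kfp import dsl, components

create_dataset_op = components.load_component_from_file(
    'create_dataset_for_tables/component.yaml')
create_model_op = components.load_component_from_file(
    'create_model_for_tables/component.yaml')

@dsl.pipeline(name='automl-tables-train')
def automl_tables_pipeline(gcp_project_id: str, gcp_region: str = 'us-central1'):
    dataset_task = create_dataset_op(
        gcp_project_id=gcp_project_id,
        gcp_region=gcp_region,
        display_name='my_tables_dataset',
    )
    create_model_op(
        gcp_project_id=gcp_project_id,
        gcp_region=gcp_region,
        display_name='my_tables_model',
        dataset_id=dataset_task.outputs['dataset_id'],
        train_budget_milli_node_hours=1000,
    )

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(automl_tables_pipeline, 'automl_tables_pipeline.yaml')
```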

View File

@ -1,142 +0,0 @@
name: Automl create model for tables
inputs:
- {name: gcp_project_id, type: String}
- {name: gcp_region, type: String}
- {name: display_name, type: String}
- {name: dataset_id, type: String}
- {name: target_column_path, type: String, optional: true}
- {name: input_feature_column_paths, type: JsonArray, optional: true}
- {name: optimization_objective, type: String, default: MAXIMIZE_AU_PRC, optional: true}
- {name: train_budget_milli_node_hours, type: Integer, default: '1000', optional: true}
outputs:
- {name: model_path, type: String}
- {name: model_id, type: String}
- {name: model_page_url, type: URI}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml'
implementation:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_create_model_for_tables(
gcp_project_id ,
gcp_region ,
display_name ,
dataset_id ,
target_column_path = None,
input_feature_column_paths = None,
optimization_objective = 'MAXIMIZE_AU_PRC',
train_budget_milli_node_hours = 1000,
) :
from google.cloud import automl
client = automl.AutoMlClient()
location_path = client.location_path(gcp_project_id, gcp_region)
model_dict = {
'display_name': display_name,
'dataset_id': dataset_id,
'tables_model_metadata': {
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
'optimization_objective': optimization_objective,
'train_budget_milli_node_hours': train_budget_milli_node_hours,
},
}
create_model_response = client.create_model(location_path, model_dict)
print('Create model operation: {}'.format(create_model_response.operation))
result = create_model_response.result()
print(result)
model_name = result.name
model_id = model_name.rsplit('/', 1)[-1]
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
project_id=gcp_project_id,
region=gcp_region,
dataset_id=dataset_id,
model_id=model_id,
)
return (model_name, model_id, model_url)
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import json
import argparse
_parser = argparse.ArgumentParser(prog='Automl create model for tables', description='')
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_create_model_for_tables(**_parsed_args)
_output_serializers = [
_serialize_str,
_serialize_str,
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --gcp-project-id
- {inputValue: gcp_project_id}
- --gcp-region
- {inputValue: gcp_region}
- --display-name
- {inputValue: display_name}
- --dataset-id
- {inputValue: dataset_id}
- if:
cond: {isPresent: target_column_path}
then:
- --target-column-path
- {inputValue: target_column_path}
- if:
cond: {isPresent: input_feature_column_paths}
then:
- --input-feature-column-paths
- {inputValue: input_feature_column_paths}
- if:
cond: {isPresent: optimization_objective}
then:
- --optimization-objective
- {inputValue: optimization_objective}
- if:
cond: {isPresent: train_budget_milli_node_hours}
then:
- --train-budget-milli-node-hours
- {inputValue: train_budget_milli_node_hours}
- '----output-paths'
- {outputPath: model_path}
- {outputPath: model_id}
- {outputPath: model_page_url}

View File

@ -1,44 +0,0 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def automl_deploy_model(
model_path: str,
) -> NamedTuple('Outputs', [
('model_path', str),
]):
"""Deploys a trained model.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.deploy_model(
name=model_path,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (model_path, )
if __name__ == '__main__':
automl_deploy_model_op = create_component_from_func(
automl_deploy_model,
output_component_file='component.yaml',
base_image='python:3.8',
packages_to_install=[
'google-cloud-automl==2.0.0',
],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml",
},
)
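
Because the function body only depends on google-cloud-automl 2.0.0, it can also be exercised outside KFP when debugging deployment issues. A minimal sketch, assuming Application Default Credentials are configured and that the file above is importable as `component` (the module name and model path are made up for illustration):

```python
# Hedged local-debug sketch: requires `pip install google-cloud-automl==2.0.0`
# and Application Default Credentials; the import path and model path are
# placeholders, not the component's documented usage.
from component import automl_deploy_model

outputs = automl_deploy_model(
    model_path='projects/my-project/locations/us-central1/models/TBL1234567890',
)
print(outputs)   # -> ('projects/my-project/locations/us-central1/models/TBL1234567890',)
```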

View File

@ -1,87 +0,0 @@
name: Automl deploy model
description: |-
Deploys a trained model.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: model_path, type: String}
outputs:
- {name: model_path, type: String}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_deploy_model(
model_path,
):
"""Deploys a trained model.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.deploy_model(
name=model_path,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (model_path, )
def _serialize_str(str_value: str) -> str:
if not isinstance(str_value, str):
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
return str_value
import argparse
_parser = argparse.ArgumentParser(prog='Automl deploy model', description="Deploys a trained model.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_deploy_model(**_parsed_args)
_output_serializers = [
_serialize_str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --model-path
- {inputValue: model_path}
- '----output-paths'
- {outputPath: model_path}

View File

@ -1,61 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_export_data_to_gcs(
dataset_path: str,
gcs_output_uri_prefix: str = None,
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = {},
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
"""Exports dataset data to GCS."""
import sys
import subprocess
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
response = client.export_data(
name=dataset_path,
output_config=output_config,
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata=metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (gcs_output_uri_prefix, )
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_export_data_to_gcs_op = create_component_from_func(
automl_export_data_to_gcs,
output_component_file='component.yaml',base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml",
},
)

View File

@ -1,117 +0,0 @@
name: Automl export data to gcs
description: |
Exports dataset data to GCS.
inputs:
- name: dataset_path
type: String
- name: gcs_output_uri_prefix
optional: true
type: String
- name: timeout
optional: true
type: Float
- default: '{}'
name: metadata
optional: true
type: JsonObject
outputs:
- name: gcs_output_uri_prefix
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_export_data_to_gcs(
dataset_path: str,
gcs_output_uri_prefix: str = None,
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = {},
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
"""Exports dataset data to GCS."""
import sys
import subprocess
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
response = client.export_data(
name=dataset_path,
output_config=output_config,
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata=metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (gcs_output_uri_prefix, )
import json
import argparse
_parser = argparse.ArgumentParser(prog='Automl export data to gcs', description='Exports dataset data to GCS.\n')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_export_data_to_gcs(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
_output_serializers = [
str
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- if:
cond:
isPresent: gcs_output_uri_prefix
then:
- --gcs-output-uri-prefix
- inputValue: gcs_output_uri_prefix
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: gcs_output_uri_prefix

View File

@ -1,56 +0,0 @@
from typing import NamedTuple
from kfp.components import create_component_from_func
def automl_export_model_to_gcs(
model_path: str,
gcs_output_uri_prefix: str,
model_format: str = 'tf_saved_model',
) -> NamedTuple('Outputs', [
('model_directory', 'Uri'),
]):
"""Exports a trained model to a user specified Google Cloud Storage location.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.export_model(
name=model_path,
output_config=automl.ModelExportOutputConfig(
model_format=model_format,
gcs_destination=automl.GcsDestination(
output_uri_prefix=gcs_output_uri_prefix,
),
),
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (metadata.export_model_details.output_info.gcs_output_directory, )
if __name__ == '__main__':
automl_export_model_to_gcs_op = create_component_from_func(
automl_export_model_to_gcs,
output_component_file='component.yaml',
base_image='python:3.8',
packages_to_install=[
'google-cloud-automl==2.0.0',
],
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml",
},
)

View File

@ -1,107 +0,0 @@
name: Automl export model to gcs
description: |-
Exports a trained model to a user specified Google Cloud Storage location.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
- {name: model_path, type: String}
- {name: gcs_output_uri_prefix, type: String}
- {name: model_format, type: String, default: tf_saved_model, optional: true}
outputs:
- {name: model_directory, type: Uri}
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml'
implementation:
container:
image: python:3.8
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
&& "$0" "$@"
- python3
- -u
- -c
- |
def automl_export_model_to_gcs(
model_path,
gcs_output_uri_prefix,
model_format = 'tf_saved_model',
):
"""Exports a trained model to a user specified Google Cloud Storage location.
Args:
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
"""
from google.cloud import automl
client = automl.AutoMlClient()
response = client.export_model(
name=model_path,
output_config=automl.ModelExportOutputConfig(
model_format=model_format,
gcs_destination=automl.GcsDestination(
output_uri_prefix=gcs_output_uri_prefix,
),
),
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
return (metadata.export_model_details.output_info.gcs_output_directory, )
import argparse
_parser = argparse.ArgumentParser(prog='Automl export model to gcs', description="Exports a trained model to a user specified Google Cloud Storage location.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.\n model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--model-format", dest="model_format", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_export_model_to_gcs(**_parsed_args)
_output_serializers = [
str,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --model-path
- {inputValue: model_path}
- --gcs-output-uri-prefix
- {inputValue: gcs_output_uri_prefix}
- if:
cond: {isPresent: model_format}
then:
- --model-format
- {inputValue: model_format}
- '----output-paths'
- {outputPath: model_directory}

View File

@ -1,61 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_import_data_from_bigquery(
dataset_path,
input_uri: str,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'bigquery_source': {
'input_uri': input_uri,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_import_data_from_bigquery_op = create_component_from_func(
automl_import_data_from_bigquery,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml",
},
)

View File

@ -1,112 +0,0 @@
name: Automl import data from bigquery
inputs:
- name: dataset_path
- name: input_uri
type: String
- name: retry
optional: true
- name: timeout
optional: true
- name: metadata
type: JsonObject
optional: true
outputs:
- name: dataset_path
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_import_data_from_bigquery(
dataset_path,
input_uri: str,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'bigquery_source': {
'input_uri': input_uri,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
import json
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl import data from bigquery', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--input-uri", dest="input_uri", type=str, required=True, default=_missing_arg)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_import_data_from_bigquery(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- --input-uri
- inputValue: input_uri
- if:
cond:
isPresent: retry
then:
- --retry
- inputValue: retry
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: dataset_path

View File

@ -1,62 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_import_data_from_gcs(
dataset_path: str,
input_uris: list,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'gcs_source': {
'input_uris': input_uris,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_import_data_from_gcs_op = create_component_from_func(
automl_import_data_from_gcs,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml",
},
)

View File

@ -1,113 +0,0 @@
name: Automl import data from gcs
inputs:
- name: dataset_path
type: String
- name: input_uris
type: JsonArray
- name: retry
optional: true
- name: timeout
optional: true
- name: metadata
type: JsonObject
optional: true
outputs:
- name: dataset_path
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_import_data_from_gcs(
dataset_path: str,
input_uris: list,
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
import google
from google.cloud import automl
client = automl.AutoMlClient()
input_config = {
'gcs_source': {
'input_uris': input_uris,
},
}
response = client.import_data(
dataset_path,
input_config,
retry or google.api_core.gapic_v1.method.DEFAULT,
timeout or google.api_core.gapic_v1.method.DEFAULT,
metadata,
)
result = response.result()
print(result)
metadata = response.metadata
print(metadata)
return (dataset_path)
import json
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl import data from gcs', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--input-uris", dest="input_uris", type=json.loads, required=True, default=_missing_arg)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_import_data_from_gcs(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- --input-uris
- inputValue: input_uris
- if:
cond:
isPresent: retry
then:
- --retry
- inputValue: retry
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: dataset_path

View File

@ -1,78 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_prediction_service_batch_predict(
model_path,
gcs_input_uris: list = None,
gcs_output_uri_prefix: str = None,
bq_input_uri: str = None,
bq_output_uri: str = None,
params=None,
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
input_config = {}
if gcs_input_uris:
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
if bq_input_uri:
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
output_config = {}
if gcs_output_uri_prefix:
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
if bq_output_uri:
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
from google.cloud import automl
client = automl.PredictionServiceClient()
response = client.batch_predict(
model_path,
input_config,
output_config,
params,
retry,
timeout,
metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
output_info = metadata.batch_predict_details.output_info
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_prediction_service_batch_predict_op = create_component_from_func(
automl_prediction_service_batch_predict,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml",
},
)

View File

@ -1,175 +0,0 @@
name: Automl prediction service batch predict
inputs:
- name: model_path
- name: gcs_input_uris
type: JsonArray
optional: true
- name: gcs_output_uri_prefix
type: String
optional: true
- name: bq_input_uri
type: String
optional: true
- name: bq_output_uri
type: String
optional: true
- name: params
optional: true
- name: retry
optional: true
- name: timeout
optional: true
- name: metadata
type: JsonObject
optional: true
outputs:
- name: gcs_output_directory
type: String
- name: bigquery_output_dataset
type: String
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_prediction_service_batch_predict(
model_path,
gcs_input_uris: str = None,
gcs_output_uri_prefix: str = None,
bq_input_uri: str = None,
bq_output_uri: str = None,
params=None,
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
metadata: dict = None,
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
input_config = {}
if gcs_input_uris:
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
if bq_input_uri:
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
output_config = {}
if gcs_output_uri_prefix:
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
if bq_output_uri:
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
from google.cloud import automl
client = automl.PredictionServiceClient()
response = client.batch_predict(
model_path,
input_config,
output_config,
params,
retry,
timeout,
metadata,
)
print('Operation started:')
print(response.operation)
result = response.result()
metadata = response.metadata
print('Operation finished:')
print(metadata)
output_info = metadata.batch_predict_details.output_info
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
import json
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl prediction service batch predict', description='')
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--gcs-input-uris", dest="gcs_input_uris", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=_missing_arg)
_parser.add_argument("--bq-input-uri", dest="bq_input_uri", type=str, required=False, default=_missing_arg)
_parser.add_argument("--bq-output-uri", dest="bq_output_uri", type=str, required=False, default=_missing_arg)
_parser.add_argument("--params", dest="params", type=str, required=False, default=_missing_arg)
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_prediction_service_batch_predict(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --model-path
- inputValue: model_path
- if:
cond:
isPresent: gcs_input_uris
then:
- --gcs-input-uris
- inputValue: gcs_input_uris
- if:
cond:
isPresent: gcs_output_uri_prefix
then:
- --gcs-output-uri-prefix
- inputValue: gcs_output_uri_prefix
- if:
cond:
isPresent: bq_input_uri
then:
- --bq-input-uri
- inputValue: bq_input_uri
- if:
cond:
isPresent: bq_output_uri
then:
- --bq-output-uri
- inputValue: bq_output_uri
- if:
cond:
isPresent: params
then:
- --params
- inputValue: params
- if:
cond:
isPresent: retry
then:
- --retry
- inputValue: retry
- if:
cond:
isPresent: timeout
then:
- --timeout
- inputValue: timeout
- if:
cond:
isPresent: metadata
then:
- --metadata
- inputValue: metadata
- '----output-paths'
- outputPath: gcs_output_directory
- outputPath: bigquery_output_dataset

View File

@ -1,59 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import NamedTuple
def automl_split_dataset_table_column_names(
dataset_path: str,
target_column_name: str,
table_index: int = 0,
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
from google.cloud import automl
client = automl.AutoMlClient()
list_table_specs_response = client.list_table_specs(dataset_path)
table_specs = [s for s in list_table_specs_response]
print('table_specs=')
print(table_specs)
table_spec_name = table_specs[table_index].name
list_column_specs_response = client.list_column_specs(table_spec_name)
column_specs = [s for s in list_column_specs_response]
print('column_specs=')
print(column_specs)
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
feature_column_names = [s.name for s in feature_column_specs]
import json
return (target_column_spec.name, json.dumps(feature_column_names))
if __name__ == '__main__':
from kfp.components import create_component_from_func
automl_split_dataset_table_column_names_op = create_component_from_func(
automl_split_dataset_table_column_names,
output_component_file='component.yaml',
base_image='python:3.7',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml",
},
)

View File

@ -1,95 +0,0 @@
name: Automl split dataset table column names
inputs:
- name: dataset_path
type: String
- name: target_column_name
type: String
- name: table_index
type: Integer
default: '0'
optional: true
outputs:
- name: target_column_path
type: String
- name: feature_column_paths
type: JsonArray
metadata:
annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml'
implementation:
container:
image: python:3.7
command:
- python3
- -u
- -c
- |
from typing import NamedTuple
def automl_split_dataset_table_column_names(
dataset_path: str,
target_column_name: str,
table_index: int = 0,
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
import sys
import subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
from google.cloud import automl
client = automl.AutoMlClient()
list_table_specs_response = client.list_table_specs(dataset_path)
table_specs = [s for s in list_table_specs_response]
print('table_specs=')
print(table_specs)
table_spec_name = table_specs[table_index].name
list_column_specs_response = client.list_column_specs(table_spec_name)
column_specs = [s for s in list_column_specs_response]
print('column_specs=')
print(column_specs)
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
feature_column_names = [s.name for s in feature_column_specs]
import json
return (target_column_spec.name, json.dumps(feature_column_names))
import argparse
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
_parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = automl_split_dataset_table_column_names(**_parsed_args)
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
_outputs = [_outputs]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(str(_outputs[idx]))
args:
- --dataset-path
- inputValue: dataset_path
- --target-column-name
- inputValue: target_column_name
- if:
cond:
isPresent: table_index
then:
- --table-index
- inputValue: table_index
- '----output-paths'
- outputPath: target_column_path
- outputPath: feature_column_paths

View File

@ -1,3 +0,0 @@
# Deprecation Warning
The components in this directory have been moved to [components/google-cloud/google_cloud_pipeline_components/experimental/bigquery](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/experimental/bigquery). This directory will be removed by the end of 2021.

View File

@ -1,298 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Name\n",
"\n",
"Gather training data by querying BigQuery \n",
"\n",
"\n",
"# Labels\n",
"\n",
"GCP, BigQuery, Kubeflow, Pipeline\n",
"\n",
"\n",
"# Summary\n",
"\n",
"A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a Cloud Storage bucket.\n",
"\n",
"\n",
"# Details\n",
"\n",
"\n",
"## Intended use\n",
"\n",
"Use this Kubeflow component to:\n",
"* Select training data by submitting a query to BigQuery.\n",
"* Output the training data into a Cloud Storage bucket as CSV files.\n",
"\n",
"\n",
"## Runtime arguments:\n",
"\n",
"\n",
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
"|----------|-------------|----------|-----------|-----------------|---------|\n",
"| query | The query used by BigQuery to fetch the results. | No | String | | |\n",
"| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |\n",
"| dataset_id | The ID of the persistent BigQuery dataset to store the results of the query. If the dataset does not exist, the operation will create a new one. | Yes | String | | None |\n",
"| table_id | The ID of the BigQuery table to store the results of the query. If the table ID is absent, the operation will generate a random ID for the table. | Yes | String | | None |\n",
"| output_gcs_path | The path to the Cloud Storage bucket to store the query output. | Yes | GCSPath | | None |\n",
"| dataset_location | The location where the dataset is created. Defaults to US. | Yes | String | | US |\n",
"| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSONobject which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |\n",
"## Input data schema\n",
"\n",
"The input data is a BigQuery job containing a query that pulls data f rom various sources. \n",
"\n",
"\n",
"## Output:\n",
"\n",
"Name | Description | Type\n",
":--- | :---------- | :---\n",
"output_gcs_path | The path to the Cloud Storage bucket containing the query output in CSV format. | GCSPath\n",
"\n",
"## Cautions & requirements\n",
"\n",
"To use the component, the following requirements must be met:\n",
"\n",
"* The BigQuery API is enabled.\n",
"* The component can authenticate to use GCP APIs. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
"* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.\n",
"* The Kubeflow user service account is a member of the `roles/storage.objectCreator `role of the Cloud Storage output bucket.\n",
"\n",
"## Detailed description\n",
"This Kubeflow Pipeline component is used to:\n",
"* Submit a query to BigQuery.\n",
" * The query results are persisted in a dataset table in BigQuery.\n",
" * An extract job is created in BigQuery to extract the data from the dataset table and output it to a Cloud Storage bucket as CSV files.\n",
"\n",
" Use the code below as an example of how to run your BigQuery job.\n",
"\n",
"### Sample\n",
"\n",
"Note: The following sample code works in an IPython notebook or directly in Python code.\n",
"\n",
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"\n",
"!pip3 install kfp --upgrade"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Load the component using KFP SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.components as comp\n",
"\n",
"bigquery_query_op = comp.load_component_from_url(\n",
" 'https://raw.githubusercontent.com/kubeflow/pipelines/01a23ae8672d3b18e88adf3036071496aca3552d/components/gcp/bigquery/query/component.yaml')\n",
"help(bigquery_query_op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample\n",
"\n",
"Note: The following sample code works in IPython notebook or directly in Python code.\n",
"\n",
"In this sample, we send a query to get the top questions from stackdriver public data and output the data to a Cloud Storage bucket. Here is the query:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Set sample parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Required Parameters\n",
"PROJECT_ID = '<Please put your project ID here>'\n",
"GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional Parameters\n",
"EXPERIMENT_NAME = 'Bigquery -Query'\n",
"OUTPUT_PATH = '{}/bigquery/query/questions.csv'.format(GCS_WORKING_DIR)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Run the component as a single pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kfp.dsl as dsl\n",
"import json\n",
"@dsl.pipeline(\n",
" name='Bigquery query pipeline',\n",
" description='Bigquery query pipeline'\n",
")\n",
"def pipeline(\n",
" query=QUERY, \n",
" project_id = PROJECT_ID, \n",
" dataset_id='', \n",
" table_id='', \n",
" output_gcs_path=OUTPUT_PATH, \n",
" dataset_location='US', \n",
" job_config=''\n",
"):\n",
" bigquery_query_op(\n",
" query=query, \n",
" project_id=project_id, \n",
" dataset_id=dataset_id, \n",
" table_id=table_id, \n",
" output_gcs_path=output_gcs_path, \n",
" dataset_location=dataset_location, \n",
" job_config=job_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compile the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_func = pipeline\n",
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
"import kfp.compiler as compiler\n",
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit the pipeline for execution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify pipeline argument values\n",
"arguments = {}\n",
"\n",
"#Get or create an experiment and submit a pipeline run\n",
"import kfp\n",
"client = kfp.Client()\n",
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"#Submit a pipeline run\n",
"run_name = pipeline_func.__name__ + ' run'\n",
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Inspect the output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!gsutil cat $OUTPUT_PATH"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)\n",
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/bigquery/query/sample.ipynb)\n",
"* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)\n",
"\n",
"## License\n",
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,187 +0,0 @@
# Name
Gather data by querying BigQuery and save it in a CSV file.
# Labels
GCP, BigQuery, Kubeflow, Pipeline
# Summary
A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a CSV file that is available for other components to use.
# Details
## Intended use
Use this Kubeflow component to:
* Select training data by submitting a query to BigQuery.
* Output the training data into a CSV file.
## Runtime arguments:
| Argument | Description | Optional | Data type | Accepted values | Default |
|----------|-------------|----------|-----------|-----------------|---------|
| query | The query used by BigQuery to fetch the results. | No | String | | |
| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |
| output_filename | The file name of the output file. | Yes | String | | bq_results.csv |
| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSON object which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |
## Input data schema
The input data is a BigQuery job containing a query that pulls data from various sources.
## Output:
Name | Description | Type
:--- | :---------- | :---
output_path | The path to the file containing the query output in CSV format. | OutputPath
## Cautions & requirements
To use the component, the following requirements must be met:
* The BigQuery API is enabled.
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.
* The Kubeflow user service account is a member of the `roles/storage.objectCreator` role of the Cloud Storage output bucket.
## Detailed description
This Kubeflow Pipeline component is used to:
* Submit a query to BigQuery.
* The query results are extracted and stored locally as a CSV file that is available to other Kubeflow components.
Use the code below as an example of how to run your BigQuery job.
## Sample
Note: The following sample code works in an IPython notebook or directly in Python code.
1. Install the Kubeflow Pipelines SDK
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using KFP SDK
```python
import kfp.components as comp
bigquery_query_op = comp.load_component_from_url(
'https://raw.githubusercontent.com/kubeflow/pipelines/01a23ae8672d3b18e88adf3036071496aca3552d/components/gcp/bigquery/query/to_gcs/component.yaml')
help(bigquery_query_op)
```
### Query
In this sample, we send a query to get the top questions from the Stack Overflow public dataset and output the data to a CSV file which other components can access. Here is the query:
```python
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
```
#### Set sample parameters
```python
# Required Parameters
PROJECT_ID = '<Please put your project ID here>'
```
```python
# Optional Parameters
FILE_NAME = 'test.csv'
EXPERIMENT_NAME = 'Bigquery-Query'
```
#### Run the component as a single pipeline
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Bigquery query pipeline',
description='Bigquery query pipeline'
)
def pipeline(
query=QUERY,
project_id = PROJECT_ID,
output_filename=FILE_NAME,
job_config=''
):
bigquery_query_op(
query=query,
project_id=project_id,
output_filename=output_filename,
job_config=job_config)
```
#### Compile the pipeline
```python
pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify pipeline argument values
arguments = {}
#Get or create an experiment and submit a pipeline run
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
#### Use the output in a pipeline
A small example of how to use the output from the component; here `read_csv` is any component of interest that can consume a CSV file.
```python
def pipeline(
query=QUERY,
project_id = PROJECT_ID,
job_config=''
):
bq_out = bigquery_query_op(
query=query,
project_id=project_id,
output_filename=FILE_NAME,
job_config=job_config)
read_csv(input_path=bq_out.outputs["table"] + "/" + FILE_NAME)
```
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -1,62 +0,0 @@
# Export to file for next processing step in pipeline
# Copyright 2020 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Bigquery - Query
description: |
A Kubeflow Pipeline component to submit a query to Google Cloud Bigquery and
store the results to a csv file.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: query
description: 'The query used by Bigquery service to fetch the results.'
type: String
- name: project_id
description: 'The project to execute the query job.'
type: GCPProjectID
- name: job_config
description: >-
The full config spec for the query job.See
[QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig)
for details.
default: ''
type: Dict
- name: output_filename
description: 'The output file name'
default: 'bq_results.csv'
type: String
outputs:
- name: MLPipeline UI metadata
type: UI metadata
- name: table
description: 'The path to the result from BigQuery'
type: CSV
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.bigquery, query,
--query, {inputValue: query},
--project_id, {inputValue: project_id},
--output_path, {outputPath: table},
--output_filename, {inputValue: output_filename},
--job_config, {inputValue: job_config},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -1,187 +0,0 @@
# Name
Gather data by querying BigQuery and save it to GCS.
# Labels
GCP, BigQuery, Kubeflow, Pipeline
# Summary
A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a Cloud Storage bucket.
# Details
## Intended use
Use this Kubeflow component to:
* Select data by submitting a query to BigQuery.
* Output the data into a Cloud Storage bucket as CSV files.
## Runtime arguments:
| Argument | Description | Optional | Data type | Accepted values | Default |
|----------|-------------|----------|-----------|-----------------|---------|
| query | The query used by BigQuery to fetch the results. | No | String | | |
| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |
| dataset_id | The ID of the persistent BigQuery dataset to store the results of the query. If the dataset does not exist, the operation will create a new one. | Yes | String | | None |
| table_id | The ID of the BigQuery table to store the results of the query. If the table ID is absent, the operation will generate a random ID for the table. | Yes | String | | None |
| dataset_location | The location where the dataset is created. Defaults to US. | Yes | String | | US |
| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSON object which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |
## Input data schema
The input data is a BigQuery job containing a query that pulls data from various sources.
## Output:
Name | Description | Type
:--- | :---------- | :---
output_gcs_path | The path to the Cloud Storage bucket containing the query output in CSV format. | GCSPath
## Cautions & requirements
To use the component, the following requirements must be met:
* The BigQuery API is enabled.
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.
* The Kubeflow user service account is a member of the `roles/storage.objectCreator` role of the Cloud Storage output bucket.
## Detailed description
This Kubeflow Pipeline component is used to:
* Submit a query to BigQuery.
* The query results are persisted in a dataset table in BigQuery.
* The data is extracted from the table and stored in the Cloud Storage bucket as CSV files.
Use the code below as an example of how to run your BigQuery job.
### Sample
Note: The following sample code works in an IPython notebook or directly in Python code.
1. Install the Kubeflow Pipelines SDK
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using KFP SDK
```python
import kfp.components as comp
bigquery_query_op = comp.load_component_from_url(
'https://raw.githubusercontent.com/kubeflow/pipelines/01a23ae8672d3b18e88adf3036071496aca3552d/components/gcp/bigquery/query/to_gcs/component.yaml')
help(bigquery_query_op)
```
### Query
In this sample, we send a query to get the top questions from the Stack Overflow public dataset and output the data to a Cloud Storage bucket. Here is the query:
```python
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
```
#### Set sample parameters
```python
# Required Parameters
PROJECT_ID = '<Please put your project ID here>'
GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash
```
```python
# Optional Parameters
EXPERIMENT_NAME = 'Bigquery-Query'
OUTPUT_PATH = '{}/bigquery/query/questions.csv'.format(GCS_WORKING_DIR)
```
#### Run the component as a single pipeline
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Bigquery query pipeline',
description='Bigquery query pipeline'
)
def pipeline(
query=QUERY,
project_id = PROJECT_ID,
dataset_id='',
table_id='',
output_gcs_path=OUTPUT_PATH,
dataset_location='US',
job_config=''
):
bigquery_query_op(
query=query,
project_id=project_id,
dataset_id=dataset_id,
table_id=table_id,
output_gcs_path=output_gcs_path,
dataset_location=dataset_location,
job_config=job_config)
```
#### Compile the pipeline
```python
pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify pipeline argument values
arguments = {}
#Get or create an experiment and submit a pipeline run
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
#### Inspect the output
```python
!gsutil cat $OUTPUT_PATH
```
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/bigquery/query/sample.ipynb)
* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -1,88 +0,0 @@
# Export to bucket in gcs
# Copyright 2020 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Bigquery - Query
description: |
A Kubeflow Pipeline component to submit a query to Google Cloud Bigquery
service and dump outputs to a Google Cloud Storage blob.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: query
description: 'The query used by Bigquery service to fetch the results.'
type: String
- name: project_id
description: 'The project to execute the query job.'
type: GCPProjectID
- name: dataset_id
description: 'The ID of the persistent dataset to keep the results of the query.'
default: ''
type: String
- name: table_id
description: >-
The ID of the table to keep the results of the query. If absent, the operation
will generate a random id for the table.
default: ''
type: String
- name: output_gcs_path
description: 'The path to the Cloud Storage bucket to store the query output.'
default: ''
type: GCSPath
- name: output_destination_format
description: 'The name of the output destination format. Default is CSV, and you can also choose NEWLINE_DELIMITED_JSON and AVRO.'
default: 'CSV'
type: String
- name: dataset_location
description: 'The location to create the dataset. Defaults to `US`.'
default: 'US'
type: String
- name: job_config
description: >-
The full config spec for the query job.See
[QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig)
for details.
default: ''
type: Dict
- name: output_kfp_path
description: 'The path to where the file should be stored.'
default: ''
type: String
outputs:
- name: output_gcs_path
description: 'The path to the Cloud Storage bucket containing the query output in CSV format.'
type: GCSPath
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ["python", -u, -m, "kfp_component.launcher"]
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.bigquery, query,
--query, {inputValue: query},
--project_id, {inputValue: project_id},
--dataset_id, {inputValue: dataset_id},
--table_id, {inputValue: table_id},
--dataset_location, {inputValue: dataset_location},
--output_gcs_path, {inputValue: output_gcs_path},
--output_destination_format, {inputValue: output_destination_format},
--job_config, {inputValue: job_config},
--output_gcs_path_output_path, {outputPath: output_gcs_path},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -1,173 +0,0 @@
# Name
Gather data by querying BigQuery and save it to a table in BigQuery.
# Labels
GCP, BigQuery, Kubeflow, Pipeline
# Summary
A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a table in BigQuery.
# Details
## Intended use
Use this Kubeflow component to:
* Select data by submitting a query to BigQuery.
* Output the data into a table in BigQuery.
## Runtime arguments:
| Argument | Description | Optional | Data type | Accepted values | Default |
|----------|-------------|----------|-----------|-----------------|---------|
| query | The query used by BigQuery to fetch the results. | No | String | | |
| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |
| dataset_id | The ID of the persistent BigQuery dataset to store the results of the query. If the dataset does not exist, the operation will create a new one. | Yes | String | | None |
| table_id | The ID of the BigQuery table to store the results of the query. If the table ID is absent, the operation will generate a random ID for the table. | Yes | String | | None |
| dataset_location | The location where the dataset is created. Defaults to US. | Yes | String | | US |
| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSON object which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |
## Input data schema
The input data is a BigQuery job containing a query that pulls data from various sources.
## Output:
This component stores the query results in the specified BigQuery table; it does not expose a data file output.
## Cautions & requirements
To use the component, the following requirements must be met:
* The BigQuery API is enabled.
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.
* The Kubeflow user service account is a member of the `roles/storage.objectCreator` role of the Cloud Storage output bucket.
## Detailed description
This Kubeflow Pipeline component is used to:
* Submit a query to BigQuery.
* The query results are persisted in a dataset table in BigQuery.
### Sample
Note: The following sample code works in an IPython notebook or directly in Python code.
1. Install the Kubeflow Pipelines SDK
```python
%%capture --no-stderr
!pip3 install kfp --upgrade
```
2. Load the component using KFP SDK
```python
import kfp.components as comp
bigquery_query_op = comp.load_component_from_url(
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/bigquery/query/to_table/component.yaml')
help(bigquery_query_op)
```
### Query
In this sample, we send a query to get the top questions from the Stack Overflow public dataset and write the result to a table.
```python
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
```
#### Set sample parameters
```python
# Required Parameters
PROJECT_ID = '<Please put your project ID here>'
```
```python
# Optional Parameters
EXPERIMENT_NAME = 'Bigquery-Query'
DATASET_ID = "TEST_DATASET"
TABLE_ID = "TEST_TABLE"
```
#### Run the component as a single pipeline
```python
import kfp.dsl as dsl
import json
@dsl.pipeline(
name='Bigquery query pipeline',
description='Bigquery query pipeline'
)
def pipeline(
query=QUERY,
project_id=PROJECT_ID,
dataset_id=DATASET_ID,
table_id=TABLE_ID,
dataset_location='US',
job_config=''
):
bigquery_query_op(
query=query,
project_id=project_id,
dataset_id=dataset_id,
table_id=table_id,
dataset_location=dataset_location,
job_config=job_config)
```
#### Compile the pipeline
```python
pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.zip'
import kfp.compiler as compiler
compiler.Compiler().compile(pipeline_func, pipeline_filename)
```
#### Submit the pipeline for execution
```python
#Specify pipeline argument values
arguments = {}
#Get or create an experiment and submit a pipeline run
import kfp
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
```
#### Inspect the output
Find the created table under the specified dataset ID and table ID.
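As a quick programmatic check, the minimal sketch below lists basic metadata for the result table with the `google-cloud-bigquery` client library; it assumes that library is installed in the notebook environment and reuses the `PROJECT_ID`, `DATASET_ID`, and `TABLE_ID` values set above.
```python
from google.cloud import bigquery

# Connect with the same project that executed the query job.
client = bigquery.Client(project=PROJECT_ID)

# Fetch the table metadata and print a short summary.
table = client.get_table('{}.{}.{}'.format(PROJECT_ID, DATASET_ID, TABLE_ID))
print('Table {} has {} rows and {} columns.'.format(
    table.full_table_id, table.num_rows, len(table.schema)))
```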
## References
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)
## License
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.

View File

@ -1,70 +0,0 @@
# export to new table.
# Copyright 2020 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Bigquery - Query
description: |
A Kubeflow Pipeline component to submit a query to Google Cloud Bigquery
service and dump outputs to new table.
metadata:
labels:
add-pod-env: 'true'
inputs:
- name: query
description: 'The query used by Bigquery service to fetch the results.'
type: String
- name: project_id
description: 'The project to execute the query job.'
type: GCPProjectID
- name: dataset_id
description: 'The ID of the persistent dataset to keep the results of the query.'
default: ''
type: String
- name: table_id
description: >-
The ID of the table to keep the results of the query. If absent, the operation
will generate a random id for the table.
default: ''
type: String
- name: dataset_location
description: 'The location to create the dataset. Defaults to `US`.'
default: 'US'
type: String
- name: job_config
description: >-
The full config spec for the query job.See
[QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig)
for details.
default: ''
type: Dict
outputs:
- name: MLPipeline UI metadata
type: UI metadata
implementation:
container:
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
command: ['python', '-u', '-m', 'kfp_component.launcher']
args: [
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
kfp_component.google.bigquery, query,
--query, {inputValue: query},
--project_id, {inputValue: project_id},
--dataset_id, {inputValue: dataset_id},
--table_id, {inputValue: table_id},
--dataset_location, {inputValue: dataset_location},
--job_config, {inputValue: job_config},
]
env:
KFP_POD_NAME: "{{pod.name}}"

View File

@ -1,36 +0,0 @@
# Copyright 2021 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM python:3.7
RUN apt-get update && apt-get install -y --no-install-recommends \
wget patch \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
# Pin pip version to work around https://github.com/apache/beam/issues/22218
RUN python3 -m pip install pip==21.2.4
RUN python3 -m pip install -r \
requirements.txt --quiet --no-cache-dir \
&& rm -f requirements.txt
ADD build /ml
WORKDIR /ml
RUN pip install .
# The patch sets User Agent for telemetry purpose.
# It is based on "google-api-python-client==1.7.8", and needs to be updated when upgrading the package.
RUN patch /usr/local/lib/python3.7/site-packages/googleapiclient/http.py < /ml/patches/http.patch
ENTRYPOINT ["python", "-u", "-m", "kfp_component.launcher"]

View File

@ -1,20 +0,0 @@
#!/bin/bash -e
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
mkdir -p ./build
cp -arv ./component_sdk/python/. ./build/
../../build_image.sh -l ml-pipeline-gcp "$@"
rm -rf ./build

View File

@ -1,6 +0,0 @@
approvers:
- hongye-sun
reviewers:
- Ark-kun
- gaoning777
- hongye-sun

View File

@ -1,15 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import launcher, core, google

View File

@ -1,16 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._kfp_execution_context import KfpExecutionContext
from . import _display as display

View File

@ -1,117 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import threading
import logging
_OUTPUT_PATH = os.environ.get('KFP_UI_METADATA_PATH', '/mlpipeline-ui-metadata.json')
_OUTPUT_FILE_LOCK = threading.Lock()
def display(obj):
"""Display an object to KFP UI.
Args:
obj (object): the object to output display metadata for. It follows the same
convention defined by the IPython display API. The currently supported representation
functions are:
* `_repr_markdown_`: returns markdown content which is converted into
web-app metadata for the KFP UI.
* `_repr_kfpmetadata_`: returns a KFP metadata JSON object, which follows
the convention from https://www.kubeflow.org/docs/pipelines/output-viewer/.
The supported built-in objects are Markdown, Tensorboard, and Link.
"""
obj_dir = dir(obj)
if '_repr_markdown_' in obj_dir:
display_markdown(obj)
if '_repr_kfpmetadata_' in obj_dir:
display_kfpmetadata(obj)
logging.info(str(obj))
def display_markdown(obj):
"""Display markdown representation to KFP UI.
"""
if '_repr_markdown_' not in dir(obj):
raise ValueError('_repr_markdown_ function is not present.')
markdown = obj._repr_markdown_()
_output_ui_metadata({
'type': 'markdown',
'source': markdown,
'storage': 'inline'
})
def display_kfpmetadata(obj):
"""Display from KFP UI metadata
"""
if '_repr_kfpmetadata_' not in dir(obj):
raise ValueError('_repr_kfpmetadata_ function is not present.')
kfp_metadata = obj._repr_kfpmetadata_()
_output_ui_metadata(kfp_metadata)
def _output_ui_metadata(output):
with _OUTPUT_FILE_LOCK:
metadata = {}
if os.path.isfile(_OUTPUT_PATH):
with open(_OUTPUT_PATH, 'r') as f:
metadata = json.load(f)
with open(_OUTPUT_PATH, 'w') as f:
if 'outputs' not in metadata:
metadata['outputs'] = []
metadata['outputs'].append(output)
json.dump(metadata, f)
class Markdown(object):
"""Class to hold markdown raw data.
"""
def __init__(self, data):
self._data = data
def _repr_markdown_(self):
return self._data
def __repr__(self):
return self._data
class Tensorboard(object):
"""Class to hold tensorboard metadata.
"""
def __init__(self, job_dir):
self._job_dir = job_dir
def _repr_kfpmetadata_(self):
return {
'type': 'tensorboard',
'source': self._job_dir
}
def __repr__(self):
return 'Open Tensorboard at: {}'.format(self._job_dir)
class Link(Markdown):
"""Class to hold an markdown hyperlink data.
"""
def __init__(self, href, text):
super(Link, self).__init__(
'## [{}]({})'.format(text, href))
self._href = href
self._text = text
def __repr__(self):
return '{}: {}'.format(self._text, self._href)
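For reference, a minimal usage sketch of the deleted display helpers; the GCS path and link URL below are illustrative, and the import assumes the `kfp_component.core` re-export shown earlier.
from kfp_component.core import display

# Each call appends one entry to the KFP UI metadata file ($KFP_UI_METADATA_PATH).
display.display(display.Markdown('## Training finished'))
display.display(display.Tensorboard('gs://my-bucket/logs'))
display.display(display.Link('https://console.cloud.google.com', 'Console'))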

View File

@ -1,158 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import signal
import logging
import json
from datetime import datetime
import os
import hashlib
import uuid
import re
from kubernetes import client, config
from kubernetes.client.rest import ApiException
DEFAULT_NAMESPACE = 'kubeflow'
KFP_POD_ENV_NAME = 'KFP_POD_NAME'
KFP_NAMESPACE_ENV_NAME = 'KFP_NAMESPACE'
ARGO_EXECUTION_CONTROL_ANNOTATION = 'workflows.argoproj.io/execution'
ARGO_NODE_NAME_ANNOTATION = 'workflows.argoproj.io/node-name'
class KfpExecutionContext:
"""Execution context for running inside Kubeflow Pipelines.
The base class is aware of the KFP environment and can cascade
pipeline cancel or deadline events to the operation through the
``on_cancel`` handler.
Args:
on_cancel: optional, function to handle KFP cancel event.
"""
def __init__(self, on_cancel=None):
self._load_kfp_environment()
self._context_id = self._generate_context_id()
logging.info('Start KFP context with ID: {}.'.format(
self._context_id))
self._on_cancel = on_cancel
self._original_sigterm_handler = None
def __enter__(self):
self._original_sigterm_handler = signal.getsignal(signal.SIGTERM)
signal.signal(signal.SIGTERM, self._exit_gracefully)
return self
def __exit__(self, type, value, traceback):
signal.signal(signal.SIGTERM, self._original_sigterm_handler)
def context_id(self):
"""Returns a stable context ID across retries. The ID is in
32 bytes hex format.
"""
return self._context_id
def under_kfp_environment(self):
"""Returns true if the execution is under KFP environment.
"""
return self._pod_name and self._k8s_client and self._argo_node_name
def _generate_context_id(self):
if self.under_kfp_environment():
stable_node_name = re.sub(r'\(\d+\)$', '', self._argo_node_name)
return hashlib.md5(stable_node_name.encode()).hexdigest()
else:
return uuid.uuid1().hex
def _load_kfp_environment(self):
self._pod_name = os.environ.get(KFP_POD_ENV_NAME, None)
self._namespace = os.environ.get(KFP_NAMESPACE_ENV_NAME, DEFAULT_NAMESPACE)
if not self._pod_name:
self._k8s_client = None
else:
try:
config.load_incluster_config()
self._k8s_client = client.CoreV1Api()
except Exception as e:
logging.warning('Failed to load kubernetes client:'
' {}.'.format(e))
self._k8s_client = None
if self._pod_name and self._k8s_client:
self._argo_node_name = self._get_argo_node_name()
if not self.under_kfp_environment():
logging.warning('Running without KFP context.')
def _get_argo_node_name(self):
pod = self._get_pod()
if not pod or not pod.metadata or not pod.metadata.annotations:
return None
return pod.metadata.annotations.get(
ARGO_NODE_NAME_ANNOTATION, None)
def _exit_gracefully(self, signum, frame):
logging.info('SIGTERM signal received.')
if (self._on_cancel and
self.under_kfp_environment() and
self._should_cancel()):
logging.info('Cancelling...')
self._on_cancel()
logging.info('Exit')
def _should_cancel(self):
"""Checks argo's execution config deadline and decide whether the operation
should be cancelled.
Argo cancels workflow by setting deadline to 0 and sends SIGTERM
signal to main container with 10s graceful period.
"""
pod = self._get_pod()
if not pod or not pod.metadata or not pod.metadata.annotations:
logging.info('No pod metadata or annotations.')
return False
argo_execution_config_json = pod.metadata.annotations.get(
ARGO_EXECUTION_CONTROL_ANNOTATION, None)
if not argo_execution_config_json:
logging.info('No argo execution config data.')
return False
try:
argo_execution_config = json.loads(argo_execution_config_json)
except Exception as e:
logging.error("Error deserializing argo execution config: {}".format(e))
return False
deadline_json = argo_execution_config.get('deadline', None)
if not deadline_json:
logging.info('No argo execution deadline config.')
return False
try:
deadline = datetime.strptime(deadline_json, '%Y-%m-%dT%H:%M:%SZ')
except Exception as e:
logging.error("Error converting deadline string to datetime: {}".format(e))
return False
return datetime.now() > deadline
def _get_pod(self):
logging.info('Fetching latest pod metadata: {}.'.format(
self._pod_name))
try:
return self._k8s_client.read_namespaced_pod(
self._pod_name, self._namespace)
except Exception as e:
logging.error('Failed to get pod: {}'.format(e))
return None
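A minimal usage sketch of the deleted KfpExecutionContext; the cancel handler below is a hypothetical placeholder.
from kfp_component.core import KfpExecutionContext

def cancel():
    # Hypothetical cleanup to run when Argo cancels the step (SIGTERM).
    print('cancelling long-running operation')

with KfpExecutionContext(on_cancel=cancel) as ctx:
    # context_id() is stable across retries of the same pipeline node.
    job_id = 'job_' + ctx.context_id()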

View File

@ -1,15 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import ml_engine, dataflow, dataproc

View File

@ -1,15 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._query import query

View File

@ -1,155 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import os
from google.cloud import bigquery
from google.cloud.bigquery.job import ExtractJobConfig, DestinationFormat
from google.api_core import exceptions
from kfp_component.core import KfpExecutionContext, display
from .. import common as gcp_common
# TODO(hongyes): make this path configurable as an environment variable
KFP_OUTPUT_PATH = '/tmp/kfp/output/'
def query(query, project_id, dataset_id=None, table_id=None,
output_gcs_path=None, dataset_location='US', job_config=None,
output_path=None, output_filename=None, output_destination_format="CSV",
job_object_output_path='/tmp/kfp/output/bigquery/query-job.json',
output_gcs_path_output_path='/tmp/kfp/output/bigquery/query-output-path.txt',
output_dataset_id_output_path='/tmp/kfp/output/bigquery/query-dataset-id.txt',
output_table_id_output_path='/tmp/kfp/output/bigquery/query-table-id.txt',
):
"""Submit a query to Bigquery service and dump outputs to Bigquery table or
a GCS blob.
Args:
query (str): The query used by Bigquery service to fetch the results.
project_id (str): The project to execute the query job.
dataset_id (str): The ID of the persistent dataset to keep the results
of the query. If the dataset does not exist, the operation will
create a new one.
table_id (str): The ID of the table to keep the results of the query. If
absent, the operation will generate a random id for the table.
output_gcs_path (str): The GCS blob path to dump the query results to.
dataset_location (str): The location to create the dataset. Defaults to `US`.
job_config (dict): The full config spec for the query job.
output_path (str): The local directory in which the query result will be stored.
output_filename (str): The name of the file the results will be written to.
output_destination_format (str): The output destination format. Defaults to CSV;
NEWLINE_DELIMITED_JSON and AVRO are also supported.
Returns:
The API representation of the completed query job.
"""
client = bigquery.Client(project=project_id, location=dataset_location)
if not job_config:
job_config = bigquery.QueryJobConfig()
job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
else:
job_config = bigquery.QueryJobConfig.from_api_repr(job_config)
job_id = None
def cancel():
if job_id:
client.cancel_job(job_id)
with KfpExecutionContext(on_cancel=cancel) as ctx:
job_id = 'query_' + ctx.context_id()
query_job = _get_job(client, job_id)
table_ref = None
if not query_job:
dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path,
dataset_location)
if dataset_ref:
if not table_id:
table_id = job_id
table_ref = dataset_ref.table(table_id)
job_config.destination = table_ref
gcp_common.dump_file(output_dataset_id_output_path, table_ref.dataset_id)
gcp_common.dump_file(output_table_id_output_path, table_ref.table_id)
query_job = client.query(query, job_config, job_id=job_id)
_display_job_link(project_id, job_id)
if output_path is not None:  # Write to local file
result = query_job.result()
if not os.path.exists(output_path):
os.makedirs(output_path)
df = result.to_dataframe()
df.to_csv(os.path.join(output_path, output_filename))
else:
query_job.result()
if output_gcs_path:
job_id = 'extract_' + ctx.context_id()
extract_job = _get_job(client, job_id)
logging.info('Extracting data from table {} to {}.'.format(str(table_ref), output_gcs_path))
if not extract_job:
job_config = ExtractJobConfig(destination_format=output_destination_format)
extract_job = client.extract_table(table_ref, output_gcs_path, job_config=job_config)
extract_job.result() # Wait for export to finish
# TODO: Replace '-' with empty string when most users upgrade to Argo version which has the fix: https://github.com/argoproj/argo-workflows/pull/1653
gcp_common.dump_file(output_gcs_path_output_path, output_gcs_path or '-')
gcp_common.dump_file(job_object_output_path, json.dumps(query_job.to_api_repr()))
return query_job.to_api_repr()
def _get_job(client, job_id):
try:
return client.get_job(job_id)
except exceptions.NotFound:
return None
def _prepare_dataset_ref(client, dataset_id, output_gcs_path, dataset_location):
if not output_gcs_path and not dataset_id:
return None
if not dataset_id:
dataset_id = 'kfp_tmp_dataset'
dataset_ref = client.dataset(dataset_id)
dataset = _get_dataset(client, dataset_ref)
if not dataset:
logging.info('Creating dataset {}'.format(dataset_id))
dataset = _create_dataset(client, dataset_ref, dataset_location)
return dataset_ref
def _get_dataset(client, dataset_ref):
try:
return client.get_dataset(dataset_ref)
except exceptions.NotFound:
return None
def _create_dataset(client, dataset_ref, location):
dataset = bigquery.Dataset(dataset_ref)
dataset.location = location
return client.create_dataset(dataset)
def _display_job_link(project_id, job_id):
display.display(display.Link(
href= 'https://console.cloud.google.com/bigquery?project={}'
'&j={}&page=queryresults'.format(project_id, job_id),
text='Query Details'
))
def _dump_outputs(job, output_path, table_ref):
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-job.json',
json.dumps(job.to_api_repr()))
if not output_path:
output_path = '-' # Replace with empty string when we upgrade to Argo version which has the fix: https://github.com/argoproj/argo-workflows/pull/1653
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-output-path.txt',
output_path)
(dataset_id, table_id) = (table_ref.dataset_id, table_ref.table_id) if table_ref else ('-', '-')
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-dataset-id.txt',
dataset_id)
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-table-id.txt',
table_id)
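A minimal usage sketch of the deleted BigQuery `query` helper; the project, dataset, and GCS path are illustrative, and the import assumes the module lives at `kfp_component.google.bigquery`.
from kfp_component.google.bigquery import query

job = query(
    'SELECT word, COUNT(*) AS n FROM `bigquery-public-data.samples.shakespeare` GROUP BY word',
    project_id='my-project',
    dataset_id='kfp_tmp_dataset',
    output_gcs_path='gs://my-bucket/bq/result.csv',
)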

View File

@ -1,16 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._utils import (normalize_name, dump_file,
check_resource_changed, wait_operation_done, ClientWithRetries)

View File

@ -1,177 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import logging
import re
import os
import time
from functools import wraps
from typing import Any, Callable, Optional, Tuple
def normalize_name(name,
valid_first_char_pattern='a-zA-Z',
valid_char_pattern='0-9a-zA-Z_',
invalid_char_placeholder='_',
prefix_placeholder='x_'):
"""Normalize a name to a valid resource name.
Uses the ``valid_first_char_pattern`` and ``valid_char_pattern`` regex patterns
to find invalid characters in ``name`` and replaces them with
``invalid_char_placeholder``, or prefixes the name with ``prefix_placeholder``
if the first character is invalid.
Args:
name: The name to be normalized.
valid_first_char_pattern: The regex pattern for the first character.
valid_char_pattern: The regex pattern for all the characters in the name.
invalid_char_placeholder: The placeholder to replace invalid characters.
prefix_placeholder: The placeholder to prefix the name if the first char
is invalid.
Returns:
The normalized name. Unchanged if all characters are valid.
"""
if not name:
return name
normalized_name = re.sub('[^{}]+'.format(valid_char_pattern),
invalid_char_placeholder, name)
if not re.match('[{}]'.format(valid_first_char_pattern),
normalized_name[0]):
normalized_name = prefix_placeholder + normalized_name
if name != normalized_name:
logging.info('Normalize name from "{}" to "{}".'.format(
name, normalized_name))
return normalized_name
def dump_file(path, content):
"""Dumps string into local file.
Args:
path: the local path to the file.
content: the string content to dump.
"""
directory = os.path.dirname(path)
if not os.path.exists(directory):
os.makedirs(directory)
elif os.path.exists(path):
logging.warning('The file {} will be overwritten.'.format(path))
with open(path, 'w') as f:
f.write(content)
def check_resource_changed(requested_resource,
existing_resource, property_names):
"""Check if a resource has been changed.
The function checks requested resource with existing resource
by comparing specified property names. Check fails if any property
name in the list is in ``requested_resource`` but its value is
different with the value in ``existing_resource``.
Args:
requested_resource: the user-requested resource payload.
existing_resource: the existing resource payload from data storage.
property_names: a list of property names.
Returns:
True if ``requested_resource`` has been changed.
"""
for property_name in property_names:
if not property_name in requested_resource:
continue
existing_value = existing_resource.get(property_name, None)
if requested_resource[property_name] != existing_value:
return True
return False
def wait_operation_done(get_operation, wait_interval):
"""Waits for an operation to be done.
Args:
get_operation: a function that returns the current operation payload.
wait_interval: the wait interval in seconds between polls of the
operation status.
Returns:
The completed operation.
"""
while True:
operation = get_operation()
operation_name = operation.get('name')
done = operation.get('done', False)
if not done:
logging.info('Operation {} is not done. Wait for {}s.'.format(
operation_name, wait_interval))
time.sleep(wait_interval)
continue
error = operation.get('error', None)
if error:
raise RuntimeError('Failed to complete operation {}: {} {}'.format(
operation_name,
error.get('code', 'Unknown code'),
error.get('message', 'Unknown message'),
))
return operation
def with_retries(
func: Callable,
on_error: Optional[Callable[[], Any]] = None,
errors: Tuple[Exception, ...] = Exception,
number_of_retries: int = 5,
delay: float = 1,
):
"""Retry decorator.
The decorator catches `errors`, calls `on_error` and retries after waiting `delay` seconds.
Args:
number_of_retries (int): Total number of retries if error is raised.
delay (float): Number of seconds to wait between consecutive retries.
"""
@wraps(func)
def wrapper(self, *args, **kwargs):
remaining_retries = number_of_retries
while remaining_retries:
try:
return func(self, *args, **kwargs)
except errors as e:
remaining_retries -= 1
if not remaining_retries:
raise
logging.warning(
'Caught {}. Retrying in {} seconds...'.format(
e.__class__.__name__, delay
)
)
time.sleep(delay)
if on_error:
on_error()
return wrapper
class ClientWithRetries:
def __init__(self):
self._build_client()
for name, member in self.__dict__.items():
if callable(member) and not name.startswith("_"):
self.__dict__[name] = with_retries(func=member, errors=(BrokenPipeError, IOError), on_error=self._build_client)
@abc.abstractmethod
def _build_client(self):
raise NotImplementedError()
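A minimal sketch of the deleted common helpers; the paths and resource payloads are illustrative, and the import assumes the module lives at `kfp_component.google.common`.
from kfp_component.google.common import normalize_name, dump_file, check_resource_changed

name = normalize_name('my cluster #1')            # -> 'my_cluster_1'
dump_file('/tmp/kfp/output/demo/name.txt', name)  # creates the directory if needed
changed = check_resource_changed(
    {'imageVersion': '1.5'},                      # requested payload
    {'imageVersion': '1.4', 'zone': 'us-a'},      # existing payload
    ['imageVersion'])                             # -> True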

View File

@ -1,17 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._launch_template import launch_template
from ._launch_flex_template import launch_flex_template
from ._launch_python import launch_python

View File

@ -1,79 +0,0 @@
# Copyright 2021 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import googleapiclient.discovery as discovery
from googleapiclient import errors
from ..common import ClientWithRetries
class DataflowClient(ClientWithRetries):
def _build_client(self):
self._df = discovery.build('dataflow', 'v1b3', cache_discovery=False)
def launch_template(
self, project_id, gcs_path, location, validate_only, launch_parameters
):
return self._df.projects().locations().templates().launch(
projectId=project_id,
gcsPath=gcs_path,
location=location,
validateOnly=validate_only,
body=launch_parameters
).execute()
def launch_flex_template(self, project_id, request_body, location):
return self._df.projects().locations().flexTemplates().launch(
projectId=project_id, location=location, body=request_body
).execute()
def get_job(self, project_id, job_id, location=None, view=None):
return self._df.projects().locations().jobs().get(
projectId=project_id,
jobId=job_id,
location=self._get_location(location),
view=view
).execute()
def cancel_job(self, project_id, job_id, location):
return self._df.projects().locations().jobs().update(
projectId=project_id,
jobId=job_id,
location=self._get_location(location),
body={
'requestedState': 'JOB_STATE_CANCELLED'
}
).execute()
def list_aggregated_jobs(
self,
project_id,
filter=None,
view=None,
page_size=None,
page_token=None,
location=None
):
return self._df.projects().jobs().aggregated(
projectId=project_id,
filter=filter,
view=view,
pageSize=page_size,
pageToken=page_token,
location=location
).execute()
def _get_location(self, location):
if not location:
location = 'us-central1'
return location

View File

@ -1,121 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
import json
import os
import tempfile
from kfp_component.core import display
from .. import common as gcp_common
from ..storage import download_blob, parse_blob_path, is_gcs_path
_JOB_SUCCESSFUL_STATES = ['JOB_STATE_DONE', 'JOB_STATE_UPDATED', 'JOB_STATE_DRAINED']
_JOB_FAILED_STATES = ['JOB_STATE_STOPPED', 'JOB_STATE_FAILED', 'JOB_STATE_CANCELLED']
_JOB_TERMINATED_STATES = _JOB_SUCCESSFUL_STATES + _JOB_FAILED_STATES
def wait_for_job_done(df_client, project_id, job_id, location=None, wait_interval=30):
while True:
job = df_client.get_job(project_id, job_id, location=location)
state = job.get('currentState', None)
if is_job_done(state):
return job
elif is_job_terminated(state):
# Terminated with error state
raise RuntimeError('Job {} failed with error state: {}.'.format(
job_id,
state
))
else:
logging.info('Job {} is in pending state {}.'
' Waiting for {} seconds for next poll.'.format(
job_id,
state,
wait_interval
))
time.sleep(wait_interval)
def wait_and_dump_job(df_client, project_id, location, job,
wait_interval,
job_id_output_path,
job_object_output_path,
):
display_job_link(project_id, job)
job_id = job.get('id')
job = wait_for_job_done(df_client, project_id, job_id,
location, wait_interval)
gcp_common.dump_file(job_object_output_path, json.dumps(job))
gcp_common.dump_file(job_id_output_path, job.get('id'))
return job
def is_job_terminated(job_state):
return job_state in _JOB_TERMINATED_STATES
def is_job_done(job_state):
return job_state in _JOB_SUCCESSFUL_STATES
def display_job_link(project_id, job):
location = job.get('location')
job_id = job.get('id')
display.display(display.Link(
href = 'https://console.cloud.google.com/dataflow/'
'jobsDetail/locations/{}/jobs/{}?project={}'.format(
location, job_id, project_id),
text = 'Job Details'
))
def stage_file(local_or_gcs_path):
if not is_gcs_path(local_or_gcs_path):
return local_or_gcs_path
_, blob_path = parse_blob_path(local_or_gcs_path)
file_name = os.path.basename(blob_path)
local_file_path = os.path.join(tempfile.mkdtemp(), file_name)
download_blob(local_or_gcs_path, local_file_path)
return local_file_path
def get_staging_location(staging_dir, context_id):
if not staging_dir:
return None
staging_location = os.path.join(staging_dir, context_id)
logging.info('staging_location: {}'.format(staging_location))
return staging_location
def read_job_id_and_location(storage_client, staging_location):
if staging_location:
job_blob = _get_job_blob(storage_client, staging_location)
if job_blob.exists():
job_data = job_blob.download_as_bytes().decode().split(',')
# Returns (job_id, location)
logging.info('Found existing job {}.'.format(job_data))
return (job_data[0], job_data[1])
return (None, None)
def upload_job_id_and_location(storage_client, staging_location, job_id, location):
if not staging_location:
return
if not location:
location = ''
data = '{},{}'.format(job_id, location)
job_blob = _get_job_blob(storage_client, staging_location)
logging.info('Uploading {} to {}.'.format(data, job_blob))
job_blob.upload_from_string(data)
def _get_job_blob(storage_client, staging_location):
bucket_name, staging_blob_name = parse_blob_path(staging_location)
job_blob_name = os.path.join(staging_blob_name, 'kfp/dataflow/launch_python/job.txt')
bucket = storage_client.bucket(bucket_name)
return bucket.blob(job_blob_name)
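A minimal sketch of how the deleted Dataflow helpers fit together; the project, job ID, and region are illustrative, and the private-module imports assume the package layout shown above.
from kfp_component.google.dataflow._client import DataflowClient
from kfp_component.google.dataflow._common_ops import wait_for_job_done

df_client = DataflowClient()
# Polls get_job every wait_interval seconds until a terminal state; raises on failed states.
job = wait_for_job_done(df_client, 'my-project', '2023-01-01_00_00_00-123',
                        location='us-central1', wait_interval=30)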

View File

@ -1,103 +0,0 @@
import logging
from google.cloud import storage
from kfp_component.core import KfpExecutionContext
from ._client import DataflowClient
from ._common_ops import (
wait_and_dump_job, get_staging_location, read_job_id_and_location,
upload_job_id_and_location
)
def launch_flex_template(
project_id,
location,
launch_parameters,
validate_only=False,
staging_dir=None,
wait_interval=30,
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
"""Launches a dataflow job from a flex template.
Args:
project_id (str): Required. The ID of the Cloud Platform project that the job belongs to.
location (str): The regional endpoint to which to direct the request.
launch_parameters (dict): Parameters to provide to the template
being launched. Schema defined in
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#LaunchFlexTemplateParameter.
`jobName` will be replaced by a generated name.
validate_only (boolean): If true, the request is validated but
not actually executed. Defaults to false.
staging_dir (str): Optional. The GCS directory for keeping staging files.
A random subdirectory will be created under the directory to keep job info
for resuming the job in case of failure.
wait_interval (int): The wait seconds between polling.
job_id_output_path (str): Optional. Output file to save job_id of execution
job_object_output_path (str): Optional. Output file to save job details of execution
Returns:
The completed job.
"""
storage_client = storage.Client()
df_client = DataflowClient()
job_id = None
def cancel():
if job_id:
df_client.cancel_job(project_id, job_id, location)
with KfpExecutionContext(on_cancel=cancel) as ctx:
staging_location = get_staging_location(staging_dir, ctx.context_id())
job_id, _ = read_job_id_and_location(storage_client, staging_location)
# Continue waiting for the job if its ID has already been uploaded to the staging location.
if job_id:
job = df_client.get_job(project_id, job_id, location)
job = wait_and_dump_job(
df_client,
project_id,
location,
job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)
logging.info(f'Skipping, existing job: {job}')
return job
if launch_parameters is None:
launch_parameters = {}
request_body = {
'launchParameter': launch_parameters,
'validateOnly': validate_only
}
request_body['launchParameter']['jobName'] = 'job-' + ctx.context_id()
response = df_client.launch_flex_template(
project_id, request_body, location
)
job = response.get('job', None)
if not job:
# Validate only mode
return job
job_id = job.get('id')
upload_job_id_and_location(
storage_client, staging_location, job_id, location
)
job = wait_and_dump_job(
df_client,
project_id,
location,
job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)
logging.info(f'Completed job: {job}')
return job
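A minimal usage sketch of the deleted `launch_flex_template`; the project, bucket, and template spec path are illustrative.
from kfp_component.google.dataflow import launch_flex_template

job = launch_flex_template(
    project_id='my-project',
    location='us-central1',
    launch_parameters={
        'containerSpecGcsPath': 'gs://my-bucket/templates/spec.json',
        'parameters': {'input': 'gs://my-bucket/input.csv'},
    },
    staging_dir='gs://my-bucket/staging',
)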

View File

@ -1,119 +0,0 @@
# Copyright 2021 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import re
import logging
import os
from google.cloud import storage
from kfp_component.core import KfpExecutionContext
from ._client import DataflowClient
from ._common_ops import (wait_and_dump_job, stage_file, get_staging_location,
read_job_id_and_location, upload_job_id_and_location)
from ._process import Process
from ..storage import parse_blob_path
def launch_python(python_file_path, project_id, region, staging_dir=None, requirements_file_path=None,
args=[], wait_interval=30,
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
"""Launch a self-executing beam python file.
Args:
python_file_path (str): The gcs or local path to the python file to run.
project_id (str): The ID of the GCP project to run the Dataflow job.
region (str): The GCP region to run the Dataflow job.
staging_dir (str): Optional. The GCS directory for keeping staging files.
A random subdirectory will be created under the directory to keep job info
for resuming the job in case of failure, and it will be passed as the
`staging_location` and `temp_location` command-line args of the Beam code.
requirements_file_path (str): Optional, the gcs or local path to the pip
requirements file.
args (list): The list of args to pass to the python file.
wait_interval (int): The wait seconds between polling.
Returns:
The completed job.
"""
storage_client = storage.Client()
df_client = DataflowClient()
job_id = None
location = None
def cancel():
if job_id:
df_client.cancel_job(
project_id,
job_id,
location
)
with KfpExecutionContext(on_cancel=cancel) as ctx:
staging_location = get_staging_location(staging_dir, ctx.context_id())
job_id, location = read_job_id_and_location(storage_client, staging_location)
# Continue waiting for the job if its ID has already been uploaded to the staging location.
if job_id:
job = df_client.get_job(project_id, job_id, location)
return wait_and_dump_job(df_client, project_id, location, job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)
_install_requirements(requirements_file_path)
python_file_path = stage_file(python_file_path)
cmd = _prepare_cmd(project_id, region, python_file_path, args, staging_location)
sub_process = Process(cmd)
for line in sub_process.read_lines():
job_id, location = _extract_job_id_and_location(line)
if job_id:
logging.info('Found job id {} and location {}.'.format(job_id, location))
upload_job_id_and_location(storage_client, staging_location, job_id, location)
break
sub_process.wait_and_check()
if not job_id:
logging.warning('No dataflow job was found when '
'running the python file.')
return None
job = df_client.get_job(project_id, job_id,
location=location)
return wait_and_dump_job(df_client, project_id, location, job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)
def _prepare_cmd(project_id, region, python_file_path, args, staging_location):
dataflow_args = [
'--runner', 'DataflowRunner',
'--project', project_id,
'--region', region]
if staging_location:
dataflow_args += ['--staging_location', staging_location, '--temp_location', staging_location]
return (['python', '-u', python_file_path] +
dataflow_args + args)
def _extract_job_id_and_location(line):
"""Returns (job_id, location) from matched log.
"""
job_id_pattern = re.compile(
br'.*console.cloud.google.com/dataflow/jobs/(?P<location>[a-z|0-9|A-Z|\-|\_]+)/(?P<job_id>[a-z|0-9|A-Z|\-|\_]+).*')
matched_job_id = job_id_pattern.search(line or b'')  # the pattern is bytes, so the fallback must be bytes
if matched_job_id:
return (matched_job_id.group('job_id').decode(), matched_job_id.group('location').decode())
return (None, None)
def _install_requirements(requirements_file_path):
if not requirements_file_path:
return
requirements_file_path = stage_file(requirements_file_path)
subprocess.check_call(['pip', 'install', '-r', requirements_file_path])
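A minimal usage sketch of the deleted `launch_python`; the Beam file, bucket, and args are illustrative.
from kfp_component.google.dataflow import launch_python

job = launch_python(
    python_file_path='gs://my-bucket/beam/wordcount.py',
    project_id='my-project',
    region='us-central1',
    staging_dir='gs://my-bucket/staging',
    args=['--output', 'gs://my-bucket/wordcount/out'],
)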

View File

@ -1,93 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import re
import time
from google.cloud import storage
from kfp_component.core import KfpExecutionContext
from ._client import DataflowClient
from ._common_ops import (wait_and_dump_job, get_staging_location,
read_job_id_and_location, upload_job_id_and_location)
def launch_template(project_id, gcs_path, launch_parameters,
location=None, validate_only=None, staging_dir=None,
wait_interval=30,
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
):
"""Launchs a dataflow job from template.
Args:
project_id (str): Required. The ID of the Cloud Platform project
that the job belongs to.
gcs_path (str): Required. A Cloud Storage path to the template
from which to create the job. Must be valid Cloud
Storage URL, beginning with 'gs://'.
launch_parameters (dict): Parameters to provide to the template
being launched. Schema defined in
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters.
`jobName` will be replaced by a generated name.
location (str): The regional endpoint to which to direct the
request.
validate_only (boolean): If true, the request is validated but
not actually executed. Defaults to false.
staging_dir (str): Optional. The GCS directory for keeping staging files.
A random subdirectory will be created under the directory to keep job info
for resuming the job in case of failure.
wait_interval (int): The wait seconds between polling.
Returns:
The completed job.
"""
storage_client = storage.Client()
df_client = DataflowClient()
job_id = None
def cancel():
if job_id:
df_client.cancel_job(
project_id,
job_id,
location
)
with KfpExecutionContext(on_cancel=cancel) as ctx:
staging_location = get_staging_location(staging_dir, ctx.context_id())
job_id, _ = read_job_id_and_location(storage_client, staging_location)
# Continue waiting for the job if its ID has already been uploaded to the staging location.
if job_id:
job = df_client.get_job(project_id, job_id, location)
return wait_and_dump_job(df_client, project_id, location, job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)
if not launch_parameters:
launch_parameters = {}
launch_parameters['jobName'] = 'job-' + ctx.context_id()
response = df_client.launch_template(project_id, gcs_path,
location, validate_only, launch_parameters)
job = response.get('job', None)
if not job:
# Validate only mode
return job
job_id = job.get('id')
upload_job_id_and_location(storage_client, staging_location, job_id, location)
return wait_and_dump_job(df_client, project_id, location, job,
wait_interval,
job_id_output_path=job_id_output_path,
job_object_output_path=job_object_output_path,
)
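A minimal usage sketch of the deleted `launch_template`; the template path and parameters are illustrative (Word_Count is one of the Google-provided sample templates).
from kfp_component.google.dataflow import launch_template

job = launch_template(
    project_id='my-project',
    gcs_path='gs://dataflow-templates/latest/Word_Count',
    launch_parameters={'parameters': {
        'inputFile': 'gs://my-bucket/input.txt',
        'output': 'gs://my-bucket/wordcount/out',
    }},
    location='us-central1',
)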

View File

@ -1,40 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import logging
class Process:
def __init__(self, cmd):
self._cmd = cmd
self.process = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
close_fds=True,
shell=False)
def read_lines(self):
# stdout will end with empty bytes when process exits.
for line in iter(self.process.stdout.readline, b''):
logging.info('subprocess: {}'.format(line))
yield line
def wait_and_check(self):
for _ in self.read_lines():
pass
self.process.stdout.close()
return_code = self.process.wait()
logging.info('Subprocess exit with code {}.'.format(
return_code))
if return_code:
raise subprocess.CalledProcessError(return_code, self._cmd)
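A minimal sketch of the deleted Process wrapper; the command is illustrative and the private-module import assumes the layout above.
from kfp_component.google.dataflow._process import Process

proc = Process(['echo', 'hello world'])
for line in proc.read_lines():   # lines are bytes
    if b'hello' in line:
        break
proc.wait_and_check()            # raises CalledProcessError on a non-zero exit code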

View File

@ -1,23 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._create_cluster import create_cluster
from ._delete_cluster import delete_cluster
from ._submit_job import submit_job
from ._submit_pyspark_job import submit_pyspark_job
from ._submit_spark_job import submit_spark_job
from ._submit_sparksql_job import submit_sparksql_job
from ._submit_hadoop_job import submit_hadoop_job
from ._submit_hive_job import submit_hive_job
from ._submit_pig_job import submit_pig_job

View File

@ -1,120 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
import googleapiclient.discovery as discovery
from ..common import wait_operation_done, ClientWithRetries
class DataprocClient(ClientWithRetries):
""" Internal client for calling Dataproc APIs.
"""
def _build_client(self):
self._dataproc = discovery.build('dataproc', 'v1', cache_discovery=False)
def create_cluster(self, project_id, region, cluster, request_id):
"""Creates a new dataproc cluster.
"""
return self._dataproc.projects().regions().clusters().create(
projectId = project_id,
region = region,
requestId = request_id,
body = cluster
).execute()
def get_cluster(self, project_id, region, name):
"""Gets the resource representation for a cluster in a project.
"""
return self._dataproc.projects().regions().clusters().get(
projectId = project_id,
region = region,
clusterName = name
).execute()
def delete_cluster(self, project_id, region, name, request_id):
"""Deletes a cluster in a project.
"""
return self._dataproc.projects().regions().clusters().delete(
projectId = project_id,
region = region,
clusterName = name,
requestId = request_id
).execute()
def submit_job(self, project_id, region, job, request_id):
"""Submits a job to a cluster.
"""
return self._dataproc.projects().regions().jobs().submit(
projectId = project_id,
region = region,
body = {
'job': job,
'requestId': request_id
}
).execute()
def get_job(self, project_id, region, job_id):
"""Gets a job details
"""
return self._dataproc.projects().regions().jobs().get(
projectId = project_id,
region = region,
jobId = job_id
).execute()
def cancel_job(self, project_id, region, job_id):
"""Cancels a job
"""
return self._dataproc.projects().regions().jobs().cancel(
projectId = project_id,
region = region,
jobId = job_id
).execute()
def get_operation(self, operation_name):
"""Gets a operation by name.
"""
return self._dataproc.projects().regions().operations().get(
name = operation_name
).execute()
def wait_for_operation_done(self, operation_name, wait_interval):
"""Waits for an operation to be done.
Args:
operation_name: the name of the operation.
wait_interval: the wait interval in seconds between polls of the
operation status.
Returns:
The completed operation.
"""
return wait_operation_done(
lambda: self.get_operation(operation_name), wait_interval)
def cancel_operation(self, operation_name):
"""Cancels an operation.
Args:
operation_name: the name of the operation.
"""
if not operation_name:
return
self._dataproc.projects().regions().operations().cancel(
name = operation_name
).execute()

View File

@ -1,104 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from fire import decorators
from ._client import DataprocClient
from kfp_component.core import KfpExecutionContext, display
from .. import common as gcp_common
@decorators.SetParseFns(image_version=str)
def create_cluster(project_id, region, name=None, name_prefix=None,
initialization_actions=None, config_bucket=None, image_version=None,
cluster=None, wait_interval=30,
cluster_name_output_path='/tmp/kfp/output/dataproc/cluster_name.txt',
cluster_object_output_path='/tmp/kfp/output/dataproc/cluster.json',
):
"""Creates a DataProc cluster under a project.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
name (str): Optional. The cluster name. Cluster names within a project
must be unique. Names of deleted clusters can be reused.
name_prefix (str): Optional. The prefix of the cluster name.
initialization_actions (list): Optional. List of GCS URIs of executables
to execute on each node after config is completed. By default,
executables are run on master and all worker nodes.
config_bucket (str): Optional. A Google Cloud Storage bucket used to
stage job dependencies, config files, and job driver console output.
image_version (str): Optional. The version of software inside the cluster.
cluster (dict): Optional. The full cluster config. See [full details](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster)
wait_interval (int): The wait seconds between polling the operation.
Defaults to 30s.
Returns:
The created cluster object.
Output Files:
$KFP_OUTPUT_PATH/dataproc/cluster_name.txt: The cluster name of the
created cluster.
"""
if not cluster:
cluster = {}
cluster['projectId'] = project_id
if 'config' not in cluster:
cluster['config'] = {}
if name:
cluster['clusterName'] = name
if initialization_actions:
cluster['config']['initializationActions'] = list(
map(lambda file: {
'executableFile': file
}, initialization_actions)
)
if config_bucket:
cluster['config']['configBucket'] = config_bucket
if image_version:
if 'softwareConfig' not in cluster['config']:
cluster['config']['softwareConfig'] = {}
cluster['config']['softwareConfig']['imageVersion'] = image_version
client = DataprocClient()
operation_name = None
with KfpExecutionContext(
on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
_set_cluster_name(cluster, ctx.context_id(), name_prefix)
_dump_metadata(cluster, region)
operation = client.create_cluster(project_id, region, cluster,
request_id=ctx.context_id())
operation_name = operation.get('name')
operation = client.wait_for_operation_done(operation_name,
wait_interval)
cluster = operation.get('response')
gcp_common.dump_file(cluster_object_output_path, json.dumps(cluster))
gcp_common.dump_file(cluster_name_output_path, cluster.get('clusterName'))
return cluster
def _set_cluster_name(cluster, context_id, name_prefix):
if 'clusterName' in cluster:
return
if not name_prefix:
name_prefix = 'cluster'
cluster['clusterName'] = name_prefix + '-' + context_id
def _dump_metadata(cluster, region):
display.display(display.Link(
'https://console.cloud.google.com/dataproc/clusters/{}?project={}&region={}'.format(
cluster.get('clusterName'), cluster.get('projectId'), region),
'Cluster Details'
))
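A minimal usage sketch of the deleted Dataproc `create_cluster`; the project, region, and image version are illustrative.
from kfp_component.google.dataproc import create_cluster

cluster = create_cluster(
    project_id='my-project',
    region='us-central1',
    name_prefix='kfp-demo',
    image_version='1.5',
)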

View File

@ -1,47 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from googleapiclient import errors
from ._client import DataprocClient
from kfp_component.core import KfpExecutionContext
def delete_cluster(project_id, region, name, wait_interval=30):
"""Deletes a DataProc cluster.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
name (str): Required. The cluster name to delete.
wait_interval (int): The wait seconds between polling the operation.
Defaults to 30s.
"""
client = DataprocClient()
operation_name = None
with KfpExecutionContext(
on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
try:
operation = client.delete_cluster(project_id, region, name,
request_id=ctx.context_id())
except errors.HttpError as e:
if e.resp.status == 404:
logging.info('Cluster {} is not found.'.format(name))
return
raise e
operation_name = operation.get('name')
return client.wait_for_operation_done(operation_name,
wait_interval)
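A minimal usage sketch of the deleted `delete_cluster`; the project and cluster name are illustrative. Deleting a cluster that no longer exists is a no-op, since 404 responses are swallowed.
from kfp_component.google.dataproc import delete_cluster

delete_cluster(
    project_id='my-project',
    region='us-central1',
    name='kfp-demo-cluster',
)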

View File

@ -1,62 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._submit_job import submit_job
def submit_hadoop_job(project_id, region, cluster_name, job_id_output_path,
main_jar_file_uri=None, main_class=None, args=[], hadoop_job={}, job={},
wait_interval=30):
"""Submits a Cloud Dataproc job for running Apache Hadoop MapReduce jobs
on Apache Hadoop YARN.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
main_jar_file_uri (str): The HCFS URI of the jar file containing the main
class. Examples:
`gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar`
`hdfs:/tmp/test-samples/custom-wordcount.jar`
`file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`
main_class (str): The name of the driver's main class. The jar file
containing the class must be in the default CLASSPATH or specified
in `jarFileUris`.
args (list): Optional. The arguments to pass to the driver. Do not include
arguments, such as -libjars or -Dfoo=bar, that can be set as job properties,
since a collision may occur that causes an incorrect job submission.
hadoop_job (dict): Optional. The full payload of a [hadoop job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob).
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The wait seconds between polling the operation.
Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
Returns:
The created job payload.
"""
if not hadoop_job:
hadoop_job = {}
if not job:
job = {}
if main_jar_file_uri:
hadoop_job['mainJarFileUri'] = main_jar_file_uri
if main_class:
hadoop_job['mainClass'] = main_class
if args:
hadoop_job['args'] = args
job['hadoopJob'] = hadoop_job
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)

View File

@ -1,56 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._submit_job import submit_job
def submit_hive_job(project_id, region, cluster_name, job_id_output_path,
queries=[], query_file_uri=None, script_variables={}, hive_job={},
job={}, wait_interval=30):
"""Submits a Cloud Dataproc job for running Apache Hive queries on YARN.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
queries (list): Required. The queries to execute. You do not need to
terminate a query with a semicolon. Multiple queries can be specified
in one string by separating each with a semicolon.
query_file_uri (str): The HCFS URI of the script that contains Hive queries.
script_variables (dict): Optional. Mapping of query variable names to
values (equivalent to the Hive command: SET name="value";).
hive_job (dict): Optional. The full payload of a [Hive job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The wait seconds between polling the operation.
Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
Returns:
The created job payload.
"""
if not hive_job:
hive_job = {}
if not job:
job = {}
if queries:
hive_job['queryList'] = { 'queries': queries }
if query_file_uri:
hive_job['queryFileUri'] = query_file_uri
if script_variables:
hive_job['scriptVariables'] = script_variables
job['hiveJob'] = hive_job
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)

View File

@ -1,81 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import time
from ._client import DataprocClient
from kfp_component.core import KfpExecutionContext, display
from .. import common as gcp_common
def submit_job(project_id, region, cluster_name, job, wait_interval=30,
job_id_output_path='/tmp/kfp/output/dataproc/job_id.txt',
job_object_output_path='/tmp/kfp/output/dataproc/job.json',
):
"""Submits a Cloud Dataproc job.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The wait seconds between polling the operation.
Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
job_object_output_path (str): Path for the created job object
Returns:
The created job payload.
"""
if 'reference' not in job:
job['reference'] = {}
job['reference']['projectId'] = project_id
if 'placement' not in job:
job['placement'] = {}
job['placement']['clusterName'] = cluster_name
client = DataprocClient()
job_id = None
with KfpExecutionContext(
on_cancel=lambda: client.cancel_job(
project_id, region, job_id)) as ctx:
submitted_job = client.submit_job(project_id, region, job,
request_id=ctx.context_id())
job_id = submitted_job['reference']['jobId']
_dump_metadata(submitted_job, region)
submitted_job = _wait_for_job_done(client, project_id, region,
job_id, wait_interval)
gcp_common.dump_file(job_object_output_path, json.dumps(submitted_job))
gcp_common.dump_file(job_id_output_path, submitted_job.get('reference').get('jobId'))
return submitted_job
def _wait_for_job_done(client, project_id, region, job_id, wait_interval):
while True:
job = client.get_job(project_id, region, job_id)
state = job['status']['state']
if state == 'DONE':
return job
if state == 'ERROR':
raise RuntimeError(job['status']['details'])
time.sleep(wait_interval)
def _dump_metadata(job, region):
display.display(display.Link(
'https://console.cloud.google.com/dataproc/jobs/{}?project={}&region={}'.format(
job.get('reference').get('jobId'),
job.get('reference').get('projectId'),
region),
'Job Details'
))
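A minimal usage sketch of the deleted generic `submit_job`; the cluster name and Spark job payload are illustrative.
from kfp_component.google.dataproc import submit_job

job = submit_job(
    project_id='my-project',
    region='us-central1',
    cluster_name='kfp-demo-cluster',
    job={'sparkJob': {
        'mainClass': 'org.apache.spark.examples.SparkPi',
        'jarFileUris': ['file:///usr/lib/spark/examples/jars/spark-examples.jar'],
    }},
)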

View File

@ -1,56 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._submit_job import submit_job
def submit_pig_job(project_id, region, cluster_name, job_id_output_path,
queries=[], query_file_uri=None, script_variables={}, pig_job={},
job={}, wait_interval=30):
"""Submits a Cloud Dataproc job for running Apache Pig queries on YARN.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
queries (list): Required. The queries to execute. You do not need to
terminate a query with a semicolon. Multiple queries can be specified
in one string by separating each with a semicolon.
query_file_uri (str): The HCFS URI of the script that contains Pig queries.
script_variables (dict): Optional. Mapping of query variable names to values
(equivalent to the Pig command: name=[value]).
pig_job (dict): Optional. The full payload of a [Pig job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob)
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The number of seconds to wait between polls of the
operation. Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
Returns:
The created job payload.
"""
if not pig_job:
pig_job = {}
if not job:
job = {}
if queries:
pig_job['queryList'] = { 'queries': queries }
if query_file_uri:
pig_job['queryFileUri'] = query_file_uri
if script_variables:
pig_job['scriptVariables'] = script_variables
job['pigJob'] = pig_job
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)

View File

@ -1,53 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._submit_job import submit_job
def submit_pyspark_job(project_id, region, cluster_name, job_id_output_path,
main_python_file_uri=None, args=[], pyspark_job={}, job={},
wait_interval=30):
"""Submits a Cloud Dataproc job for running Apache PySpark applications on YARN.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
main_python_file_uri (str): Required. The HCFS URI of the main Python file to
use as the driver. Must be a .py file.
args (list): Optional. The arguments to pass to the driver. Do not include
arguments, such as --conf, that can be set as job properties, since a
collision may occur that causes an incorrect job submission.
pyspark_job (dict): Optional. The full payload of a [PySparkJob](
https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob).
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The number of seconds to wait between polls of the
operation. Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
Returns:
The created job payload.
"""
if not pyspark_job:
pyspark_job = {}
if not job:
job = {}
if main_python_file_uri:
pyspark_job['mainPythonFileUri'] = main_python_file_uri
if args:
pyspark_job['args'] = args
job['pysparkJob'] = pyspark_job
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)

View File

@ -1,57 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._submit_job import submit_job
def submit_spark_job(project_id, region, cluster_name, job_id_output_path,
main_jar_file_uri=None, main_class=None, args=[], spark_job={}, job={},
wait_interval=30):
"""Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
main_jar_file_uri (str): The HCFS URI of the jar file that contains the main class.
main_class (str): The name of the driver's main class. The jar file that
contains the class must be in the default CLASSPATH or specified in
jarFileUris.
args (list): Optional. The arguments to pass to the driver. Do not include
arguments, such as --conf, that can be set as job properties, since a
collision may occur that causes an incorrect job submission.
spark_job (dict): Optional. The full payload of a [SparkJob](
https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The number of seconds to wait between polls of the
operation. Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
Returns:
The created job payload.
"""
if not spark_job:
spark_job = {}
if not job:
job = {}
if main_jar_file_uri:
spark_job['mainJarFileUri'] = main_jar_file_uri
if main_class:
spark_job['mainClass'] = main_class
if args:
spark_job['args'] = args
job['sparkJob'] = spark_job
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
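A hedged sketch of submitting a Spark jar through the wrapper above; the jar URI, class and cluster are placeholders and the import path is assumed.

```python
from kfp_component.google.dataproc import submit_spark_job  # assumed export path

submit_spark_job(
    project_id='my-project',
    region='us-central1',
    cluster_name='my-cluster',
    job_id_output_path='/tmp/kfp/output/dataproc/job_id.txt',
    main_jar_file_uri='gs://my-bucket/jars/spark-examples.jar',  # placeholder jar
    main_class='org.apache.spark.examples.SparkPi',
    args=['1000'],
)
```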

View File

@ -1,56 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._submit_job import submit_job
def submit_sparksql_job(project_id, region, cluster_name, job_id_output_path,
queries=[], query_file_uri=None, script_variables={}, sparksql_job={},
job={}, wait_interval=30):
"""Submits a Cloud Dataproc job for running Apache Spark SQL queries.
Args:
project_id (str): Required. The ID of the Google Cloud Platform project
that the cluster belongs to.
region (str): Required. The Cloud Dataproc region in which to handle the
request.
cluster_name (str): Required. The cluster to run the job.
queries (list): Required. The queries to execute. You do not need to
terminate a query with a semicolon. Multiple queries can be specified
in one string by separating each with a semicolon.
query_file_uri (str): The HCFS URI of the script that contains SQL queries.
script_variables (dict): Optional. Mapping of query variable names to values
(equivalent to the Spark SQL command: SET name="value";).
sparksql_job (dict): Optional. The full payload of a [Spark SQL job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob)
job (dict): Optional. The full payload of a [Dataproc job](
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
wait_interval (int): The number of seconds to wait between polls of the
operation. Defaults to 30s.
job_id_output_path (str): Path for the ID of the created job
Returns:
The created job payload.
"""
if not sparksql_job:
sparksql_job = {}
if not job:
job = {}
if queries:
sparksql_job['queryList'] = { 'queries': queries }
if query_file_uri:
sparksql_job['queryFileUri'] = query_file_uri
if script_variables:
sparksql_job['scriptVariables'] = script_variables
job['sparkSqlJob'] = sparksql_job
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)

View File

@ -1,33 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module that contains a set of commands to call ML Engine APIs
The commands are aware of KFP execution context and can work under
retry and cancellation context. The currently supported commands
are: train, batch_prediction, create_model, create_version and
delete_version.
TODO(hongyes): Provides full ML Engine API support.
"""
from ._create_job import create_job
from ._create_model import create_model
from ._create_version import create_version
from ._delete_version import delete_version
from ._train import train
from ._batch_predict import batch_predict
from ._deploy import deploy
from ._set_default_version import set_default_version
from ._wait_job import wait_job

View File

@ -1,85 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from ._create_job import create_job
def batch_predict(project_id, model_path, input_paths, input_data_format,
output_path, region, job_id_output_path, output_data_format=None, prediction_input=None, job_id_prefix=None,
wait_interval=30):
"""Creates a MLEngine batch prediction job.
Args:
project_id (str): Required. The ID of the parent project of the job.
model_path (str): Required. The path to the model. It can be either:
`projects/[PROJECT_ID]/models/[MODEL_ID]` or
`projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]`
or a GCS path of a model file.
input_paths (list): Required. The Google Cloud Storage location of
the input data files. May contain wildcards.
input_data_format (str): Required. The format of the input data files.
See https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat.
output_path (str): Required. The output Google Cloud Storage location.
region (str): Required. The Google Compute Engine region to run the
prediction job in.
output_data_format (str): Optional. Format of the output data files,
defaults to JSON.
prediction_input (dict): Input parameters to create a prediction job.
job_id_prefix (str): the prefix of the generated job id.
job_id_output_path (str): Path for the ID of the created job.
wait_interval (int): optional wait interval between calls
to get job status. Defaults to 30.
"""
if not prediction_input:
prediction_input = {}
if not model_path:
raise ValueError('model_path must be provided.')
if _is_model_name(model_path):
prediction_input['modelName'] = model_path
elif _is_model_version_name(model_path):
prediction_input['versionName'] = model_path
elif _is_gcs_path(model_path):
prediction_input['uri'] = model_path
else:
raise ValueError('model_path value is invalid.')
if input_paths:
prediction_input['inputPaths'] = input_paths
if input_data_format:
prediction_input['dataFormat'] = input_data_format
if output_path:
prediction_input['outputPath'] = output_path
if output_data_format:
prediction_input['outputDataFormat'] = output_data_format
if region:
prediction_input['region'] = region
job = {
'predictionInput': prediction_input
}
create_job(
project_id=project_id,
job=job,
job_id_prefix=job_id_prefix,
wait_interval=wait_interval,
job_id_output_path=job_id_output_path,
)
def _is_model_name(name):
return re.match(r'^projects/[^/]+/models/[^/]+$', name)
def _is_model_version_name(name):
return re.match(r'^projects/[^/]+/models/[^/]+/versions/[^/]+$', name)
def _is_gcs_path(name):
return name.startswith('gs://')
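A hedged sketch of `batch_predict`, illustrating one of the three accepted `model_path` forms (model name, version name, or GCS URI); all resources are placeholders.

```python
from kfp_component.google.ml_engine import batch_predict

batch_predict(
    project_id='my-project',
    model_path='projects/my-project/models/my_model/versions/v1',  # or a gs:// model URI
    input_paths=['gs://my-bucket/batch/input-*.json'],
    input_data_format='JSON',
    output_path='gs://my-bucket/batch/output/',
    region='us-central1',
    job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
)
```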

View File

@ -1,192 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import wraps
import logging
import time
import googleapiclient.discovery as discovery
from googleapiclient import errors
from ..common import wait_operation_done, ClientWithRetries
class MLEngineClient(ClientWithRetries):
""" Client for calling MLEngine APIs.
"""
def _build_client(self):
self._ml_client = discovery.build('ml', 'v1', cache_discovery=False)
def create_job(self, project_id, job):
"""Create a new job.
Args:
project_id: the ID of the parent project.
job: the payload of the job.
Returns:
The created job.
"""
return self._ml_client.projects().jobs().create(
parent = 'projects/{}'.format(project_id),
body = job
).execute()
def cancel_job(self, project_id, job_id):
"""Cancel the specified job.
Args:
project_id: the parent project ID of the job.
job_id: the ID of the job.
"""
job_name = 'projects/{}/jobs/{}'.format(project_id, job_id)
self._ml_client.projects().jobs().cancel(
name = job_name,
body = {
'name': job_name
},
).execute()
def get_job(self, project_id, job_id):
"""Gets the job by ID.
Args:
project_id: the ID of the parent project.
job_id: the ID of the job to retrieve.
Returns:
The retrieved job payload.
"""
job_name = 'projects/{}/jobs/{}'.format(project_id, job_id)
return self._ml_client.projects().jobs().get(
name=job_name).execute()
def create_model(self, project_id, model):
"""Creates a new model.
Args:
project_id: the ID of the parent project.
model: the payload of the model.
Returns:
The created model.
"""
return self._ml_client.projects().models().create(
parent = 'projects/{}'.format(project_id),
body = model
).execute()
def get_model(self, model_name):
"""Gets a model.
Args:
model_name: the name of the model.
Returns:
The retrieved model.
"""
return self._ml_client.projects().models().get(
name = model_name
).execute()
def create_version(self, model_name, version):
"""Creates a new version.
Args:
model_name: the name of the parent model.
version: the payload of the version.
Returns:
The created version.
"""
return self._ml_client.projects().models().versions().create(
parent = model_name,
body = version
).execute()
def get_version(self, version_name):
"""Gets a version.
Args:
version_name: the name of the version.
Returns:
The retrieved version. None if the version is not found.
"""
try:
return self._ml_client.projects().models().versions().get(
name = version_name
).execute()
except errors.HttpError as e:
if e.resp.status == 404:
return None
raise
def delete_version(self, version_name):
"""Deletes a version.
Args:
version_name: the name of the version.
Returns:
The delete operation. None if the version is not found.
"""
try:
return self._ml_client.projects().models().versions().delete(
name = version_name
).execute()
except errors.HttpError as e:
if e.resp.status == 404:
logging.info('The version has already been deleted.')
return None
raise
def set_default_version(self, version_name):
return self._ml_client.projects().models().versions().setDefault(
name = version_name
).execute()
def get_operation(self, operation_name):
"""Gets an operation.
Args:
operation_name: the name of the operation.
Returns:
The retrieved operation.
"""
return self._ml_client.projects().operations().get(
name = operation_name
).execute()
def wait_for_operation_done(self, operation_name, wait_interval):
"""Waits for an operation to be done.
Args:
operation_name: the name of the operation.
wait_interval: the wait interval between polls of the job
status.
Returns:
The completed operation.
"""
return wait_operation_done(
lambda: self.get_operation(operation_name), wait_interval)
def cancel_operation(self, operation_name):
"""Cancels an operation.
Args:
operation_name: the name of the operation.
"""
self._ml_client.projects().operations().cancel(
name = operation_name
).execute()

View File

@ -1,161 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
import json
from googleapiclient import errors
from kfp_component.core import display
from ._client import MLEngineClient
from .. import common as gcp_common
def wait_existing_version(ml_client, version_name, wait_interval):
while True:
existing_version = ml_client.get_version(version_name)
if not existing_version:
return None
state = existing_version.get('state', None)
if state not in ['CREATING', 'DELETING', 'UPDATING']:
return existing_version
logging.info('Version is in {} state. Wait for {}s'.format(
state, wait_interval
))
time.sleep(wait_interval)
def wait_for_operation_done(ml_client, operation_name, action, wait_interval):
"""Waits for an operation to be done.
Args:
operation_name: the name of the operation.
action: the action name of the operation.
wait_interval: the wait interval between polls of the job
status.
Returns:
The completed operation.
Raises:
RuntimeError if the operation has error.
"""
operation = None
while True:
operation = ml_client.get_operation(operation_name)
done = operation.get('done', False)
if done:
break
logging.info('Operation {} is not done. Wait for {}s.'.format(operation_name, wait_interval))
time.sleep(wait_interval)
error = operation.get('error', None)
if error:
raise RuntimeError('Failed to complete {} operation {}: {} {}'.format(
action,
operation_name,
error.get('code', 'Unknown code'),
error.get('message', 'Unknown message'),
))
return operation
def wait_for_job_done(ml_client, project_id, job_id, wait_interval, show_tensorboard=True,
job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
):
"""Waits for a CMLE job done.
Args:
ml_client: CMLE google api client
project_id: the ID of the project which has the job
job_id: the ID of the job to wait for
wait_interval: the interval in seconds to wait between polls.
show_tensorboard: True to dump Tensorboard metadata.
Returns:
The completed job.
Raises:
RuntimeError if the job finishes with failed or cancelled state.
"""
metadata_dumped = False
while True:
job = ml_client.get_job(project_id, job_id)
print(job)
if not metadata_dumped:
_dump_job_metadata(project_id, job_id, job, show_tensorboard=show_tensorboard)
metadata_dumped = True
if job.get('state', None) in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
break
# Move to config from flag
logging.info('job status is {}, wait for {}s'.format(
job.get('state', None), wait_interval))
time.sleep(wait_interval)
_dump_job(
job=job,
job_object_output_path=job_object_output_path,
job_id_output_path=job_id_output_path,
job_dir_output_path=job_dir_output_path,
)
if job['state'] != 'SUCCEEDED':
raise RuntimeError('Job failed with state {}. Error: {}'.format(
job['state'], job.get('errorMessage', '')))
return job
def _dump_job_metadata(project_id, job_id, job, show_tensorboard=True):
display.display(display.Link(
'https://console.cloud.google.com/mlengine/jobs/{}?project={}'.format(
job_id, project_id),
'Job Details'
))
display.display(display.Link(
'https://console.cloud.google.com/logs/viewer?project={}&resource=ml_job/job_id/{}&interval=NO_LIMIT'.format(
project_id, job_id),
'Logs'
))
if show_tensorboard and 'trainingInput' in job and 'jobDir' in job['trainingInput']:
display.display(display.Tensorboard(
job['trainingInput']['jobDir']))
def _dump_job(
job,
job_object_output_path,
job_id_output_path,
job_dir_output_path,
):
logging.info('Dumping job: {}'.format(job))
gcp_common.dump_file(job_object_output_path, json.dumps(job))
gcp_common.dump_file(job_id_output_path, job['jobId'])
job_dir = ''
if 'trainingInput' in job and 'jobDir' in job['trainingInput']:
job_dir = job['trainingInput']['jobDir']
gcp_common.dump_file(job_dir_output_path, job_dir)
def cancel_job(ml_client, project_id, job_id):
"""Cancels a CMLE job.
Args:
ml_client: CMLE google api client
project_id: the ID of the project which has the job
job_id: the ID of the job to cancel
"""
try:
logging.info('Cancelling job {}.'.format(job_id))
ml_client.cancel_job(project_id, job_id)
logging.info('Cancelled job {}.'.format(job_id))
except errors.HttpError as e:
# Best effort to cancel the job
logging.error('Failed to cancel the job: {}'.format(e))
pass

View File

@ -1,122 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import re
import time
from googleapiclient import errors
from ._common_ops import wait_for_job_done, cancel_job
from kfp_component.core import KfpExecutionContext
from ._client import MLEngineClient
from .. import common as gcp_common
def create_job(
project_id,
job,
job_id_prefix=None,
job_id=None,
wait_interval=30,
job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
):
"""Creates a MLEngine job.
Args:
project_id: the ID of the parent project of the job.
job: the payload of the job. Must have ``jobId``
and ``trainingInput`` or ``predictionInput`.
job_id_prefix: the prefix of the generated job id.
job_id: the created job_id, takes precedence over generated job
id if set.
wait_interval: optional wait interval between calls
to get job status. Defaults to 30.
job_object_output_path: Path for the json payload of the create job.
job_id_output_path: Path for the ID of the created job.
job_dir_output_path: Path for the `jobDir` of the training job.
"""
return CreateJobOp(
project_id=project_id,
job=job,
job_id_prefix=job_id_prefix,
job_id=job_id,
wait_interval=wait_interval,
job_object_output_path=job_object_output_path,
job_id_output_path=job_id_output_path,
job_dir_output_path=job_dir_output_path,
).execute_and_wait()
class CreateJobOp:
def __init__(self, project_id, job, job_id_prefix=None, job_id=None,
wait_interval=30,
job_object_output_path=None,
job_id_output_path=None,
job_dir_output_path=None,
):
self._ml = MLEngineClient()
self._project_id = project_id
self._job_id_prefix = job_id_prefix
self._job_id = job_id
self._job = job
self._wait_interval = wait_interval
self._job_object_output_path = job_object_output_path
self._job_id_output_path = job_id_output_path
self._job_dir_output_path = job_dir_output_path
def execute_and_wait(self):
with KfpExecutionContext(on_cancel=lambda: cancel_job(self._ml, self._project_id, self._job_id)) as ctx:
self._set_job_id(ctx.context_id())
self._create_job()
return wait_for_job_done(self._ml, self._project_id, self._job_id, self._wait_interval,
job_object_output_path=self._job_object_output_path,
job_id_output_path=self._job_id_output_path,
job_dir_output_path=self._job_dir_output_path,
)
def _set_job_id(self, context_id):
if self._job_id:
job_id = self._job_id
elif self._job_id_prefix:
job_id = self._job_id_prefix + context_id[:16]
else:
job_id = 'job_' + context_id
job_id = gcp_common.normalize_name(job_id)
self._job_id = job_id
self._job['jobId'] = job_id
def _create_job(self):
try:
self._ml.create_job(
project_id = self._project_id,
job = self._job
)
except errors.HttpError as e:
if e.resp.status == 409:
if not self._is_dup_job():
logging.error('Another job has been created with the same name before: {}'.format(self._job_id))
raise
logging.info('The job {} has been submitted before. Continue waiting.'.format(self._job_id))
else:
logging.error('Failed to create job.\nPayload: {}\nError: {}'.format(self._job, e))
raise
def _is_dup_job(self):
existing_job = self._ml.get_job(self._project_id, self._job_id)
return existing_job.get('trainingInput', None) == self._job.get('trainingInput', None) \
and existing_job.get('predictionInput', None) == self._job.get('predictionInput', None)
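A hedged sketch of a raw `create_job` call with a minimal `trainingInput` payload; the bucket, trainer module and runtime version are placeholders.

```python
from kfp_component.google.ml_engine import create_job

create_job(
    project_id='my-project',
    job={
        'trainingInput': {
            'pythonModule': 'trainer.task',
            'packageUris': ['gs://my-bucket/packages/trainer-0.1.tar.gz'],
            'region': 'us-central1',
            'runtimeVersion': '1.15',
        }
    },
    job_id_prefix='train_',  # a context-ID suffix is appended by _set_job_id
)
```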

View File

@ -1,105 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
from googleapiclient import errors
from kfp_component.core import KfpExecutionContext, display
from ._client import MLEngineClient
from .. import common as gcp_common
def create_model(project_id, model_id=None, model=None,
model_name_output_path='/tmp/kfp/output/ml_engine/model_name.txt',
model_object_output_path='/tmp/kfp/output/ml_engine/model.json',
):
"""Creates a MLEngine model.
Args:
project_id (str): the ID of the parent project of the model.
model_id (str): optional, the name of the model. If absent, a new name will
be generated.
model (dict): the payload of the model.
"""
return CreateModelOp(project_id, model_id, model,
model_name_output_path=model_name_output_path,
model_object_output_path=model_object_output_path,
).execute()
class CreateModelOp:
def __init__(self, project_id, model_id, model,
model_name_output_path,
model_object_output_path,
):
self._ml = MLEngineClient()
self._project_id = project_id
self._model_id = model_id
self._model_name = None
if model:
self._model = model
else:
self._model = {}
self._model_name_output_path = model_name_output_path
self._model_object_output_path = model_object_output_path
def execute(self):
with KfpExecutionContext() as ctx:
self._set_model_name(ctx.context_id())
self._dump_metadata()
try:
created_model = self._ml.create_model(
project_id = self._project_id,
model = self._model)
except errors.HttpError as e:
if e.resp.status == 409:
existing_model = self._ml.get_model(self._model_name)
if not self._is_dup_model(existing_model):
raise
logging.info('The same model {} has been submitted'
' before. Continue the operation.'.format(
self._model_name))
created_model = existing_model
else:
raise
self._dump_model(created_model)
return created_model
def _set_model_name(self, context_id):
if not self._model_id:
self._model_id = 'model_' + context_id
self._model['name'] = gcp_common.normalize_name(self._model_id)
self._model_name = 'projects/{}/models/{}'.format(
self._project_id, self._model_id)
def _is_dup_model(self, existing_model):
return not gcp_common.check_resource_changed(
self._model,
existing_model,
['description', 'regions',
'onlinePredictionLogging', 'labels'])
def _dump_metadata(self):
display.display(display.Link(
'https://console.cloud.google.com/mlengine/models/{}?project={}'.format(
self._model_id, self._project_id),
'Model Details'
))
def _dump_model(self, model):
logging.info('Dumping model: {}'.format(model))
gcp_common.dump_file(self._model_object_output_path, json.dumps(model))
gcp_common.dump_file(self._model_name_output_path, self._model_name)

View File

@ -1,213 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import time
import re
from googleapiclient import errors
from fire import decorators
from kfp_component.core import KfpExecutionContext, display
from ._client import MLEngineClient
from .. import common as gcp_common
from ._common_ops import wait_existing_version, wait_for_operation_done
@decorators.SetParseFns(python_version=str, runtime_version=str)
def create_version(model_name, deployment_uri=None, version_id=None,
runtime_version=None, python_version=None, version=None,
replace_existing=False, wait_interval=30,
version_name_output_path='/tmp/kfp/output/ml_engine/version_name.txt',
version_object_output_path='/tmp/kfp/output/ml_engine/version.json',
):
"""Creates a MLEngine version and wait for the operation to be done.
Args:
model_name (str): required, the name of the parent model.
deployment_uri (str): optional, the Google Cloud Storage location of
the trained model used to create the version.
version_id (str): optional, the user provided short name of
the version. If it is not provided, the operation uses a random name.
runtime_version (str): optional, the Cloud ML Engine runtime version
to use for this deployment. If not set, Cloud ML Engine uses
the default stable version, 1.0.
python_version (str): optional, the version of Python used in prediction.
If not set, the default version is '2.7'. Python '3.5' is available
when runtimeVersion is set to '1.4' and above. Python '2.7' works
with all supported runtime versions.
version (dict): optional, the payload of the new version.
replace_existing (boolean): boolean flag indicates whether to replace
existing version in case of conflict.
wait_interval (int): the interval to wait for a long running operation.
"""
if not version:
version = {}
if deployment_uri:
version['deploymentUri'] = deployment_uri
if version_id:
version['name'] = version_id
if runtime_version:
version['runtimeVersion'] = runtime_version
if python_version:
version['pythonVersion'] = python_version
return CreateVersionOp(model_name, version,
replace_existing, wait_interval,
version_name_output_path=version_name_output_path,
version_object_output_path=version_object_output_path,
).execute_and_wait()
class CreateVersionOp:
def __init__(self, model_name, version,
replace_existing, wait_interval,
version_name_output_path,
version_object_output_path,
):
self._ml = MLEngineClient()
self._model_name = model_name
self._project_id, self._model_id = self._parse_model_name(model_name)
# The name of the version resource, which is in the format
# of projects/*/models/*/versions/*
self._version_name = None
# The user-provided short name of the version.
self._version_id = None
# The full payload of the version resource.
self._version = version
self._replace_existing = replace_existing
self._wait_interval = wait_interval
self._create_operation_name = None
self._delete_operation_name = None
self._version_name_output_path = version_name_output_path
self._version_object_output_path = version_object_output_path
def execute_and_wait(self):
with KfpExecutionContext(on_cancel=self._cancel) as ctx:
self._set_version_name(ctx.context_id())
self._dump_metadata()
existing_version = wait_existing_version(self._ml,
self._version_name,
self._wait_interval)
if existing_version and self._is_dup_version(existing_version):
return self._handle_completed_version(existing_version)
if existing_version and self._replace_existing:
logging.info('Deleting existing version...')
self._delete_version_and_wait()
elif existing_version:
raise RuntimeError(
'Existing version conflicts with the name of the new version.')
created_version = self._create_version_and_wait()
return self._handle_completed_version(created_version)
def _parse_model_name(self, model_name):
match = re.search(r'^projects/([^/]+)/models/([^/]+)$', model_name)
if not match:
raise ValueError('model name "{}" is not in desired format.'.format(model_name))
return (match.group(1), match.group(2))
def _set_version_name(self, context_id):
name = self._version.get('name', None)
if not name:
name = 'ver_' + context_id
name = gcp_common.normalize_name(name)
self._version_id = name
self._version['name'] = name
self._version_name = '{}/versions/{}'.format(self._model_name, name)
def _cancel(self):
if self._delete_operation_name:
self._ml.cancel_operation(self._delete_operation_name)
if self._create_operation_name:
self._ml.cancel_operation(self._create_operation_name)
def _create_version_and_wait(self):
operation = self._ml.create_version(self._model_name, self._version)
# Cache operation name for cancellation.
self._create_operation_name = operation.get('name')
try:
operation = wait_for_operation_done(
self._ml,
self._create_operation_name,
'create version',
self._wait_interval)
finally:
self._create_operation_name = None
return operation.get('response', None)
def _delete_version_and_wait(self):
operation = self._ml.delete_version(self._version_name)
# Cache operation name for cancellation.
self._delete_operation_name = operation.get('name')
try:
wait_for_operation_done(
self._ml,
self._delete_operation_name,
'delete version',
self._wait_interval)
finally:
self._delete_operation_name = None
def _handle_completed_version(self, version):
state = version.get('state', None)
if state == 'FAILED':
error_message = version.get('errorMessage', 'Unknown failure')
raise RuntimeError('Version is in failed state: {}'.format(
error_message))
# Work around the issue that CMLE doesn't return the full version name.
version['name'] = self._version_name
self._dump_version(version)
return version
def _dump_metadata(self):
display.display(display.Link(
'https://console.cloud.google.com/mlengine/models/{}/versions/{}?project={}'.format(
self._model_id, self._version_id, self._project_id),
'Version Details'
))
display.display(display.Markdown('''
## Online Prediction
### REST endpoint
The REST endpoint for online prediction is as follows:
```
POST https://ml.googleapis.com/v1/{}:predict
```
Try the REST endpoint in [Google OAuth 2.0 Playground](https://developers.google.com/oauthplayground/#step3\
&apisSelect=https://www.googleapis.com/auth/cloud-platform&postData={{"instances":[]}}\
&url=https://ml.googleapis.com/v1/{}:predict&content_type=application/json&http_method=POST).
### GCloud command
```bash
gcloud ai-platform predict --model {} \
--version {} \
--json-instances instances.json
```
'''.format(self._version_name, self._version_name, self._model_id, self._version_id)))
def _dump_version(self, version):
logging.info('Dumping version: {}'.format(version))
gcp_common.dump_file(self._version_name_output_path, version['name'])
gcp_common.dump_file(self._version_object_output_path, json.dumps(version))
def _is_dup_version(self, existing_version):
return not gcp_common.check_resource_changed(
self._version,
existing_version,
['description', 'deploymentUri',
'runtimeVersion', 'machineType', 'labels',
'framework', 'pythonVersion', 'autoScaling',
'manualScaling'])
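A hedged sketch of `create_version` against an existing model; the model name, export directory and runtime/python versions are placeholders.

```python
from kfp_component.google.ml_engine import create_version

create_version(
    model_name='projects/my-project/models/my_model',
    deployment_uri='gs://my-bucket/export/1700000000/',  # placeholder SavedModel dir
    version_id='v1',
    runtime_version='1.15',
    python_version='3.7',
    replace_existing=True,
)
```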

View File

@ -1,66 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
from googleapiclient import errors
from kfp_component.core import KfpExecutionContext
from ._client import MLEngineClient
from .. import common as gcp_common
from ._common_ops import wait_existing_version, wait_for_operation_done
def delete_version(version_name, wait_interval=30):
"""Deletes a MLEngine version and wait.
Args:
version_name (str): required, the name of the version.
wait_interval (int): the interval to wait for a long running operation.
"""
DeleteVersionOp(version_name, wait_interval).execute_and_wait()
class DeleteVersionOp:
def __init__(self, version_name, wait_interval):
self._ml = MLEngineClient()
self._version_name = version_name
self._wait_interval = wait_interval
self._delete_operation_name = None
def execute_and_wait(self):
with KfpExecutionContext(on_cancel=self._cancel):
existing_version = wait_existing_version(self._ml,
self._version_name,
self._wait_interval)
if not existing_version:
logging.info('The version has already been deleted.')
return None
logging.info('Deleting existing version...')
operation = self._ml.delete_version(self._version_name)
# Cache operation name for cancellation.
self._delete_operation_name = operation.get('name')
try:
wait_for_operation_done(
self._ml,
self._delete_operation_name,
'delete version',
self._wait_interval)
finally:
self._delete_operation_name = None
return None
def _cancel(self):
if self._delete_operation_name:
self._ml.cancel_operation(self._delete_operation_name)

View File

@ -1,112 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from fire import decorators
from google.cloud import storage
from .. import common as gcp_common
from ..storage import parse_blob_path
from ._create_model import create_model
from ._create_version import create_version
from ._set_default_version import set_default_version
KNOWN_MODEL_NAMES = ['saved_model.pb', 'saved_model.pbtext', 'model.pkl']
@decorators.SetParseFns(python_version=str, runtime_version=str)
def deploy(model_uri, project_id,
model_uri_output_path, model_name_output_path, version_name_output_path,
model_id=None, version_id=None,
runtime_version=None, python_version=None, model=None, version=None,
replace_existing_version=False, set_default=False, wait_interval=30):
"""Deploy a model to MLEngine from GCS URI
Args:
model_uri (str): Required, the GCS URI which contains a model file.
If no model file is found, the same path will be treated as an export
base directory of a TF Estimator. The last time-stamped sub-directory
will be chosen as the model URI.
project_id (str): required, the ID of the parent project.
model_id (str): optional, the user provided name of the model.
version_id (str): optional, the user provided name of the version.
If it is not provided, the operation uses a random name.
runtime_version (str): optional, the Cloud ML Engine runtime version
to use for this deployment. If not set, Cloud ML Engine uses
the default stable version, 1.0.
python_version (str): optional, the version of Python used in prediction.
If not set, the default version is '2.7'. Python '3.5' is available
when runtimeVersion is set to '1.4' and above. Python '2.7' works
with all supported runtime versions.
model (dict): Optional, the JSON payload of the new model. The schema follows
[REST Model resource](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models).
version (dict): Optional, the JSON payload of the new version. The schema follows
the [REST Version resource](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions)
replace_existing_version (boolean): boolean flag indicates whether to replace
existing version in case of conflict.
set_default (boolean): boolean flag indicates whether to set the new
version as default version in the model.
wait_interval (int): the interval to wait for a long running operation.
"""
storage_client = storage.Client()
model_uri = _search_dir_with_model(storage_client, model_uri)
gcp_common.dump_file(model_uri_output_path, model_uri)
model = create_model(project_id, model_id, model,
model_name_output_path=model_name_output_path,
)
model_name = model.get('name')
version = create_version(model_name, model_uri, version_id,
runtime_version, python_version, version, replace_existing_version,
wait_interval, version_name_output_path=version_name_output_path,
)
if set_default:
version_name = version.get('name')
version = set_default_version(version_name)
return version
def _search_dir_with_model(storage_client, model_root_uri):
bucket_name, blob_name = parse_blob_path(model_root_uri)
bucket = storage_client.bucket(bucket_name)
if not blob_name.endswith('/'):
blob_name += '/'
it = bucket.list_blobs(prefix=blob_name, delimiter='/')
for resource in it:
basename = os.path.basename(resource.name)
if basename in KNOWN_MODEL_NAMES:
logging.info('Found model file under {}.'.format(model_root_uri))
return model_root_uri
model_dir = _search_tf_export_dir_base(storage_client, bucket, blob_name)
if not model_dir:
model_dir = model_root_uri
return model_dir
def _search_tf_export_dir_base(storage_client, bucket, export_dir_base):
logging.info('Searching model under export base dir: {}.'.format(export_dir_base))
it = bucket.list_blobs(prefix=export_dir_base, delimiter='/')
for _ in it.pages:
# Iterate to the last page to get the full prefixes.
pass
timestamped_dirs = []
for sub_dir in it.prefixes:
dir_name = os.path.basename(os.path.normpath(sub_dir))
if dir_name.isdigit():
timestamped_dirs.append(sub_dir)
if not timestamped_dirs:
logging.info('No timestamped sub-directory is found under {}'.format(export_dir_base))
return None
last_timestamped_dir = max(timestamped_dirs)
logging.info('Found timestamped sub-directory: {}.'.format(last_timestamped_dir))
return 'gs://{}/{}'.format(bucket.name, last_timestamped_dir)
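A hedged sketch of the `deploy` helper, which chains `create_model` and `create_version` (and optionally `set_default_version`); the GCS export path and output file paths are placeholders.

```python
from kfp_component.google.ml_engine import deploy

deploy(
    model_uri='gs://my-bucket/export/',  # export base dir; the newest timestamped sub-dir is picked
    project_id='my-project',
    model_uri_output_path='/tmp/kfp/output/ml_engine/model_uri.txt',
    model_name_output_path='/tmp/kfp/output/ml_engine/model_name.txt',
    version_name_output_path='/tmp/kfp/output/ml_engine/version_name.txt',
    runtime_version='1.15',
    set_default=True,
)
```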

View File

@ -1,20 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._client import MLEngineClient
def set_default_version(version_name):
"""Set specified version as default version.
"""
return MLEngineClient().set_default_version(version_name)

View File

@ -1,105 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from fire import decorators
from ._create_job import create_job
@decorators.SetParseFns(python_version=str, runtime_version=str)
def train(project_id,
job_id_output_path,
job_dir_output_path,
python_module=None,
package_uris=None,
region=None,
args=None,
job_dir=None,
python_version=None,
runtime_version=None,
master_image_uri=None,
worker_image_uri=None,
training_input=None,
job_id_prefix=None,
job_id=None,
wait_interval=30):
"""Creates a MLEngine training job.
Args:
project_id (str): Required. The ID of the parent project of the job.
job_id_output_path (str): Required. Path for the ID of the created job.
job_dir_output_path (str): Required. Path for the directory of the job.
python_module (str): Required. The Python module name to run after
installing the packages.
package_uris (list): Required. The Google Cloud Storage location of
the packages with the training program and any additional
dependencies. The maximum number of package URIs is 100.
region (str): Required. The Google Compute Engine region to run the
training job in.
args (list): Command line arguments to pass to the program.
job_dir (str): A Google Cloud Storage path in which to store training
outputs and other data needed for training. This path is passed to
your TensorFlow program as the '--job-dir' command-line argument.
The benefit of specifying this field is that Cloud ML validates the
path for use in training.
python_version (str): Optional. The version of Python used in
training. If not set, the default version is '2.7'. Python '3.5' is
available when runtimeVersion is set to '1.4' and above. Python
'2.7' works with all supported runtime versions.
runtime_version (str): Optional. The Cloud ML Engine runtime version
to use for training. If not set, Cloud ML Engine uses the default
stable version, 1.0.
master_image_uri (str): The Docker image to run on the master replica.
This image must be in Container Registry.
worker_image_uri (str): The Docker image to run on the worker replica.
This image must be in Container Registry.
training_input (dict): Input parameters to create a training job.
job_id_prefix (str): the prefix of the generated job id.
job_id (str): the created job_id, takes precedence over generated job
id if set.
wait_interval (int): optional wait interval between calls to get job
status. Defaults to 30.
"""
if not training_input:
training_input = {}
if python_module:
training_input['pythonModule'] = python_module
if package_uris:
training_input['packageUris'] = package_uris
if region:
training_input['region'] = region
if args:
training_input['args'] = args
if job_dir:
training_input['jobDir'] = job_dir
if python_version:
training_input['pythonVersion'] = python_version
if runtime_version:
training_input['runtimeVersion'] = runtime_version
if master_image_uri:
if 'masterConfig' not in training_input:
training_input['masterConfig'] = {}
training_input['masterConfig']['imageUri'] = master_image_uri
if worker_image_uri:
if 'workerConfig' not in training_input:
training_input['workerConfig'] = {}
training_input['workerConfig']['imageUri'] = worker_image_uri
job = {'trainingInput': training_input}
return create_job(
project_id=project_id,
job=job,
job_id_prefix=job_id_prefix,
job_id=job_id,
wait_interval=wait_interval,
job_id_output_path=job_id_output_path,
job_dir_output_path=job_dir_output_path)
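A hedged sketch of a custom-container training job via the `train` wrapper above; the image URI, job dir and arguments are placeholders.

```python
from kfp_component.google.ml_engine import train

train(
    project_id='my-project',
    job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
    job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
    region='us-central1',
    master_image_uri='gcr.io/my-project/trainer:latest',  # placeholder image
    args=['--epochs', '10'],
    job_dir='gs://my-bucket/jobs/run-001/',
    job_id_prefix='train_',
)
```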

View File

@ -1,53 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._common_ops import wait_for_job_done, cancel_job
from kfp_component.core import KfpExecutionContext
from ._client import MLEngineClient
from .. import common as gcp_common
def wait_job(
project_id,
job_id,
wait_interval=30,
show_tensorboard=True,
job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
):
"""Waits a MLEngine job.
Args:
project_id (str): Required. The ID of the parent project of the job.
job_id (str): Required. The ID of the job to wait.
wait_interval (int): optional wait interval between calls
to get job status. Defaults to 30.
show_tensorboard (bool): optional. True to dump Tensorboard metadata.
job_object_output_path: Path for the json payload of the waiting job.
job_id_output_path: Path for the ID of the waiting job.
job_dir_output_path: Path for the `jobDir` of the waiting job.
"""
ml_client = MLEngineClient()
with KfpExecutionContext(on_cancel=lambda: cancel_job(ml_client, project_id, job_id)):
return wait_for_job_done(
ml_client=ml_client,
project_id=project_id,
job_id=job_id,
wait_interval=wait_interval,
show_tensorboard=show_tensorboard,
job_object_output_path=job_object_output_path,
job_id_output_path=job_id_output_path,
job_dir_output_path=job_dir_output_path,
)

View File

@ -1,16 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._download_blob import download_blob
from ._common_ops import parse_blob_path, is_gcs_path

View File

@ -1,41 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
def is_gcs_path(path):
"""Check if the path is a gcs path"""
return path.startswith('gs://')
def parse_blob_path(path):
"""Parse a gcs path into bucket name and blob name
Args:
path (str): the path to parse.
Returns:
(bucket name in the path, blob name in the path)
Raises:
ValueError if the path is not a valid gcs blob path.
Example:
`bucket_name, blob_name = parse_blob_path('gs://foo/bar')`
`bucket_name` is `foo` and `blob_name` is `bar`
"""
match = re.match('gs://([^/]+)/(.+)$', path)
if match:
return match.group(1), match.group(2)
raise ValueError('Path {} is not a valid blob path.'.format(
path))
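A worked example of `parse_blob_path`, following its docstring; the bucket and object names are placeholders.

```python
from kfp_component.google.storage import parse_blob_path

bucket, blob = parse_blob_path('gs://my-bucket/models/model.pkl')
assert (bucket, blob) == ('my-bucket', 'models/model.pkl')

# A path without an object component does not match the regex and raises:
# parse_blob_path('gs://my-bucket')  -> ValueError
```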

View File

@ -1,42 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from google.cloud import storage
from ._common_ops import parse_blob_path
def download_blob(source_blob_path, destination_file_path):
"""Downloads a blob from the bucket.
Args:
source_blob_path (str): the source blob path to download from.
destination_file_path (str): the local file path to download to.
"""
bucket_name, blob_name = parse_blob_path(source_blob_path)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
dirname = os.path.dirname(destination_file_path)
if not os.path.exists(dirname):
os.makedirs(dirname)
with open(destination_file_path, 'wb+') as f:
blob.download_to_file(f)
logging.info('Blob {} downloaded to {}.'.format(
source_blob_path,
destination_file_path))
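A minimal sketch of `download_blob`, assuming application-default GCP credentials; the object and local paths are placeholders.

```python
from kfp_component.google.storage import download_blob

download_blob('gs://my-bucket/config/params.yaml', '/tmp/params.yaml')
```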

View File

@ -1,24 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Entrypoint module to launch python module or file dynamically.
This module makes it easier to build kfp component with python code
by defining a dynamic entrypoint and generate command line arg parser
by python-fire module. It can be used as an entrypoint in the
container spec to run arbitary python module or code in the local
image.
"""
from .launcher import launch

View File

@ -1,45 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import fire
import importlib
import os
import sys
import logging
from .launcher import launch
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(
prog='launcher',
description='Launch a python module or file.')
parser.add_argument('file_or_module', type=str,
help='Either a python file path or a module name.')
parser.add_argument(
'--ui_metadata_path',
type=str,
default='/mlpipeline-ui-metadata.json',
help='Path for the file where the mlpipeline-ui-metadata.json data '
'should be written.')
parser.add_argument('args', nargs=argparse.REMAINDER)
args = parser.parse_args()
if args.ui_metadata_path:
os.environ['KFP_UI_METADATA_PATH'] = args.ui_metadata_path
launch(args.file_or_module, args.args)
if __name__ == '__main__':
main()

View File

@ -1,45 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import fire
import importlib
import sys
import logging
def launch(file_or_module, args):
"""Launches a python file or module as a command entrypoint.
Args:
file_or_module: it is either a file path to python file
a module path.
args: the args passed to the entrypoint function.
Returns:
The return value from the launched function.
"""
try:
module = importlib.import_module(file_or_module)
except Exception:
try:
            if sys.version_info.major > 2:
                # importlib.util is not guaranteed to be loaded by
                # `import importlib` alone, so import it explicitly here.
                import importlib.util
                spec = importlib.util.spec_from_file_location('module', file_or_module)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
else:
import imp
module = imp.load_source('module', file_or_module)
except Exception:
logging.error('Failed to find the module or file: {}'.format(file_or_module))
sys.exit(1)
return fire.Fire(module, command=args, name=module.__name__)
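
A hedged sketch of calling `launch` directly (assuming the launcher package is importable as `kfp_component.launcher`); it dispatches through python-fire, so the members of the named module become subcommands. The choice of the standard-library `json` module and its arguments is arbitrary:

```python
# Illustration only: roughly equivalent to invoking the launcher entrypoint
# with `json dumps "[1, 2]"` on the command line; fire prints the result.
from kfp_component.launcher import launch

launch('json', ['dumps', '[1, 2]'])
```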

View File

@ -1,8 +0,0 @@
--- http.py 2019-05-03 15:07:52.591411824 -0700
+++ http_new.py 2019-05-03 15:09:23.470304022 -0700
@@ -1784,4 +1784,4 @@
http_timeout = socket.getdefaulttimeout()
else:
http_timeout = DEFAULT_HTTP_TIMEOUT_SEC
- return httplib2.Http(timeout=http_timeout)
+ return set_user_agent(httplib2.Http(timeout=http_timeout), '-kfpipeline-')
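
For context, the hunk above patches `build_http()` in `googleapiclient/http.py` so every API client built by the SDK carries a KFP marker in its user agent. A sketch of the same effect applied at call time, without patching the installed library (assumes `google-api-python-client` and `httplib2` are available):

```python
# Illustration only: wraps a plain httplib2.Http the same way the patched
# build_http() does, tagging outgoing requests with a '-kfpipeline-' agent.
import httplib2
from googleapiclient.http import set_user_agent

http = set_user_agent(httplib2.Http(timeout=60), '-kfpipeline-')
```

The wrapped object can then be passed to `googleapiclient.discovery.build(..., http=http)` so that requests made by the resulting client carry the marker.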

View File

@ -1,2 +0,0 @@
pip install -U tox virtualenv
tox "$@"

View File

@ -1,50 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from setuptools import setup
PACKAGE_NAME = 'kfp-component'
VERSION = '1.8.0-alpha.0'
setup(
name=PACKAGE_NAME,
version=VERSION,
description='KubeFlow Pipelines Component SDK',
author='google',
install_requires=[
'kubernetes >= 8.0.1', 'urllib3>=1.15,<1.25', 'fire == 0.1.3',
'google-api-python-client == 1.7.8', 'google-cloud-storage == 1.14.0',
'google-cloud-bigquery == 1.9.0'
],
packages=[
'kfp_component',
],
classifiers=[
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
include_package_data=True,
)

View File

@ -1,4 +0,0 @@
.
flake8
pytest
mock

View File

@ -1,13 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,13 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,92 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from kfp_component.core import display
import mock
import unittest
@mock.patch('kfp_component.core._display.json')
@mock.patch('kfp_component.core._display.os')
@mock.patch('kfp_component.core._display.open')
class DisplayTest(unittest.TestCase):
def test_display_markdown(self, mock_open, mock_os, mock_json):
mock_os.path.isfile.return_value = False
display.display(display.Markdown('# test'))
mock_json.dump.assert_called_with({
'outputs': [{
'type': 'markdown',
'source': '# test',
'storage': 'inline'
}]
}, mock.ANY)
def test_display_markdown_append(self, mock_open, mock_os, mock_json):
mock_os.path.isfile.return_value = True
mock_json.load.return_value = {
'outputs': [{
'type': 'markdown',
'source': '# test 1',
'storage': 'inline'
}]
}
display.display(display.Markdown('# test 2'))
mock_json.dump.assert_called_with({
'outputs': [{
'type': 'markdown',
'source': '# test 1',
'storage': 'inline'
},{
'type': 'markdown',
'source': '# test 2',
'storage': 'inline'
}]
}, mock.ANY)
def test_display_tensorboard(self, mock_open, mock_os, mock_json):
mock_os.path.isfile.return_value = False
display.display(display.Tensorboard('gs://job/dir'))
mock_json.dump.assert_called_with({
'outputs': [{
'type': 'tensorboard',
'source': 'gs://job/dir'
}]
}, mock.ANY)
def test_display_link(self, mock_open, mock_os, mock_json):
mock_os.path.isfile.return_value = False
display.display(display.Link('https://test/link', 'Test Link'))
mock_json.dump.assert_called_with({
'outputs': [{
'type': 'markdown',
'source': '## [Test Link](https://test/link)',
'storage': 'inline'
}]
}, mock.ANY)
def test___repr__(self, mock_open, mock_os, mock_json):
self.assertEqual('# Title', str(display.Markdown('# Title')))
self.assertEqual('Open Tensorboard at: gs://trained/model/',
str(display.Tensorboard('gs://trained/model/')))
self.assertEqual('title: https://test/uri',
str(display.Link('https://test/uri', 'title')))
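
For reference, a minimal sketch of how the helpers exercised by these tests are used inside a component (the output path and URIs are placeholders, and it assumes the display module honours the `KFP_UI_METADATA_PATH` variable that the launcher sets):

```python
# Illustration only: each call is assumed to append an entry to the
# mlpipeline-ui-metadata.json file read by the Kubeflow Pipelines UI.
import os
from kfp_component.core import display

os.environ['KFP_UI_METADATA_PATH'] = '/tmp/mlpipeline-ui-metadata.json'
display.display(display.Markdown('# Training finished'))
display.display(display.Tensorboard('gs://my-bucket/job/dir'))
display.display(display.Link('https://console.cloud.google.com', 'Job console'))
```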

View File

@ -1,139 +0,0 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from kfp_component.core import KfpExecutionContext
from kubernetes import client, config
from kubernetes.client.rest import ApiException
import mock
import unittest
@mock.patch('kubernetes.config.load_incluster_config')
@mock.patch('kubernetes.client.CoreV1Api')
class KfpExecutionContextTest(unittest.TestCase):
def test_init_succeed_without_pod_name(self,
mock_k8s_client, mock_load_config):
with KfpExecutionContext() as ctx:
self.assertFalse(ctx.under_kfp_environment())
pass
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test_init_succeed_when_load_k8s_config_fail(self,
mock_k8s_client, mock_load_config):
mock_load_config.side_effect = Exception()
with KfpExecutionContext() as ctx:
self.assertFalse(ctx.under_kfp_environment())
pass
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test_init_succeed_when_load_k8s_client_fail(self,
mock_k8s_client, mock_load_config):
mock_k8s_client.side_effect = Exception()
with KfpExecutionContext() as ctx:
self.assertFalse(ctx.under_kfp_environment())
pass
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test_init_succeed_when_load_pod_fail(self,
mock_k8s_client, mock_load_config):
mock_k8s_client().read_namespaced_pod.side_effect = Exception()
with KfpExecutionContext() as ctx:
self.assertFalse(ctx.under_kfp_environment())
pass
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test_init_succeed_no_argo_node_name(self,
mock_k8s_client, mock_load_config):
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
mock_pod.metadata.annotations = {}
with KfpExecutionContext() as ctx:
self.assertFalse(ctx.under_kfp_environment())
pass
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id',
'KFP_NAMESPACE': 'mock-namespace'
})
def test_init_succeed(self,
mock_k8s_client, mock_load_config):
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
mock_pod.metadata.annotations = {
'workflows.argoproj.io/node-name': 'node-1'
}
with KfpExecutionContext() as ctx:
self.assertTrue(ctx.under_kfp_environment())
pass
mock_k8s_client().read_namespaced_pod.assert_called_with('mock-pod-id', 'mock-namespace')
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test__exit_gracefully_cancel(self,
mock_k8s_client, mock_load_config):
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
mock_pod.metadata.annotations = {
'workflows.argoproj.io/node-name': 'node-1',
'workflows.argoproj.io/execution': '{"deadline": "1970-01-01T00:00:00Z"}'
}
cancel_handler = mock.Mock()
context = KfpExecutionContext(on_cancel=cancel_handler)
context._exit_gracefully(0, 0)
cancel_handler.assert_called_once()
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test__exit_gracefully_no_cancel(self,
mock_k8s_client, mock_load_config):
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
mock_pod.metadata.annotations = {
'workflows.argoproj.io/node-name': 'node-1'
}
cancel_handler = mock.Mock()
context = KfpExecutionContext(on_cancel=cancel_handler)
context._exit_gracefully(0, 0)
cancel_handler.assert_not_called()
@mock.patch.dict('os.environ', {
'KFP_POD_NAME': 'mock-pod-id'
})
def test_context_id_stable_across_retries(self,
mock_k8s_client, mock_load_config):
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
mock_pod.metadata.annotations = {
'workflows.argoproj.io/node-name': 'node-1'
}
ctx1 = KfpExecutionContext()
ctx2 = KfpExecutionContext()
self.assertEqual(ctx1.context_id(), ctx2.context_id())
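
A hedged sketch of the intended use of the context under test; the cancel handler and job naming are placeholders:

```python
# Illustration only: context_id() is stable across KFP retries (as asserted
# above), and on_cancel is invoked when the context detects cancellation
# (in the tests, via the Argo execution-deadline annotation).
from kfp_component.core import KfpExecutionContext

def abandon_remote_job():
    print('Cancellation detected; cleaning up the remote job.')

with KfpExecutionContext(on_cancel=abandon_remote_job) as ctx:
    job_id = 'job-' + ctx.context_id()
    # ... submit and poll a long-running GCP job keyed by job_id ...
```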

Some files were not shown because too many files have changed in this diff.