chore(components): clean up deprecated GCP components (#8685)
* delete deprecated GCP components
* remove build and test scripts, references to code/folder from master
This commit is contained in:
parent 8552226c41
commit 5fe67919db
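Because the component sources are removed from master, the documentation updated later in this diff pins its links to the release-1.7 branch. A minimal sketch of consuming one of those pinned components with the KFP v1 SDK follows; the exact component.yaml path on release-1.7 is assumed from the directory layout the docs reference, not confirmed by this diff.

```python
# Minimal sketch, KFP v1 SDK: load a deprecated GCP component from the pinned
# release-1.7 branch instead of master, which no longer carries components/gcp.
# The component.yaml path below is assumed, not shown in this diff.
from kfp import components

dataproc_create_cluster_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/release-1.7/'
    'components/gcp/dataproc/create_cluster/component.yaml'
)
```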
@@ -32,21 +32,6 @@ steps:
# id: 'copyPythonSDKToLatest'
# waitFor: ['preparePythonSDK']

# # Build the Python Component SDK
# - name: 'python:2-alpine'
# entrypoint: '/bin/sh'
# args: ['-c', 'cd /workspace/components/gcp/container/component_sdk/python;python setup.py sdist --format=gztar; cp dist/*.tar.gz /workspace/kfp-component.tar.gz']
# id: 'preparePythonComponentSDK'
# waitFor: ["-"]
# - name: 'gcr.io/cloud-builders/gsutil'
# args: ['cp', '/workspace/kfp-component.tar.gz', 'gs://$PROJECT_ID/builds/$COMMIT_SHA/kfp-component.tar.gz']
# id: 'copyPythonComponentSDK'
# waitFor: ['preparePythonComponentSDK']
# - name: 'gcr.io/cloud-builders/gsutil'
# args: ['cp', '/workspace/kfp-component.tar.gz', 'gs://$PROJECT_ID/builds/latest/kfp-component.tar.gz']
# id: 'copyPythonComponentSDKToLatest'
# waitFor: ['preparePythonComponentSDK']

# Build the pipeline system images
- name: 'gcr.io/cloud-builders/docker'
entrypoint: /bin/bash

@@ -147,13 +132,6 @@ steps:
id: 'buildGpuTrainer'
waitFor: ["-"]

# Build the Generic GCP component image
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'
args: ['-c', 'cd /workspace/components/gcp/container/ && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
id: 'buildGcpGenericComponent'
waitFor: ["-"]

# Build the local pipeline component images
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'
@@ -55,7 +55,7 @@ Name | Description | Type
:--- | :---------- | :---
cluster_name | The name of the cluster. | String

Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).
Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/release-1.7/components/gcp/dataproc/delete_cluster).


## Cautions & requirements

@@ -167,9 +167,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg

## References
* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/create_cluster/sample.ipynb)
* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)

## License
@ -39,7 +39,7 @@
|
|||
":--- | :---------- | :---\n",
|
||||
"cluster_name | The name of the cluster. | String\n",
|
||||
"\n",
|
||||
"Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster).\n",
|
||||
"Note: You can recycle the cluster by using the [Dataproc delete cluster component](https://github.com/kubeflow/pipelines/tree/release-1.7/components/gcp/dataproc/delete_cluster).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
|
|
@ -211,9 +211,9 @@
|
|||
"source": [
|
||||
"## References\n",
|
||||
"* [Kubernetes Engine for Kubeflow](https://www.kubeflow.org/docs/started/getting-started-gke/#gcp-service-accounts)\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/create_cluster/sample.ipynb)\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_create_cluster.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/create_cluster/sample.ipynb)\n",
|
||||
"* [Dataproc create cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/create)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
|
|
@ -242,4 +242,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -138,9 +138,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
|
|||
|
||||
## References
|
||||
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/delete_cluster/sample.ipynb)
|
||||
* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -187,9 +187,9 @@
|
|||
"source": [
|
||||
"## References\n",
|
||||
"\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/delete_cluster/sample.ipynb)\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_delete_cluster.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/delete_cluster/sample.ipynb)\n",
|
||||
"* [Dataproc delete cluster REST API](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters/delete)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
|
@ -228,4 +228,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -196,9 +196,9 @@ The sample in the notebook will count the words in the input text and save them
|
|||
```
|
||||
|
||||
## References
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)
|
||||
* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)
|
||||
|
||||
# License
|
||||
|
|
|
|||
|
|
@ -279,9 +279,9 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hadoop_job.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hadoop_job/sample.ipynb)\n",
|
||||
"* [Dataproc HadoopJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
|
|
@ -310,4 +310,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -180,9 +180,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
|
|||
```
|
||||
|
||||
## References
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hive_job/sample.ipynb)
|
||||
* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)
|
||||
|
||||
## License
|
||||
|
|
|
|||
|
|
@ -230,9 +230,9 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_hive_job/sample.ipynb)\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_hive_job.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_hive_job/sample.ipynb)\n",
|
||||
"* [Dataproc HiveJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
|
|
@ -261,4 +261,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -191,9 +191,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
|
|||
|
||||
## References
|
||||
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_spark_job/sample.ipynb)
|
||||
* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)
|
||||
|
||||
## License
|
||||
|
|
|
|||
|
|
@ -232,9 +232,9 @@
|
|||
"source": [
|
||||
"## References\n",
|
||||
"\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/dataproc/submit_spark_job/sample.ipynb)\n",
|
||||
"* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/dataproc/_submit_spark_job.py)\n",
|
||||
"* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/dataproc/submit_spark_job/sample.ipynb)\n",
|
||||
"* [Dataproc SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
|
|
@ -263,4 +263,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -191,9 +191,9 @@ OUTPUT_FILES_PATTERN = OUTPUT_GCS_PATH + '*'
|
|||
```
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/batch_predict/sample.ipynb)
|
||||
* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
|
||||
|
||||
## License
|
||||
|
|
|
|||
|
|
@ -276,9 +276,9 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/batch_predict/sample.ipynb)\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_batch_predict.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/batch_predict/sample.ipynb)\n",
|
||||
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
|
|
@ -307,4 +307,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -186,9 +186,9 @@ run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arg
|
|||
```
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/deploy/sample.ipynb)
|
||||
* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)
|
||||
* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)
|
||||
|
||||
|
|
|
|||
|
|
@ -247,9 +247,9 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/deploy/sample.ipynb)\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_deploy.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/deploy/sample.ipynb)\n",
|
||||
"* [Cloud Machine Learning Engine Model REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models)\n",
|
||||
"* [Cloud Machine Learning Engine Version REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.versions)\n",
|
||||
"\n",
|
||||
|
|
|
|||
|
|
@ -232,9 +232,9 @@ Use the following command to inspect the contents in the output directory:
|
|||
```
|
||||
|
||||
## References
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)
|
||||
* [Component Python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)
|
||||
* [Component Docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/train/sample.ipynb)
|
||||
* [AI Platform REST API - Resource: Job](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)
|
||||
|
||||
## License
|
||||
|
|
|
|||
|
|
@ -325,9 +325,9 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/ml_engine/train/sample.ipynb)\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/component_sdk/python/kfp_component/google/ml_engine/_train.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/release-1.7/components/gcp/ml_engine/train/sample.ipynb)\n",
|
||||
"* [Cloud Machine Learning Engine job REST API](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
|
|
|
|||
|
|
@@ -1,3 +0,0 @@
# Deprecation Warning

The components in this directory have moved to [components/contrib/google-cloud/automl](https://github.com/kubeflow/pipelines/tree/master/components/contrib/google-cloud/automl). This directory will be removed by the end of 2021.
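For pipelines that still need these AutoML steps, the deprecation note above points at the relocated copies under components/contrib/google-cloud/automl. A minimal sketch of loading and using the relocated create-dataset component follows; the exact component.yaml path under the contrib tree is assumed, since only the new directory is stated in the note.

```python
# Minimal sketch, KFP v1 SDK: load the relocated AutoML component from the
# contrib tree. The component.yaml path is assumed; only the move to
# components/contrib/google-cloud/automl is stated in the deprecation note.
import kfp
from kfp import components

create_dataset_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/'
    'components/contrib/google-cloud/automl/create_dataset_for_tables/component.yaml'
)

@kfp.dsl.pipeline(name='automl-create-dataset')
def automl_dataset_pipeline(gcp_project_id: str, gcp_region: str = 'us-central1'):
    # Inputs mirror the deleted component spec: gcp_project_id, gcp_region, display_name.
    create_dataset_op(
        gcp_project_id=gcp_project_id,
        gcp_region=gcp_region,
        display_name='my_dataset',
    )
```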
@@ -1,69 +0,0 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import NamedTuple


def automl_create_dataset_for_tables(
    gcp_project_id: str,
    gcp_region: str,
    display_name: str,
    description: str = None,
    tables_dataset_metadata: dict = {},
    retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
    timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
    metadata: dict = None,
) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str), ('dataset_url', 'URI')]):
    '''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
    '''
    import google
    from google.cloud import automl
    client = automl.AutoMlClient()

    location_path = client.location_path(gcp_project_id, gcp_region)
    dataset_dict = {
        'display_name': display_name,
        'description': description,
        'tables_dataset_metadata': tables_dataset_metadata,
    }
    dataset = client.create_dataset(
        location_path,
        dataset_dict,
        retry or google.api_core.gapic_v1.method.DEFAULT,
        timeout or google.api_core.gapic_v1.method.DEFAULT,
        metadata,
    )
    print(dataset)
    dataset_id = dataset.name.rsplit('/', 1)[-1]
    dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
        project_id=gcp_project_id,
        region=gcp_region,
        dataset_id=dataset_id,
    )
    return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)


if __name__ == '__main__':
    from kfp.components import create_component_from_func

    automl_create_dataset_for_tables_op = create_component_from_func(
        automl_create_dataset_for_tables,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['google-cloud-automl==0.4.0'],
        annotations={
            "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml",
        },
    )
@ -1,148 +0,0 @@
|
|||
name: Automl create dataset for tables
|
||||
description: automl_create_dataset_for_tables creates an empty Dataset for AutoML
|
||||
tables
|
||||
inputs:
|
||||
- {name: gcp_project_id, type: String}
|
||||
- {name: gcp_region, type: String}
|
||||
- {name: display_name, type: String}
|
||||
- {name: description, type: String, optional: true}
|
||||
- {name: tables_dataset_metadata, type: JsonObject, default: '{}', optional: true}
|
||||
- {name: retry, optional: true}
|
||||
- {name: timeout, type: Float, optional: true}
|
||||
- {name: metadata, type: JsonObject, optional: true}
|
||||
outputs:
|
||||
- {name: dataset_path, type: String}
|
||||
- {name: create_time, type: String}
|
||||
- {name: dataset_id, type: String}
|
||||
- {name: dataset_url, type: URI}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_dataset_for_tables/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_create_dataset_for_tables(
|
||||
gcp_project_id ,
|
||||
gcp_region ,
|
||||
display_name ,
|
||||
description = None,
|
||||
tables_dataset_metadata = {},
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata = None,
|
||||
) :
|
||||
'''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
|
||||
'''
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
dataset_dict = {
|
||||
'display_name': display_name,
|
||||
'description': description,
|
||||
'tables_dataset_metadata': tables_dataset_metadata,
|
||||
}
|
||||
dataset = client.create_dataset(
|
||||
location_path,
|
||||
dataset_dict,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
print(dataset)
|
||||
dataset_id = dataset.name.rsplit('/', 1)[-1]
|
||||
dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
)
|
||||
return (dataset.name, str(dataset.create_time), dataset_id, dataset_url)
|
||||
|
||||
import json
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables')
|
||||
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--description", dest="description", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_create_dataset_for_tables(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
_serialize_str,
|
||||
_serialize_str,
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --gcp-project-id
|
||||
- {inputValue: gcp_project_id}
|
||||
- --gcp-region
|
||||
- {inputValue: gcp_region}
|
||||
- --display-name
|
||||
- {inputValue: display_name}
|
||||
- if:
|
||||
cond: {isPresent: description}
|
||||
then:
|
||||
- --description
|
||||
- {inputValue: description}
|
||||
- if:
|
||||
cond: {isPresent: tables_dataset_metadata}
|
||||
then:
|
||||
- --tables-dataset-metadata
|
||||
- {inputValue: tables_dataset_metadata}
|
||||
- if:
|
||||
cond: {isPresent: retry}
|
||||
then:
|
||||
- --retry
|
||||
- {inputValue: retry}
|
||||
- if:
|
||||
cond: {isPresent: timeout}
|
||||
then:
|
||||
- --timeout
|
||||
- {inputValue: timeout}
|
||||
- if:
|
||||
cond: {isPresent: metadata}
|
||||
then:
|
||||
- --metadata
|
||||
- {inputValue: metadata}
|
||||
- '----output-paths'
|
||||
- {outputPath: dataset_path}
|
||||
- {outputPath: create_time}
|
||||
- {outputPath: dataset_id}
|
||||
- {outputPath: dataset_url}
|
||||
|
|
@ -1,71 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_create_model_for_tables(
|
||||
gcp_project_id: str,
|
||||
gcp_region: str,
|
||||
display_name: str,
|
||||
dataset_id: str,
|
||||
target_column_path: str = None,
|
||||
input_feature_column_paths: list = None,
|
||||
optimization_objective: str = 'MAXIMIZE_AU_PRC',
|
||||
train_budget_milli_node_hours: int = 1000,
|
||||
) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str), ('model_page_url', 'URI'),]):
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
model_dict = {
|
||||
'display_name': display_name,
|
||||
'dataset_id': dataset_id,
|
||||
'tables_model_metadata': {
|
||||
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
|
||||
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
|
||||
'optimization_objective': optimization_objective,
|
||||
'train_budget_milli_node_hours': train_budget_milli_node_hours,
|
||||
},
|
||||
}
|
||||
|
||||
create_model_response = client.create_model(location_path, model_dict)
|
||||
print('Create model operation: {}'.format(create_model_response.operation))
|
||||
result = create_model_response.result()
|
||||
print(result)
|
||||
model_name = result.name
|
||||
model_id = model_name.rsplit('/', 1)[-1]
|
||||
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
model_id=model_id,
|
||||
)
|
||||
|
||||
return (model_name, model_id, model_url)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_create_model_for_tables_op = create_component_from_func(
|
||||
automl_create_model_for_tables,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
packages_to_install=['google-cloud-automl==0.4.0'],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,142 +0,0 @@
|
|||
name: Automl create model for tables
|
||||
inputs:
|
||||
- {name: gcp_project_id, type: String}
|
||||
- {name: gcp_region, type: String}
|
||||
- {name: display_name, type: String}
|
||||
- {name: dataset_id, type: String}
|
||||
- {name: target_column_path, type: String, optional: true}
|
||||
- {name: input_feature_column_paths, type: JsonArray, optional: true}
|
||||
- {name: optimization_objective, type: String, default: MAXIMIZE_AU_PRC, optional: true}
|
||||
- {name: train_budget_milli_node_hours, type: Integer, default: '1000', optional: true}
|
||||
outputs:
|
||||
- {name: model_path, type: String}
|
||||
- {name: model_id, type: String}
|
||||
- {name: model_page_url, type: URI}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/create_model_for_tables/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_create_model_for_tables(
|
||||
gcp_project_id ,
|
||||
gcp_region ,
|
||||
display_name ,
|
||||
dataset_id ,
|
||||
target_column_path = None,
|
||||
input_feature_column_paths = None,
|
||||
optimization_objective = 'MAXIMIZE_AU_PRC',
|
||||
train_budget_milli_node_hours = 1000,
|
||||
) :
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
location_path = client.location_path(gcp_project_id, gcp_region)
|
||||
model_dict = {
|
||||
'display_name': display_name,
|
||||
'dataset_id': dataset_id,
|
||||
'tables_model_metadata': {
|
||||
'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
|
||||
'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
|
||||
'optimization_objective': optimization_objective,
|
||||
'train_budget_milli_node_hours': train_budget_milli_node_hours,
|
||||
},
|
||||
}
|
||||
|
||||
create_model_response = client.create_model(location_path, model_dict)
|
||||
print('Create model operation: {}'.format(create_model_response.operation))
|
||||
result = create_model_response.result()
|
||||
print(result)
|
||||
model_name = result.name
|
||||
model_id = model_name.rsplit('/', 1)[-1]
|
||||
model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format(
|
||||
project_id=gcp_project_id,
|
||||
region=gcp_region,
|
||||
dataset_id=dataset_id,
|
||||
model_id=model_id,
|
||||
)
|
||||
|
||||
return (model_name, model_id, model_url)
|
||||
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl create model for tables', description='')
|
||||
_parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_create_model_for_tables(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
_serialize_str,
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --gcp-project-id
|
||||
- {inputValue: gcp_project_id}
|
||||
- --gcp-region
|
||||
- {inputValue: gcp_region}
|
||||
- --display-name
|
||||
- {inputValue: display_name}
|
||||
- --dataset-id
|
||||
- {inputValue: dataset_id}
|
||||
- if:
|
||||
cond: {isPresent: target_column_path}
|
||||
then:
|
||||
- --target-column-path
|
||||
- {inputValue: target_column_path}
|
||||
- if:
|
||||
cond: {isPresent: input_feature_column_paths}
|
||||
then:
|
||||
- --input-feature-column-paths
|
||||
- {inputValue: input_feature_column_paths}
|
||||
- if:
|
||||
cond: {isPresent: optimization_objective}
|
||||
then:
|
||||
- --optimization-objective
|
||||
- {inputValue: optimization_objective}
|
||||
- if:
|
||||
cond: {isPresent: train_budget_milli_node_hours}
|
||||
then:
|
||||
- --train-budget-milli-node-hours
|
||||
- {inputValue: train_budget_milli_node_hours}
|
||||
- '----output-paths'
|
||||
- {outputPath: model_path}
|
||||
- {outputPath: model_id}
|
||||
- {outputPath: model_page_url}
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
from typing import NamedTuple
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
|
||||
def automl_deploy_model(
|
||||
model_path: str,
|
||||
) -> NamedTuple('Outputs', [
|
||||
('model_path', str),
|
||||
]):
|
||||
"""Deploys a trained model.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
response = client.deploy_model(
|
||||
name=model_path,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (model_path, )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
automl_deploy_model_op = create_component_from_func(
|
||||
automl_deploy_model,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.8',
|
||||
packages_to_install=[
|
||||
'google-cloud-automl==2.0.0',
|
||||
],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
name: Automl deploy model
|
||||
description: |-
|
||||
Deploys a trained model.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
inputs:
|
||||
- {name: model_path, type: String}
|
||||
outputs:
|
||||
- {name: model_path, type: String}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/deploy_model/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.8
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_deploy_model(
|
||||
model_path,
|
||||
):
|
||||
"""Deploys a trained model.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
response = client.deploy_model(
|
||||
name=model_path,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (model_path, )
|
||||
|
||||
def _serialize_str(str_value: str) -> str:
|
||||
if not isinstance(str_value, str):
|
||||
raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
|
||||
return str_value
|
||||
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl deploy model', description="Deploys a trained model.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
|
||||
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_deploy_model(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
_serialize_str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --model-path
|
||||
- {inputValue: model_path}
|
||||
- '----output-paths'
|
||||
- {outputPath: model_path}
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_export_data_to_gcs(
|
||||
dataset_path: str,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = {},
|
||||
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
|
||||
"""Exports dataset data to GCS."""
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
|
||||
|
||||
response = client.export_data(
|
||||
name=dataset_path,
|
||||
output_config=output_config,
|
||||
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
|
||||
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata=metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (gcs_output_uri_prefix, )
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_export_data_to_gcs_op = create_component_from_func(
|
||||
automl_export_data_to_gcs,
|
||||
output_component_file='component.yaml',base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
name: Automl export data to gcs
|
||||
description: |
|
||||
Exports dataset data to GCS.
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
- name: gcs_output_uri_prefix
|
||||
optional: true
|
||||
type: String
|
||||
- name: timeout
|
||||
optional: true
|
||||
type: Float
|
||||
- default: '{}'
|
||||
name: metadata
|
||||
optional: true
|
||||
type: JsonObject
|
||||
outputs:
|
||||
- name: gcs_output_uri_prefix
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_data_to_gcs/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_export_data_to_gcs(
|
||||
dataset_path: str,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
#retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = {},
|
||||
) -> NamedTuple('Outputs', [('gcs_output_uri_prefix', str)]):
|
||||
"""Exports dataset data to GCS."""
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, "-m", "pip", "install", "google-cloud-automl==0.4.0", "--quiet", "--no-warn-script-location"], env={"PIP_DISABLE_PIP_VERSION_CHECK": "1"}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
|
||||
output_config = {"gcs_destination": {"output_uri_prefix": gcs_output_uri_prefix}}
|
||||
|
||||
response = client.export_data(
|
||||
name=dataset_path,
|
||||
output_config=output_config,
|
||||
#retry=retry or google.api_core.gapic_v1.method.DEFAULT
|
||||
timeout=timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata=metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (gcs_output_uri_prefix, )
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl export data to gcs', description='Exports dataset data to GCS.\n')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_export_data_to_gcs(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
_output_serializers = [
|
||||
str
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- if:
|
||||
cond:
|
||||
isPresent: gcs_output_uri_prefix
|
||||
then:
|
||||
- --gcs-output-uri-prefix
|
||||
- inputValue: gcs_output_uri_prefix
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: gcs_output_uri_prefix
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
from typing import NamedTuple
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
|
||||
def automl_export_model_to_gcs(
|
||||
model_path: str,
|
||||
gcs_output_uri_prefix: str,
|
||||
model_format: str = 'tf_saved_model',
|
||||
) -> NamedTuple('Outputs', [
|
||||
('model_directory', 'Uri'),
|
||||
]):
|
||||
"""Exports a trained model to a user specified Google Cloud Storage location.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
|
||||
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
|
||||
client = automl.AutoMlClient()
|
||||
response = client.export_model(
|
||||
name=model_path,
|
||||
output_config=automl.ModelExportOutputConfig(
|
||||
model_format=model_format,
|
||||
gcs_destination=automl.GcsDestination(
|
||||
output_uri_prefix=gcs_output_uri_prefix,
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (metadata.export_model_details.output_info.gcs_output_directory, )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
automl_export_model_to_gcs_op = create_component_from_func(
|
||||
automl_export_model_to_gcs,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.8',
|
||||
packages_to_install=[
|
||||
'google-cloud-automl==2.0.0',
|
||||
],
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
name: Automl export model to gcs
|
||||
description: |-
|
||||
Exports a trained model to a user specified Google Cloud Storage location.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
|
||||
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
inputs:
|
||||
- {name: model_path, type: String}
|
||||
- {name: gcs_output_uri_prefix, type: String}
|
||||
- {name: model_format, type: String, default: tf_saved_model, optional: true}
|
||||
outputs:
|
||||
- {name: model_directory, type: Uri}
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.8
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
|
||||
'google-cloud-automl==2.0.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
|
||||
install --quiet --no-warn-script-location 'google-cloud-automl==2.0.0' --user)
|
||||
&& "$0" "$@"
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
def automl_export_model_to_gcs(
|
||||
model_path,
|
||||
gcs_output_uri_prefix,
|
||||
model_format = 'tf_saved_model',
|
||||
):
|
||||
"""Exports a trained model to a user specified Google Cloud Storage location.
|
||||
|
||||
Args:
|
||||
model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'
|
||||
gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.
|
||||
model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig
|
||||
|
||||
Annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
"""
|
||||
from google.cloud import automl
|
||||
|
||||
client = automl.AutoMlClient()
|
||||
response = client.export_model(
|
||||
name=model_path,
|
||||
output_config=automl.ModelExportOutputConfig(
|
||||
model_format=model_format,
|
||||
gcs_destination=automl.GcsDestination(
|
||||
output_uri_prefix=gcs_output_uri_prefix,
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
return (metadata.export_model_details.output_info.gcs_output_directory, )
|
||||
|
||||
import argparse
|
||||
_parser = argparse.ArgumentParser(prog='Automl export model to gcs', description="Exports a trained model to a user specified Google Cloud Storage location.\n\n Args:\n model_path: The resource name of the model to export. Format: 'projects/<project>/locations/<location>/models/<model>'\n gcs_output_uri_prefix: The Google Cloud Storage directory where the model should be written to. Must be in the same location as AutoML. Required location: us-central1.\n model_format: The format in which the model must be exported. The available, and default, formats depend on the problem and model type. Possible formats: tf_saved_model, tf_js, tflite, core_ml, edgetpu_tflite. See https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models/export?hl=en#modelexportoutputconfig\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>")
|
||||
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=True, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("--model-format", dest="model_format", type=str, required=False, default=argparse.SUPPRESS)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = vars(_parser.parse_args())
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_export_model_to_gcs(**_parsed_args)
|
||||
|
||||
_output_serializers = [
|
||||
str,
|
||||
|
||||
]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(_output_serializers[idx](_outputs[idx]))
|
||||
args:
|
||||
- --model-path
|
||||
- {inputValue: model_path}
|
||||
- --gcs-output-uri-prefix
|
||||
- {inputValue: gcs_output_uri_prefix}
|
||||
- if:
|
||||
cond: {isPresent: model_format}
|
||||
then:
|
||||
- --model-format
|
||||
- {inputValue: model_format}
|
||||
- '----output-paths'
|
||||
- {outputPath: model_directory}
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_import_data_from_bigquery(
|
||||
dataset_path,
|
||||
input_uri: str,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'bigquery_source': {
|
||||
'input_uri': input_uri,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
automl_import_data_from_bigquery_op = create_component_from_func(
|
||||
automl_import_data_from_bigquery,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
name: Automl import data from bigquery
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
- name: input_uri
|
||||
type: String
|
||||
- name: retry
|
||||
optional: true
|
||||
- name: timeout
|
||||
optional: true
|
||||
- name: metadata
|
||||
type: JsonObject
|
||||
optional: true
|
||||
outputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_import_data_from_bigquery(
|
||||
dataset_path,
|
||||
input_uri: str,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'bigquery_source': {
|
||||
'input_uri': input_uri,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path)
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl import data from bigquery', description='')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--input-uri", dest="input_uri", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_import_data_from_bigquery(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- --input-uri
|
||||
- inputValue: input_uri
|
||||
- if:
|
||||
cond:
|
||||
isPresent: retry
|
||||
then:
|
||||
- --retry
|
||||
- inputValue: retry
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: dataset_path
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_import_data_from_gcs(
|
||||
dataset_path: str,
|
||||
input_uris: list,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'gcs_source': {
|
||||
'input_uris': input_uris,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_import_data_from_gcs_op = create_component_from_func(
|
||||
automl_import_data_from_gcs,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,113 +0,0 @@
|
|||
name: Automl import data from gcs
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
- name: input_uris
|
||||
type: JsonArray
|
||||
- name: retry
|
||||
optional: true
|
||||
- name: timeout
|
||||
optional: true
|
||||
- name: metadata
|
||||
type: JsonObject
|
||||
optional: true
|
||||
outputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_gcs/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_import_data_from_gcs(
|
||||
dataset_path: str,
|
||||
input_uris: list,
|
||||
retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('dataset_path', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
import google
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
input_config = {
|
||||
'gcs_source': {
|
||||
'input_uris': input_uris,
|
||||
},
|
||||
}
|
||||
response = client.import_data(
|
||||
dataset_path,
|
||||
input_config,
|
||||
retry or google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout or google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata,
|
||||
)
|
||||
result = response.result()
|
||||
print(result)
|
||||
metadata = response.metadata
|
||||
print(metadata)
|
||||
return (dataset_path)
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl import data from gcs', description='')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--input-uris", dest="input_uris", type=json.loads, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_import_data_from_gcs(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- --input-uris
|
||||
- inputValue: input_uris
|
||||
- if:
|
||||
cond:
|
||||
isPresent: retry
|
||||
then:
|
||||
- --retry
|
||||
- inputValue: retry
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: dataset_path
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_prediction_service_batch_predict(
|
||||
model_path,
|
||||
gcs_input_uris: list = None,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
bq_input_uri: str = None,
|
||||
bq_output_uri: str = None,
|
||||
params=None,
|
||||
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
input_config = {}
|
||||
if gcs_input_uris:
|
||||
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
|
||||
if bq_input_uri:
|
||||
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
|
||||
|
||||
output_config = {}
|
||||
if gcs_output_uri_prefix:
|
||||
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
|
||||
if bq_output_uri:
|
||||
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.PredictionServiceClient()
|
||||
response = client.batch_predict(
|
||||
model_path,
|
||||
input_config,
|
||||
output_config,
|
||||
params,
|
||||
retry,
|
||||
timeout,
|
||||
metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
output_info = metadata.batch_predict_details.output_info
|
||||
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
|
||||
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_prediction_service_batch_predict_op = create_component_from_func(
|
||||
automl_prediction_service_batch_predict,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml",
|
||||
},
|
||||
)
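As a companion to the factory call above, a minimal usage sketch in a KFP v1 pipeline; the spec path, model resource name, and Cloud Storage URIs are hypothetical placeholders. Note that, because of the Argo workaround in the function body, an unused output comes back as `'-'`:

```python
from kfp import dsl
from kfp.components import load_component_from_file

# Load the component spec generated by create_component_from_func above
# (the local path is an assumption).
batch_predict_op = load_component_from_file('component.yaml')

@dsl.pipeline(name='automl-batch-predict')
def batch_predict_pipeline(
    # Hypothetical placeholders; replace with real resource names and URIs.
    model_path='projects/<project>/locations/us-central1/models/<model>',
    gcs_input_uris='["gs://<bucket>/input/rows.csv"]',   # JSON-encoded list
    gcs_output_uri_prefix='gs://<bucket>/predictions/',
):
    predict_task = batch_predict_op(
        model_path=model_path,
        gcs_input_uris=gcs_input_uris,
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )
    # Downstream steps can read predict_task.outputs['gcs_output_directory'];
    # it is '-' when the GCS destination was not used.
```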
|
||||
|
|
@ -1,175 +0,0 @@
|
|||
name: Automl prediction service batch predict
|
||||
inputs:
|
||||
- name: model_path
|
||||
- name: gcs_input_uris
|
||||
type: JsonArray
|
||||
optional: true
|
||||
- name: gcs_output_uri_prefix
|
||||
type: String
|
||||
optional: true
|
||||
- name: bq_input_uri
|
||||
type: String
|
||||
optional: true
|
||||
- name: bq_output_uri
|
||||
type: String
|
||||
optional: true
|
||||
- name: params
|
||||
optional: true
|
||||
- name: retry
|
||||
optional: true
|
||||
- name: timeout
|
||||
optional: true
|
||||
- name: metadata
|
||||
type: JsonObject
|
||||
optional: true
|
||||
outputs:
|
||||
- name: gcs_output_directory
|
||||
type: String
|
||||
- name: bigquery_output_dataset
|
||||
type: String
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/prediction_service_batch_predict/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_prediction_service_batch_predict(
|
||||
model_path,
|
||||
gcs_input_uris: str = None,
|
||||
gcs_output_uri_prefix: str = None,
|
||||
bq_input_uri: str = None,
|
||||
bq_output_uri: str = None,
|
||||
params=None,
|
||||
retry=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
|
||||
metadata: dict = None,
|
||||
) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
input_config = {}
|
||||
if gcs_input_uris:
|
||||
input_config['gcs_source'] = {'input_uris': gcs_input_uris}
|
||||
if bq_input_uri:
|
||||
input_config['bigquery_source'] = {'input_uri': bq_input_uri}
|
||||
|
||||
output_config = {}
|
||||
if gcs_output_uri_prefix:
|
||||
output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
|
||||
if bq_output_uri:
|
||||
output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.PredictionServiceClient()
|
||||
response = client.batch_predict(
|
||||
model_path,
|
||||
input_config,
|
||||
output_config,
|
||||
params,
|
||||
retry,
|
||||
timeout,
|
||||
metadata,
|
||||
)
|
||||
print('Operation started:')
|
||||
print(response.operation)
|
||||
result = response.result()
|
||||
metadata = response.metadata
|
||||
print('Operation finished:')
|
||||
print(metadata)
|
||||
output_info = metadata.batch_predict_details.output_info
|
||||
# Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo-workflows/pull/1277/files#r326028422
|
||||
return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
|
||||
|
||||
import json
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl prediction service batch predict', description='')
|
||||
_parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--gcs-input-uris", dest="gcs_input_uris", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--bq-input-uri", dest="bq_input_uri", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--bq-output-uri", dest="bq_output_uri", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--params", dest="params", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
|
||||
_parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_prediction_service_batch_predict(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --model-path
|
||||
- inputValue: model_path
|
||||
- if:
|
||||
cond:
|
||||
isPresent: gcs_input_uris
|
||||
then:
|
||||
- --gcs-input-uris
|
||||
- inputValue: gcs_input_uris
|
||||
- if:
|
||||
cond:
|
||||
isPresent: gcs_output_uri_prefix
|
||||
then:
|
||||
- --gcs-output-uri-prefix
|
||||
- inputValue: gcs_output_uri_prefix
|
||||
- if:
|
||||
cond:
|
||||
isPresent: bq_input_uri
|
||||
then:
|
||||
- --bq-input-uri
|
||||
- inputValue: bq_input_uri
|
||||
- if:
|
||||
cond:
|
||||
isPresent: bq_output_uri
|
||||
then:
|
||||
- --bq-output-uri
|
||||
- inputValue: bq_output_uri
|
||||
- if:
|
||||
cond:
|
||||
isPresent: params
|
||||
then:
|
||||
- --params
|
||||
- inputValue: params
|
||||
- if:
|
||||
cond:
|
||||
isPresent: retry
|
||||
then:
|
||||
- --retry
|
||||
- inputValue: retry
|
||||
- if:
|
||||
cond:
|
||||
isPresent: timeout
|
||||
then:
|
||||
- --timeout
|
||||
- inputValue: timeout
|
||||
- if:
|
||||
cond:
|
||||
isPresent: metadata
|
||||
then:
|
||||
- --metadata
|
||||
- inputValue: metadata
|
||||
- '----output-paths'
|
||||
- outputPath: gcs_output_directory
|
||||
- outputPath: bigquery_output_dataset
|
||||
|
|
@ -1,59 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
def automl_split_dataset_table_column_names(
|
||||
dataset_path: str,
|
||||
target_column_name: str,
|
||||
table_index: int = 0,
|
||||
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
list_table_specs_response = client.list_table_specs(dataset_path)
|
||||
table_specs = [s for s in list_table_specs_response]
|
||||
print('table_specs=')
|
||||
print(table_specs)
|
||||
table_spec_name = table_specs[table_index].name
|
||||
|
||||
list_column_specs_response = client.list_column_specs(table_spec_name)
|
||||
column_specs = [s for s in list_column_specs_response]
|
||||
print('column_specs=')
|
||||
print(column_specs)
|
||||
|
||||
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
|
||||
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
|
||||
feature_column_names = [s.name for s in feature_column_specs]
|
||||
|
||||
import json
|
||||
return (target_column_spec.name, json.dumps(feature_column_names))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from kfp.components import create_component_from_func
|
||||
|
||||
automl_split_dataset_table_column_names_op = create_component_from_func(
|
||||
automl_split_dataset_table_column_names,
|
||||
output_component_file='component.yaml',
|
||||
base_image='python:3.7',
|
||||
annotations={
|
||||
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
|
||||
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml",
|
||||
},
|
||||
)
|
||||
|
|
@ -1,95 +0,0 @@
|
|||
name: Automl split dataset table column names
|
||||
inputs:
|
||||
- name: dataset_path
|
||||
type: String
|
||||
- name: target_column_name
|
||||
type: String
|
||||
- name: table_index
|
||||
type: Integer
|
||||
default: '0'
|
||||
optional: true
|
||||
outputs:
|
||||
- name: target_column_path
|
||||
type: String
|
||||
- name: feature_column_paths
|
||||
type: JsonArray
|
||||
metadata:
|
||||
annotations:
|
||||
author: Alexey Volkov <alexey.volkov@ark-kun.com>
|
||||
canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml'
|
||||
implementation:
|
||||
container:
|
||||
image: python:3.7
|
||||
command:
|
||||
- python3
|
||||
- -u
|
||||
- -c
|
||||
- |
|
||||
from typing import NamedTuple
|
||||
|
||||
def automl_split_dataset_table_column_names(
|
||||
dataset_path: str,
|
||||
target_column_name: str,
|
||||
table_index: int = 0,
|
||||
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
|
||||
import sys
|
||||
import subprocess
|
||||
subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
|
||||
|
||||
from google.cloud import automl
|
||||
client = automl.AutoMlClient()
|
||||
list_table_specs_response = client.list_table_specs(dataset_path)
|
||||
table_specs = [s for s in list_table_specs_response]
|
||||
print('table_specs=')
|
||||
print(table_specs)
|
||||
table_spec_name = table_specs[table_index].name
|
||||
|
||||
list_column_specs_response = client.list_column_specs(table_spec_name)
|
||||
column_specs = [s for s in list_column_specs_response]
|
||||
print('column_specs=')
|
||||
print(column_specs)
|
||||
|
||||
target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
|
||||
feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
|
||||
feature_column_names = [s.name for s in feature_column_specs]
|
||||
|
||||
import json
|
||||
return (target_column_spec.name, json.dumps(feature_column_names))
|
||||
|
||||
import argparse
|
||||
_missing_arg = object()
|
||||
_parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
|
||||
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
|
||||
_parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
|
||||
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
|
||||
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
|
||||
_output_files = _parsed_args.pop("_output_paths", [])
|
||||
|
||||
_outputs = automl_split_dataset_table_column_names(**_parsed_args)
|
||||
|
||||
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
|
||||
_outputs = [_outputs]
|
||||
|
||||
import os
|
||||
for idx, output_file in enumerate(_output_files):
|
||||
try:
|
||||
os.makedirs(os.path.dirname(output_file))
|
||||
except OSError:
|
||||
pass
|
||||
with open(output_file, 'w') as f:
|
||||
f.write(str(_outputs[idx]))
|
||||
args:
|
||||
- --dataset-path
|
||||
- inputValue: dataset_path
|
||||
- --target-column-name
|
||||
- inputValue: target_column_name
|
||||
- if:
|
||||
cond:
|
||||
isPresent: table_index
|
||||
then:
|
||||
- --table-index
|
||||
- inputValue: table_index
|
||||
- '----output-paths'
|
||||
- outputPath: target_column_path
|
||||
- outputPath: feature_column_paths
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
# Deprecation Warning
|
||||
|
||||
The components in this directory have been moved to [components/google-cloud/google_cloud_pipeline_components/experimental/bigquery](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/experimental/bigquery). This directory will be removed by the end of 2021.
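For readers migrating, a rough sketch of loading the replacement op from the `google-cloud-pipeline-components` package follows. The module path, op name, and parameters are assumptions based on the experimental BigQuery module linked above; verify them against the package release you install.

```python
# Assumed import path, op name, and parameters; check the installed
# google-cloud-pipeline-components version before relying on this.
from google_cloud_pipeline_components.experimental import bigquery as gcpc_bigquery

# Inside a pipeline definition, the deprecated component's role is roughly:
query_task = gcpc_bigquery.BigqueryQueryJobOp(
    project='my-project',   # hypothetical project ID
    location='US',
    query='SELECT 1',
)
```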
|
||||
|
|
@ -1,298 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Name\n",
|
||||
"\n",
|
||||
"Gather training data by querying BigQuery \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Labels\n",
|
||||
"\n",
|
||||
"GCP, BigQuery, Kubeflow, Pipeline\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Summary\n",
|
||||
"\n",
|
||||
"A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a Cloud Storage bucket.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Intended use\n",
|
||||
"\n",
|
||||
"Use this Kubeflow component to:\n",
|
||||
"* Select training data by submitting a query to BigQuery.\n",
|
||||
"* Output the training data into a Cloud Storage bucket as CSV files.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Runtime arguments:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"| Argument | Description | Optional | Data type | Accepted values | Default |\n",
|
||||
"|----------|-------------|----------|-----------|-----------------|---------|\n",
|
||||
"| query | The query used by BigQuery to fetch the results. | No | String | | |\n",
|
||||
"| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |\n",
|
||||
"| dataset_id | The ID of the persistent BigQuery dataset to store the results of the query. If the dataset does not exist, the operation will create a new one. | Yes | String | | None |\n",
|
||||
"| table_id | The ID of the BigQuery table to store the results of the query. If the table ID is absent, the operation will generate a random ID for the table. | Yes | String | | None |\n",
|
||||
"| output_gcs_path | The path to the Cloud Storage bucket to store the query output. | Yes | GCSPath | | None |\n",
|
||||
"| dataset_location | The location where the dataset is created. Defaults to US. | Yes | String | | US |\n",
|
||||
"| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSONobject which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |\n",
|
||||
"## Input data schema\n",
|
||||
"\n",
|
||||
"The input data is a BigQuery job containing a query that pulls data f rom various sources. \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Output:\n",
|
||||
"\n",
|
||||
"Name | Description | Type\n",
|
||||
":--- | :---------- | :---\n",
|
||||
"output_gcs_path | The path to the Cloud Storage bucket containing the query output in CSV format. | GCSPath\n",
|
||||
"\n",
|
||||
"## Cautions & requirements\n",
|
||||
"\n",
|
||||
"To use the component, the following requirements must be met:\n",
|
||||
"\n",
|
||||
"* The BigQuery API is enabled.\n",
|
||||
"* The component can authenticate to use GCP APIs. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.\n",
|
||||
"* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.\n",
|
||||
"* The Kubeflow user service account is a member of the `roles/storage.objectCreator `role of the Cloud Storage output bucket.\n",
|
||||
"\n",
|
||||
"## Detailed description\n",
|
||||
"This Kubeflow Pipeline component is used to:\n",
|
||||
"* Submit a query to BigQuery.\n",
|
||||
" * The query results are persisted in a dataset table in BigQuery.\n",
|
||||
" * An extract job is created in BigQuery to extract the data from the dataset table and output it to a Cloud Storage bucket as CSV files.\n",
|
||||
"\n",
|
||||
" Use the code below as an example of how to run your BigQuery job.\n",
|
||||
"\n",
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in an IPython notebook or directly in Python code.\n",
|
||||
"\n",
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture --no-stderr\n",
|
||||
"\n",
|
||||
"!pip3 install kfp --upgrade"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Load the component using KFP SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.components as comp\n",
|
||||
"\n",
|
||||
"bigquery_query_op = comp.load_component_from_url(\n",
|
||||
" 'https://raw.githubusercontent.com/kubeflow/pipelines/01a23ae8672d3b18e88adf3036071496aca3552d/components/gcp/bigquery/query/component.yaml')\n",
|
||||
"help(bigquery_query_op)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sample\n",
|
||||
"\n",
|
||||
"Note: The following sample code works in IPython notebook or directly in Python code.\n",
|
||||
"\n",
|
||||
"In this sample, we send a query to get the top questions from stackdriver public data and output the data to a Cloud Storage bucket. Here is the query:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Set sample parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Required Parameters\n",
|
||||
"PROJECT_ID = '<Please put your project ID here>'\n",
|
||||
"GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Optional Parameters\n",
|
||||
"EXPERIMENT_NAME = 'Bigquery -Query'\n",
|
||||
"OUTPUT_PATH = '{}/bigquery/query/questions.csv'.format(GCS_WORKING_DIR)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Run the component as a single pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kfp.dsl as dsl\n",
|
||||
"import json\n",
|
||||
"@dsl.pipeline(\n",
|
||||
" name='Bigquery query pipeline',\n",
|
||||
" description='Bigquery query pipeline'\n",
|
||||
")\n",
|
||||
"def pipeline(\n",
|
||||
" query=QUERY, \n",
|
||||
" project_id = PROJECT_ID, \n",
|
||||
" dataset_id='', \n",
|
||||
" table_id='', \n",
|
||||
" output_gcs_path=OUTPUT_PATH, \n",
|
||||
" dataset_location='US', \n",
|
||||
" job_config=''\n",
|
||||
"):\n",
|
||||
" bigquery_query_op(\n",
|
||||
" query=query, \n",
|
||||
" project_id=project_id, \n",
|
||||
" dataset_id=dataset_id, \n",
|
||||
" table_id=table_id, \n",
|
||||
" output_gcs_path=output_gcs_path, \n",
|
||||
" dataset_location=dataset_location, \n",
|
||||
" job_config=job_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Compile the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline_func = pipeline\n",
|
||||
"pipeline_filename = pipeline_func.__name__ + '.zip'\n",
|
||||
"import kfp.compiler as compiler\n",
|
||||
"compiler.Compiler().compile(pipeline_func, pipeline_filename)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit the pipeline for execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Specify pipeline argument values\n",
|
||||
"arguments = {}\n",
|
||||
"\n",
|
||||
"#Get or create an experiment and submit a pipeline run\n",
|
||||
"import kfp\n",
|
||||
"client = kfp.Client()\n",
|
||||
"experiment = client.create_experiment(EXPERIMENT_NAME)\n",
|
||||
"\n",
|
||||
"#Submit a pipeline run\n",
|
||||
"run_name = pipeline_func.__name__ + ' run'\n",
|
||||
"run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Inspect the output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!gsutil cat $OUTPUT_PATH"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## References\n",
|
||||
"* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)\n",
|
||||
"* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)\n",
|
||||
"* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/bigquery/query/sample.ipynb)\n",
|
||||
"* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)\n",
|
||||
"\n",
|
||||
"## License\n",
|
||||
"By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -1,187 +0,0 @@
|
|||
# Name
|
||||
|
||||
Gather data by querying BigQuery and save it in a CSV file.
|
||||
|
||||
|
||||
# Labels
|
||||
|
||||
GCP, BigQuery, Kubeflow, Pipeline
|
||||
|
||||
|
||||
# Summary
|
||||
|
||||
A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a CSV file that is available for other components to use.
|
||||
|
||||
|
||||
# Details
|
||||
|
||||
|
||||
## Intended use
|
||||
|
||||
Use this Kubeflow component to:
|
||||
* Select training data by submitting a query to BigQuery.
|
||||
* Output the training data into a CSV file.
|
||||
|
||||
|
||||
## Runtime arguments:
|
||||
|
||||
|
||||
|
||||
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|----------|-------------|----------|-----------|-----------------|---------|
|
||||
| query | The query used by BigQuery to fetch the results. | No | String | | |
|
||||
| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |
|
||||
| output_filename | The file name of the output file. | Yes | String | | bq_results.csv |
|
||||
| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSON object which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |
|
||||
## Input data schema
|
||||
|
||||
The input data is a BigQuery job containing a query that pulls data from various sources.
|
||||
|
||||
|
||||
## Output:
|
||||
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
output_path | The path to the file containing the query output in CSV format. | OutputPath
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, the following requirements must be met:
|
||||
|
||||
* The BigQuery API is enabled.
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.
|
||||
* The Kubeflow user service account is a member of the `roles/storage.objectCreator` role of the Cloud Storage output bucket.
|
||||
|
||||
## Detailed description
|
||||
This Kubeflow Pipeline component is used to:
|
||||
* Submit a query to BigQuery.
|
||||
* The query results are extracted and stored as a CSV file that is locally available to other Kubeflow components.
|
||||
|
||||
Use the code below as an example of how to run your BigQuery job.
|
||||
|
||||
## Sample
|
||||
|
||||
Note: The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
1. Install the Kubeflow Pipeline SDK:
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the KFP SDK
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
bigquery_query_op = comp.load_component_from_url(
|
||||
    'https://raw.githubusercontent.com/kubeflow/pipelines/01a23ae8672d3b18e88adf3036071496aca3552d/components/gcp/bigquery/query/to_CSV/component.yaml')
|
||||
help(bigquery_query_op)
|
||||
```
|
||||
|
||||
### Query
|
||||
|
||||
In this sample, we send a query to get the top questions from the Stack Overflow public dataset and output the data to a CSV file that other components can access. Here is the query:
|
||||
|
||||
|
||||
```python
|
||||
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
|
||||
```
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
# Required Parameters
|
||||
PROJECT_ID = '<Please put your project ID here>'
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Optional Parameters
|
||||
FILE_NAME = 'test.csv'
|
||||
```
|
||||
|
||||
#### Run the component as a single pipeline
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Bigquery query pipeline',
|
||||
description='Bigquery query pipeline'
|
||||
)
|
||||
def pipeline(
|
||||
query=QUERY,
|
||||
project_id = PROJECT_ID,
|
||||
    output_filename=FILE_NAME,
|
||||
job_config=''
|
||||
):
|
||||
bigquery_query_op(
|
||||
query=query,
|
||||
        project_id=project_id,
        output_filename=output_filename,
|
||||
job_config=job_config)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify pipeline argument values
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment and submit a pipeline run
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
#### Use the output in a pipeline
|
||||
|
||||
A small example of how to use the output from the component; here `read_csv` stands in for any component of interest that can consume a CSV file.
|
||||
|
||||
```python
|
||||
def pipeline(
|
||||
query=QUERY,
|
||||
project_id = PROJECT_ID,
|
||||
job_config=''
|
||||
):
|
||||
    bq_out = bigquery_query_op(
|
||||
query=query,
|
||||
project_id=project_id,
|
||||
output_filename=FILE_NAME,
|
||||
job_config=job_config)
|
||||
read_csv(input_path=bq_out.outputs["table"] + "/" + FILE_NAME)
|
||||
```
|
||||
|
||||
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
# Export to a file for the next processing step in the pipeline
|
||||
|
||||
# Copyright 2020 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Bigquery - Query
|
||||
description: |
|
||||
A Kubeflow Pipeline component to submit a query to Google Cloud Bigquery and
|
||||
store the results to a csv file.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: query
|
||||
description: 'The query used by Bigquery service to fetch the results.'
|
||||
type: String
|
||||
- name: project_id
|
||||
description: 'The project to execute the query job.'
|
||||
type: GCPProjectID
|
||||
- name: job_config
|
||||
description: >-
|
||||
      The full config spec for the query job. See
|
||||
[QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig)
|
||||
for details.
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: output_filename
|
||||
description: 'The output file name'
|
||||
default: 'bq_results.csv'
|
||||
type: String
|
||||
outputs:
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
- name: table
|
||||
description: 'The path to the result from BigQuery'
|
||||
type: CSV
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.bigquery, query,
|
||||
--query, {inputValue: query},
|
||||
--project_id, {inputValue: project_id},
|
||||
--output_path, {outputPath: table},
|
||||
--output_filename, {inputValue: output_filename},
|
||||
--job_config, {inputValue: job_config},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
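For context, a minimal sketch of loading the spec above and wiring its `table` output to a downstream step; the spec path, project ID, and the downstream `read_csv` component are hypothetical placeholders:

```python
from kfp import dsl
from kfp.components import load_component_from_file

# Load the component spec above from a local file (path is an assumption).
bigquery_query_op = load_component_from_file('component.yaml')

@dsl.pipeline(name='bigquery-query-to-csv')
def bq_to_csv_pipeline(
    project_id='my-project',            # hypothetical project ID
    query='SELECT 1 AS answer',
    output_filename='bq_results.csv',   # matches the component default
):
    bq_task = bigquery_query_op(
        query=query,
        project_id=project_id,
        output_filename=output_filename,
    )
    # bq_task.outputs['table'] is the output directory that holds the CSV; a
    # downstream component (for example a hypothetical read_csv op) would
    # consume it together with output_filename, as the sample above shows.
```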
|
||||
|
|
@ -1,187 +0,0 @@
|
|||
|
||||
# Name
|
||||
|
||||
Gather data by querying BigQuery and save it to GCS.
|
||||
|
||||
|
||||
# Labels
|
||||
|
||||
GCP, BigQuery, Kubeflow, Pipeline
|
||||
|
||||
|
||||
# Summary
|
||||
|
||||
A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a table on BigQuery.
|
||||
|
||||
|
||||
# Details
|
||||
|
||||
|
||||
## Intended use
|
||||
|
||||
Use this Kubeflow component to:
|
||||
* Select data by submitting a query to BigQuery.
|
||||
* Output the data into a table on BigQuery.
|
||||
|
||||
|
||||
## Runtime arguments:
|
||||
|
||||
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|----------|-------------|----------|-----------|-----------------|---------|
|
||||
| query | The query used by BigQuery to fetch the results. | No | String | | |
|
||||
| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |
|
||||
| dataset_id | The ID of the persistent BigQuery dataset to store the results of the query. If the dataset does not exist, the operation will create a new one. | Yes | String | | None |
|
||||
| table_id | The ID of the BigQuery table to store the results of the query. If the table ID is absent, the operation will generate a random ID for the table. | Yes | String | | None |
|
||||
| dataset_location | The location where the dataset is created. Defaults to US. | Yes | String | | US |
|
||||
| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSON object which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |
|
||||
|
||||
## Input data schema
|
||||
|
||||
The input data is a BigQuery job containing a query that pulls data from various sources.
|
||||
|
||||
|
||||
## Output:
|
||||
|
||||
Name | Description | Type
|
||||
:--- | :---------- | :---
|
||||
output_gcs_path | The path to the Cloud Storage bucket containing the query output in CSV format. | GCSPath
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, the following requirements must be met:
|
||||
|
||||
* The BigQuery API is enabled.
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.
|
||||
* The Kubeflow user service account is a member of the `roles/storage.objectCreator` role of the Cloud Storage output bucket.
|
||||
|
||||
## Detailed description
|
||||
This Kubeflow Pipeline component is used to:
|
||||
* Submit a query to BigQuery.
|
||||
* The query results are persisted in a dataset table in BigQuery.
|
||||
* The data is extracted locally and stored as a CSV file.
|
||||
|
||||
Use the code below as an example of how to run your BigQuery job.
|
||||
|
||||
### Sample
|
||||
|
||||
Note: The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
2. Load the component using the KFP SDK
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
bigquery_query_op = comp.load_component_from_url(
|
||||
    'https://raw.githubusercontent.com/kubeflow/pipelines/01a23ae8672d3b18e88adf3036071496aca3552d/components/gcp/bigquery/query/to_gcs/component.yaml')
|
||||
help(bigquery_query_op)
|
||||
```
|
||||
|
||||
### Query
|
||||
|
||||
In this sample, we send a query that selects a sample of questions from the Stack Overflow public dataset and outputs the data to a Cloud Storage bucket. Here is the query:
|
||||
|
||||
|
||||
```python
|
||||
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
|
||||
```
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
# Required Parameters
|
||||
PROJECT_ID = '<Please put your project ID here>'
|
||||
GCS_WORKING_DIR = 'gs://<Please put your GCS path here>' # No ending slash
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Optional Parameters
|
||||
EXPERIMENT_NAME = 'Bigquery-Query'
|
||||
OUTPUT_PATH = '{}/bigquery/query/questions.csv'.format(GCS_WORKING_DIR)
|
||||
```
|
||||
|
||||
#### Run the component as a single pipeline
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Bigquery query pipeline',
|
||||
description='Bigquery query pipeline'
|
||||
)
|
||||
def pipeline(
|
||||
query=QUERY,
|
||||
project_id = PROJECT_ID,
|
||||
dataset_id='',
|
||||
table_id='',
|
||||
output_gcs_path=OUTPUT_PATH,
|
||||
dataset_location='US',
|
||||
job_config=''
|
||||
):
|
||||
bigquery_query_op(
|
||||
query=query,
|
||||
project_id=project_id,
|
||||
dataset_id=dataset_id,
|
||||
table_id=table_id,
|
||||
output_gcs_path=output_gcs_path,
|
||||
dataset_location=dataset_location,
|
||||
job_config=job_config)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify pipeline argument values
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment and submit a pipeline run
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
#### Inspect the output
|
||||
|
||||
|
||||
```python
|
||||
!gsutil cat $OUTPUT_PATH
|
||||
```
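If you prefer to inspect the result as a dataframe, a minimal sketch is shown below; it assumes the `pandas` and `gcsfs` packages are installed in the notebook environment.

```python
import pandas as pd

# Read the exported CSV straight from Cloud Storage (gcsfs handles gs:// paths).
df = pd.read_csv(OUTPUT_PATH)
df.head()
```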
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [Sample notebook](https://github.com/kubeflow/pipelines/blob/master/components/gcp/bigquery/query/sample.ipynb)
|
||||
* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
# Export to a bucket in GCS
|
||||
|
||||
# Copyright 2020 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Bigquery - Query
|
||||
description: |
|
||||
A Kubeflow Pipeline component to submit a query to Google Cloud Bigquery
|
||||
service and dump outputs to a Google Cloud Storage blob.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: query
|
||||
description: 'The query used by Bigquery service to fetch the results.'
|
||||
type: String
|
||||
- name: project_id
|
||||
description: 'The project to execute the query job.'
|
||||
type: GCPProjectID
|
||||
- name: dataset_id
|
||||
description: 'The ID of the persistent dataset to keep the results of the query.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: table_id
|
||||
description: >-
|
||||
The ID of the table to keep the results of the query. If absent, the operation
|
||||
will generate a random id for the table.
|
||||
default: ''
|
||||
type: String
|
||||
- name: output_gcs_path
|
||||
description: 'The path to the Cloud Storage bucket to store the query output.'
|
||||
default: ''
|
||||
type: GCSPath
|
||||
- name: output_destination_format
|
||||
description: 'The name of the output destination format. Default is CSV, and you can also choose NEWLINE_DELIMITED_JSON and AVRO.'
|
||||
default: 'CSV'
|
||||
type: String
|
||||
- name: dataset_location
|
||||
description: 'The location to create the dataset. Defaults to `US`.'
|
||||
default: 'US'
|
||||
type: String
|
||||
- name: job_config
|
||||
description: >-
|
||||
The full config spec for the query job. See
|
||||
[QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig)
|
||||
for details.
|
||||
default: ''
|
||||
type: Dict
|
||||
- name: output_kfp_path
|
||||
description: 'The path to where the file should be stored.'
|
||||
default: ''
|
||||
type: String
|
||||
outputs:
|
||||
- name: output_gcs_path
|
||||
description: 'The path to the Cloud Storage bucket containing the query output in CSV format.'
|
||||
type: GCSPath
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ["python", -u, -m, "kfp_component.launcher"]
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.bigquery, query,
|
||||
--query, {inputValue: query},
|
||||
--project_id, {inputValue: project_id},
|
||||
--dataset_id, {inputValue: dataset_id},
|
||||
--table_id, {inputValue: table_id},
|
||||
--dataset_location, {inputValue: dataset_location},
|
||||
--output_gcs_path, {inputValue: output_gcs_path},
|
||||
--output_destination_format, {inputValue: output_destination_format},
|
||||
--job_config, {inputValue: job_config},
|
||||
--output_gcs_path_output_path, {outputPath: output_gcs_path},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -1,173 +0,0 @@
|
|||
|
||||
# Name
|
||||
|
||||
Gather data by querying BigQuery and save it to a table in BigQuery.
|
||||
|
||||
|
||||
# Labels
|
||||
|
||||
GCP, BigQuery, Kubeflow, Pipeline
|
||||
|
||||
|
||||
# Summary
|
||||
|
||||
A Kubeflow Pipeline component to submit a query to BigQuery and store the result in a table in BigQuery.
|
||||
|
||||
|
||||
# Details
|
||||
|
||||
|
||||
## Intended use
|
||||
|
||||
Use this Kubeflow component to:
|
||||
* Select data by submitting a query to BigQuery.
|
||||
* Output the data into a table in BigQuery.
|
||||
|
||||
|
||||
## Runtime arguments:
|
||||
|
||||
|
||||
| Argument | Description | Optional | Data type | Accepted values | Default |
|
||||
|----------|-------------|----------|-----------|-----------------|---------|
|
||||
| query | The query used by BigQuery to fetch the results. | No | String | | |
|
||||
| project_id | The project ID of the Google Cloud Platform (GCP) project to use to execute the query. | No | GCPProjectID | | |
|
||||
| dataset_id | The ID of the persistent BigQuery dataset to store the results of the query. If the dataset does not exist, the operation will create a new one. | Yes | String | | None |
|
||||
| table_id | The ID of the BigQuery table to store the results of the query. If the table ID is absent, the operation will generate a random ID for the table. | Yes | String | | None |
|
||||
| dataset_location | The location where the dataset is created. Defaults to US. | Yes | String | | US |
|
||||
| job_config | The full configuration specification for the query job. See [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) for details. | Yes | Dict | A JSON object which has the same structure as [QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig) | None |
|
||||
## Input data schema
|
||||
|
||||
The input data is a BigQuery job containing a query that pulls data from various sources.
|
||||
|
||||
|
||||
## Output:
|
||||
|
||||
|
||||
## Cautions & requirements
|
||||
|
||||
To use the component, the following requirements must be met:
|
||||
|
||||
* The BigQuery API is enabled.
|
||||
* The component can authenticate to GCP. Refer to [Authenticating Pipelines to GCP](https://www.kubeflow.org/docs/gke/authentication-pipelines/) for details.
|
||||
* The Kubeflow user service account is a member of the `roles/bigquery.admin` role of the project.
|
||||
* The Kubeflow user service account is a member of the `roles/storage.objectCreator` role of the Cloud Storage output bucket.
|
||||
|
||||
## Detailed description
|
||||
This Kubeflow Pipeline component is used to:
|
||||
* Submit a query to BigQuery.
|
||||
* The query results are persisted in a dataset table in BigQuery.
|
||||
|
||||
### Sample
|
||||
|
||||
Note: The following sample code works in an IPython notebook or directly in Python code.
|
||||
|
||||
#### Install the KFP SDK
|
||||
|
||||
|
||||
```python
|
||||
%%capture --no-stderr
|
||||
|
||||
!pip3 install kfp --upgrade
|
||||
```
|
||||
|
||||
#### Load the component using the KFP SDK
|
||||
|
||||
|
||||
```python
|
||||
import kfp.components as comp
|
||||
|
||||
bigquery_query_op = comp.load_component_from_url(
|
||||
'https://raw.githubusercontent.com/kubeflow/pipelines/1.7.0-rc.3/components/gcp/bigquery/query/to_table/component.yaml')
|
||||
help(bigquery_query_op)
|
||||
```
|
||||
|
||||
### Query
|
||||
|
||||
In this sample, we send a query that selects a sample of questions from the Stack Overflow public dataset and writes the result to a table.
|
||||
|
||||
|
||||
```python
|
||||
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
|
||||
```
|
||||
|
||||
#### Set sample parameters
|
||||
|
||||
|
||||
```python
|
||||
# Required Parameters
|
||||
PROJECT_ID = '<Please put your project ID here>'
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Optional Parameters
|
||||
EXPERIMENT_NAME = 'Bigquery-Query'
|
||||
DATASET_ID = "TEST_DATASET"
|
||||
TABLE_ID = "TEST_TABLE"
|
||||
```
|
||||
|
||||
#### Run the component as a single pipeline
|
||||
|
||||
|
||||
```python
|
||||
import kfp.dsl as dsl
|
||||
import json
|
||||
@dsl.pipeline(
|
||||
name='Bigquery query pipeline',
|
||||
description='Bigquery query pipeline'
|
||||
)
|
||||
def pipeline(
|
||||
query=QUERY,
|
||||
project_id=PROJECT_ID,
|
||||
dataset_id=DATASET_ID,
|
||||
table_id=TABLE_ID,
|
||||
dataset_location='US',
|
||||
job_config=''
|
||||
):
|
||||
bigquery_query_op(
|
||||
query=query,
|
||||
project_id=project_id,
|
||||
dataset_id=dataset_id,
|
||||
table_id=table_id,
|
||||
dataset_location=dataset_location,
|
||||
job_config=job_config)
|
||||
```
|
||||
|
||||
#### Compile the pipeline
|
||||
|
||||
|
||||
```python
|
||||
pipeline_func = pipeline
|
||||
pipeline_filename = pipeline_func.__name__ + '.zip'
|
||||
import kfp.compiler as compiler
|
||||
compiler.Compiler().compile(pipeline_func, pipeline_filename)
|
||||
```
|
||||
|
||||
#### Submit the pipeline for execution
|
||||
|
||||
|
||||
```python
|
||||
#Specify pipeline argument values
|
||||
arguments = {}
|
||||
|
||||
#Get or create an experiment and submit a pipeline run
|
||||
import kfp
|
||||
client = kfp.Client()
|
||||
experiment = client.create_experiment(EXPERIMENT_NAME)
|
||||
|
||||
#Submit a pipeline run
|
||||
run_name = pipeline_func.__name__ + ' run'
|
||||
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
|
||||
```
|
||||
|
||||
#### Inspect the output
|
||||
|
||||
Find the created table under the specified dataset ID and table ID.
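For example, a quick programmatic check might look like the sketch below; it assumes the `google-cloud-bigquery` client library is installed and reuses the sample `PROJECT_ID`, `DATASET_ID`, and `TABLE_ID` defined above.

```python
from google.cloud import bigquery

# Fetch the destination table and print its row count.
client = bigquery.Client(project=PROJECT_ID)
table = client.get_table('{}.{}.{}'.format(PROJECT_ID, DATASET_ID, TABLE_ID))
print('Table {} has {} rows'.format(table.full_table_id, table.num_rows))
```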
|
||||
|
||||
## References
|
||||
* [Component python code](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/component_sdk/python/kfp_component/google/bigquery/_query.py)
|
||||
* [Component docker file](https://github.com/kubeflow/pipelines/blob/master/components/gcp/container/Dockerfile)
|
||||
* [BigQuery query REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query)
|
||||
|
||||
## License
|
||||
By deploying or using this software you agree to comply with the [AI Hub Terms of Service](https://aihub.cloud.google.com/u/0/aihub-tos) and the [Google APIs Terms of Service](https://developers.google.com/terms/). To the extent of a direct conflict of terms, the AI Hub Terms of Service will control.
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
# Export to a new table.
|
||||
|
||||
# Copyright 2020 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: Bigquery - Query
|
||||
description: |
|
||||
A Kubeflow Pipeline component to submit a query to Google Cloud Bigquery
|
||||
service and dump outputs to a new table.
|
||||
metadata:
|
||||
labels:
|
||||
add-pod-env: 'true'
|
||||
inputs:
|
||||
- name: query
|
||||
description: 'The query used by Bigquery service to fetch the results.'
|
||||
type: String
|
||||
- name: project_id
|
||||
description: 'The project to execute the query job.'
|
||||
type: GCPProjectID
|
||||
- name: dataset_id
|
||||
description: 'The ID of the persistent dataset to keep the results of the query.'
|
||||
default: ''
|
||||
type: String
|
||||
- name: table_id
|
||||
description: >-
|
||||
The ID of the table to keep the results of the query. If absent, the operation
|
||||
will generate a random id for the table.
|
||||
default: ''
|
||||
type: String
|
||||
- name: dataset_location
|
||||
description: 'The location to create the dataset. Defaults to `US`.'
|
||||
default: 'US'
|
||||
type: String
|
||||
- name: job_config
|
||||
description: >-
|
||||
The full config spec for the query job. See
|
||||
[QueryJobConfig](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig)
|
||||
for details.
|
||||
default: ''
|
||||
type: Dict
|
||||
outputs:
|
||||
- name: MLPipeline UI metadata
|
||||
type: UI metadata
|
||||
implementation:
|
||||
container:
|
||||
image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
|
||||
command: ['python', '-u', '-m', 'kfp_component.launcher']
|
||||
args: [
|
||||
--ui_metadata_path, {outputPath: MLPipeline UI metadata},
|
||||
kfp_component.google.bigquery, query,
|
||||
--query, {inputValue: query},
|
||||
--project_id, {inputValue: project_id},
|
||||
--dataset_id, {inputValue: dataset_id},
|
||||
--table_id, {inputValue: table_id},
|
||||
--dataset_location, {inputValue: dataset_location},
|
||||
--job_config, {inputValue: job_config},
|
||||
]
|
||||
env:
|
||||
KFP_POD_NAME: "{{pod.name}}"
|
||||
|
|
@ -1,36 +0,0 @@
|
|||
# Copyright 2021 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM python:3.7
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget patch \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
# Pin pip version to work around https://github.com/apache/beam/issues/22218
|
||||
RUN python3 -m pip install pip==21.2.4
|
||||
RUN python3 -m pip install -r \
|
||||
requirements.txt --quiet --no-cache-dir \
|
||||
&& rm -f requirements.txt
|
||||
|
||||
ADD build /ml
|
||||
WORKDIR /ml
|
||||
RUN pip install .
|
||||
|
||||
# The patch sets User Agent for telemetry purpose.
|
||||
# It is based on "google-api-python-client==1.7.8", and needs to be updated when upgrading the package.
|
||||
RUN patch /usr/local/lib/python3.7/site-packages/googleapiclient/http.py < /ml/patches/http.patch
|
||||
|
||||
ENTRYPOINT ["python", "-u", "-m", "kfp_component.launcher"]
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
#!/bin/bash -e
|
||||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
mkdir -p ./build
|
||||
cp -arv ./component_sdk/python/. ./build/
|
||||
|
||||
../../build_image.sh -l ml-pipeline-gcp "$@"
|
||||
rm -rf ./build
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
approvers:
|
||||
- hongye-sun
|
||||
reviewers:
|
||||
- Ark-kun
|
||||
- gaoning777
|
||||
- hongye-sun
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from . import launcher, core, google
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._kfp_execution_context import KfpExecutionContext
|
||||
from . import _display as display
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import json
|
||||
import threading
|
||||
import logging
|
||||
|
||||
_OUTPUT_PATH = os.environ.get('KFP_UI_METADATA_PATH', '/mlpipeline-ui-metadata.json')
|
||||
_OUTPUT_FILE_LOCK = threading.Lock()
|
||||
|
||||
def display(obj):
|
||||
"""Display an object to KFP UI.
|
||||
|
||||
Args:
|
||||
obj (object): the object for which to output display metadata. It follows the same
|
||||
convention defined by the IPython display API. The currently supported representation
|
||||
functions:
|
||||
|
||||
* `_repr_markdown_`: it returns markdown content which will be converted into
|
||||
web-app metadata for the KFP UI.
|
||||
* `_repr_kfpmetadata_`: it returns a KFP metadata json object, which follows
|
||||
the convention from https://www.kubeflow.org/docs/pipelines/output-viewer/.
|
||||
|
||||
The supported built-in objects are Markdown, Tensorboard, and Link.
|
||||
"""
|
||||
obj_dir = dir(obj)
|
||||
if '_repr_markdown_' in obj_dir:
|
||||
display_markdown(obj)
|
||||
|
||||
if '_repr_kfpmetadata_' in obj_dir:
|
||||
display_kfpmetadata(obj)
|
||||
|
||||
logging.info(str(obj))
|
||||
|
||||
def display_markdown(obj):
|
||||
"""Display markdown representation to KFP UI.
|
||||
"""
|
||||
if '_repr_markdown_' not in dir(obj):
|
||||
raise ValueError('_repr_markdown_ function is not present.')
|
||||
markdown = obj._repr_markdown_()
|
||||
_output_ui_metadata({
|
||||
'type': 'markdown',
|
||||
'source': markdown,
|
||||
'storage': 'inline'
|
||||
})
|
||||
|
||||
def display_kfpmetadata(obj):
|
||||
"""Display from KFP UI metadata
|
||||
"""
|
||||
if '_repr_kfpmetadata_' not in dir(obj):
|
||||
raise ValueError('_repr_kfpmetadata_ function is not present.')
|
||||
kfp_metadata = obj._repr_kfpmetadata_()
|
||||
_output_ui_metadata(kfp_metadata)
|
||||
|
||||
def _output_ui_metadata(output):
|
||||
with _OUTPUT_FILE_LOCK:
|
||||
metadata = {}
|
||||
if os.path.isfile(_OUTPUT_PATH):
|
||||
with open(_OUTPUT_PATH, 'r') as f:
|
||||
metadata = json.load(f)
|
||||
|
||||
with open(_OUTPUT_PATH, 'w') as f:
|
||||
if 'outputs' not in metadata:
|
||||
metadata['outputs'] = []
|
||||
metadata['outputs'].append(output)
|
||||
json.dump(metadata, f)
|
||||
|
||||
class Markdown(object):
|
||||
"""Class to hold markdown raw data.
|
||||
"""
|
||||
def __init__(self, data):
|
||||
self._data = data
|
||||
|
||||
def _repr_markdown_(self):
|
||||
return self._data
|
||||
|
||||
def __repr__(self):
|
||||
return self._data
|
||||
|
||||
class Tensorboard(object):
|
||||
"""Class to hold tensorboard metadata.
|
||||
"""
|
||||
def __init__(self, job_dir):
|
||||
self._job_dir = job_dir
|
||||
|
||||
def _repr_kfpmetadata_(self):
|
||||
return {
|
||||
'type': 'tensorboard',
|
||||
'source': self._job_dir
|
||||
}
|
||||
|
||||
def __repr__(self):
|
||||
return 'Open Tensorboard at: {}'.format(self._job_dir)
|
||||
|
||||
class Link(Markdown):
|
||||
"""Class to hold an markdown hyperlink data.
|
||||
"""
|
||||
def __init__(self, href, text):
|
||||
super(Link, self).__init__(
|
||||
'## [{}]({})'.format(text, href))
|
||||
self._href = href
|
||||
self._text = text
|
||||
|
||||
def __repr__(self):
|
||||
return '{}: {}'.format(self._text, self._href)
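# --- Usage sketch (illustrative, not part of the original module) ---
# The helpers above can be combined to emit KFP UI metadata from a component;
# the bucket path below is a hypothetical placeholder:
#
#   from kfp_component.core import display
#   display.display(display.Markdown('# Training finished'))
#   display.display(display.Tensorboard('gs://my-bucket/logs'))
#   display.display(display.Link('https://console.cloud.google.com', 'Console'))
#
# Each call appends an entry to /mlpipeline-ui-metadata.json (or the path set
# in KFP_UI_METADATA_PATH), which the KFP UI then renders.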
|
||||
|
|
@ -1,158 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import signal
|
||||
import logging
|
||||
import json
|
||||
from datetime import datetime
|
||||
import os
|
||||
import hashlib
|
||||
import uuid
|
||||
import re
|
||||
|
||||
from kubernetes import client, config
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
DEFAULT_NAMESPACE = 'kubeflow'
|
||||
KFP_POD_ENV_NAME = 'KFP_POD_NAME'
|
||||
KFP_NAMESPACE_ENV_NAME = 'KFP_NAMESPACE'
|
||||
ARGO_EXECUTION_CONTROL_ANNOTATION = 'workflows.argoproj.io/execution'
|
||||
ARGO_NODE_NAME_ANNOTATION = 'workflows.argoproj.io/node-name'
|
||||
|
||||
class KfpExecutionContext:
|
||||
"""Execution context for running inside Kubeflow Pipelines.
|
||||
|
||||
The base class is aware of the KFP environment and can cascade
|
||||
pipeline cancel or deadline event to the operation through
|
||||
``on_cancel`` handler.
|
||||
|
||||
Args:
|
||||
on_cancel: optional, function to handle KFP cancel event.
|
||||
"""
|
||||
def __init__(self, on_cancel=None):
|
||||
self._load_kfp_environment()
|
||||
self._context_id = self._generate_context_id()
|
||||
logging.info('Start KFP context with ID: {}.'.format(
|
||||
self._context_id))
|
||||
self._on_cancel = on_cancel
|
||||
self._original_sigterm_handler = None
|
||||
|
||||
def __enter__(self):
|
||||
self._original_sigterm_handler = signal.getsignal(signal.SIGTERM)
|
||||
signal.signal(signal.SIGTERM, self._exit_gracefully)
|
||||
return self
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
signal.signal(signal.SIGTERM, self._original_sigterm_handler)
|
||||
|
||||
def context_id(self):
|
||||
"""Returns a stable context ID across retries. The ID is in
|
||||
32 bytes hex format.
|
||||
"""
|
||||
return self._context_id
|
||||
|
||||
def under_kfp_environment(self):
|
||||
"""Returns true if the execution is under KFP environment.
|
||||
"""
|
||||
return self._pod_name and self._k8s_client and self._argo_node_name
|
||||
|
||||
def _generate_context_id(self):
|
||||
if self.under_kfp_environment():
|
||||
stable_node_name = re.sub(r'\(\d+\)$', '', self._argo_node_name)
|
||||
return hashlib.md5(bytes(stable_node_name.encode())).hexdigest()
|
||||
else:
|
||||
return uuid.uuid1().hex
|
||||
|
||||
def _load_kfp_environment(self):
|
||||
self._pod_name = os.environ.get(KFP_POD_ENV_NAME, None)
|
||||
self._namespace = os.environ.get(KFP_NAMESPACE_ENV_NAME, DEFAULT_NAMESPACE)
|
||||
if not self._pod_name:
|
||||
self._k8s_client = None
|
||||
else:
|
||||
try:
|
||||
config.load_incluster_config()
|
||||
self._k8s_client = client.CoreV1Api()
|
||||
except Exception as e:
|
||||
logging.warning('Failed to load kubernetes client:'
|
||||
' {}.'.format(e))
|
||||
self._k8s_client = None
|
||||
if self._pod_name and self._k8s_client:
|
||||
self._argo_node_name = self._get_argo_node_name()
|
||||
|
||||
if not self.under_kfp_environment():
|
||||
logging.warning('Running without KFP context.')
|
||||
|
||||
def _get_argo_node_name(self):
|
||||
pod = self._get_pod()
|
||||
if not pod or not pod.metadata or not pod.metadata.annotations:
|
||||
return None
|
||||
|
||||
return pod.metadata.annotations.get(
|
||||
ARGO_NODE_NAME_ANNOTATION, None)
|
||||
|
||||
def _exit_gracefully(self, signum, frame):
|
||||
logging.info('SIGTERM signal received.')
|
||||
if (self._on_cancel and
|
||||
self.under_kfp_environment() and
|
||||
self._should_cancel()):
|
||||
logging.info('Cancelling...')
|
||||
self._on_cancel()
|
||||
|
||||
logging.info('Exit')
|
||||
|
||||
def _should_cancel(self):
|
||||
"""Checks argo's execution config deadline and decide whether the operation
|
||||
should be cancelled.
|
||||
|
||||
Argo cancels workflow by setting deadline to 0 and sends SIGTERM
|
||||
signal to main container with 10s graceful period.
|
||||
"""
|
||||
pod = self._get_pod()
|
||||
if not pod or not pod.metadata or not pod.metadata.annotations:
|
||||
logging.info('No pod metadata or annotations.')
|
||||
return False
|
||||
|
||||
argo_execution_config_json = pod.metadata.annotations.get(
|
||||
ARGO_EXECUTION_CONTROL_ANNOTATION, None)
|
||||
if not argo_execution_config_json:
|
||||
logging.info('No argo execution config data.')
|
||||
return False
|
||||
|
||||
try:
|
||||
argo_execution_config = json.loads(argo_execution_config_json)
|
||||
except Exception as e:
|
||||
logging.error("Error deserializing argo execution config: {}".format(e))
|
||||
return False
|
||||
|
||||
deadline_json = argo_execution_config.get('deadline', None)
|
||||
if not deadline_json:
|
||||
logging.info('No argo execution deadline config.')
|
||||
return False
|
||||
|
||||
try:
|
||||
deadline = datetime.strptime(deadline_json, '%Y-%m-%dT%H:%M:%SZ')
|
||||
except Exception as e:
|
||||
logging.error("Error converting deadline string to datetime: {}".format(e))
|
||||
return False
|
||||
|
||||
return datetime.now() > deadline
|
||||
|
||||
def _get_pod(self):
|
||||
logging.info('Fetching latest pod metadata: {}.'.format(
|
||||
self._pod_name))
|
||||
try:
|
||||
return self._k8s_client.read_namespaced_pod(
|
||||
self._pod_name, self._namespace)
|
||||
except Exception as e:
|
||||
logging.error('Failed to get pod: {}'.format(e))
|
||||
return None
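# --- Usage sketch (illustrative, not part of the original module) ---
# A long-running component can use the context to get a retry-stable ID and to
# react to pipeline cancellation:
#
#   def cancel():
#       ...cancel the remote job here...
#
#   with KfpExecutionContext(on_cancel=cancel) as ctx:
#       job_id = 'job_' + ctx.context_id()  # stable across KFP retries
#       ...submit and wait for the remote job...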
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from . import ml_engine, dataflow, dataproc
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._query import query
|
||||
|
|
@ -1,155 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
from google.cloud import bigquery
|
||||
from google.cloud.bigquery.job import ExtractJobConfig, DestinationFormat
|
||||
from google.api_core import exceptions
|
||||
|
||||
from kfp_component.core import KfpExecutionContext, display
|
||||
from .. import common as gcp_common
|
||||
|
||||
# TODO(hongyes): make this path configurable as an environment variable
|
||||
KFP_OUTPUT_PATH = '/tmp/kfp/output/'
|
||||
|
||||
|
||||
def query(query, project_id, dataset_id=None, table_id=None,
|
||||
output_gcs_path=None, dataset_location='US', job_config=None,
|
||||
output_path=None, output_filename=None, output_destination_format="CSV",
|
||||
job_object_output_path='/tmp/kfp/output/bigquery/query-job.json',
|
||||
output_gcs_path_output_path='/tmp/kfp/output/bigquery/query-output-path.txt',
|
||||
output_dataset_id_output_path='/tmp/kfp/output/bigquery/query-dataset-id.txt',
|
||||
output_table_id_output_path='/tmp/kfp/output/bigquery/query-table-id.txt',
|
||||
):
|
||||
"""Submit a query to Bigquery service and dump outputs to Bigquery table or
|
||||
a GCS blob.
|
||||
|
||||
Args:
|
||||
query (str): The query used by Bigquery service to fetch the results.
|
||||
project_id (str): The project to execute the query job.
|
||||
dataset_id (str): The ID of the persistent dataset to keep the results
|
||||
of the query. If the dataset does not exist, the operation will
|
||||
create a new one.
|
||||
table_id (str): The ID of the table to keep the results of the query. If
|
||||
absent, the operation will generate a random id for the table.
|
||||
output_gcs_path (str): The GCS blob path to dump the query results to.
|
||||
dataset_location (str): The location to create the dataset. Defaults to `US`.
|
||||
job_config (dict): The full config spec for the query job.
|
||||
output_path (str): The path to where query result will be stored
|
||||
output_filename (str): The name of the file where the results will be stored
|
||||
output_destination_format (str): The name of the output destination format.
|
||||
Default is CSV, and you can also choose NEWLINE_DELIMITED_JSON and AVRO.
|
||||
Returns:
|
||||
The API representation of the completed query job.
|
||||
"""
|
||||
client = bigquery.Client(project=project_id, location=dataset_location)
|
||||
if not job_config:
|
||||
job_config = bigquery.QueryJobConfig()
|
||||
job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
|
||||
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
|
||||
else:
|
||||
job_config = bigquery.QueryJobConfig.from_api_repr(job_config)
|
||||
job_id = None
|
||||
def cancel():
|
||||
if job_id:
|
||||
client.cancel_job(job_id)
|
||||
with KfpExecutionContext(on_cancel=cancel) as ctx:
|
||||
job_id = 'query_' + ctx.context_id()
|
||||
query_job = _get_job(client, job_id)
|
||||
table_ref = None
|
||||
if not query_job:
|
||||
dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path,
|
||||
dataset_location)
|
||||
if dataset_ref:
|
||||
if not table_id:
|
||||
table_id = job_id
|
||||
table_ref = dataset_ref.table(table_id)
|
||||
job_config.destination = table_ref
|
||||
gcp_common.dump_file(output_dataset_id_output_path, table_ref.dataset_id)
|
||||
gcp_common.dump_file(output_table_id_output_path, table_ref.table_id)
|
||||
query_job = client.query(query, job_config, job_id=job_id)
|
||||
_display_job_link(project_id, job_id)
|
||||
if output_path is not None:  # Write to a local file
|
||||
result = query_job.result()
|
||||
if not os.path.exists(output_path):
|
||||
os.makedirs(output_path)
|
||||
df = result.to_dataframe()
|
||||
df.to_csv(os.path.join(output_path, output_filename))
|
||||
else:
|
||||
query_job.result()
|
||||
if output_gcs_path:
|
||||
job_id = 'extract_' + ctx.context_id()
|
||||
extract_job = _get_job(client, job_id)
|
||||
logging.info('Extracting data from table {} to {}.'.format(str(table_ref), output_gcs_path))
|
||||
if not extract_job:
|
||||
job_config = ExtractJobConfig(destination_format=output_destination_format)
|
||||
extract_job = client.extract_table(table_ref, output_gcs_path, job_config=job_config)
|
||||
extract_job.result() # Wait for export to finish
|
||||
# TODO: Replace '-' with empty string when most users upgrade to Argo version which has the fix: https://github.com/argoproj/argo-workflows/pull/1653
|
||||
gcp_common.dump_file(output_gcs_path_output_path, output_gcs_path or '-')
|
||||
|
||||
gcp_common.dump_file(job_object_output_path, json.dumps(query_job.to_api_repr()))
|
||||
return query_job.to_api_repr()
|
||||
|
||||
def _get_job(client, job_id):
|
||||
try:
|
||||
return client.get_job(job_id)
|
||||
except exceptions.NotFound:
|
||||
return None
|
||||
|
||||
def _prepare_dataset_ref(client, dataset_id, output_gcs_path, dataset_location):
|
||||
if not output_gcs_path and not dataset_id:
|
||||
return None
|
||||
if not dataset_id:
|
||||
dataset_id = 'kfp_tmp_dataset'
|
||||
dataset_ref = client.dataset(dataset_id)
|
||||
dataset = _get_dataset(client, dataset_ref)
|
||||
if not dataset:
|
||||
logging.info('Creating dataset {}'.format(dataset_id))
|
||||
dataset = _create_dataset(client, dataset_ref, dataset_location)
|
||||
return dataset_ref
|
||||
|
||||
def _get_dataset(client, dataset_ref):
|
||||
try:
|
||||
return client.get_dataset(dataset_ref)
|
||||
except exceptions.NotFound:
|
||||
return None
|
||||
|
||||
def _create_dataset(client, dataset_ref, location):
|
||||
dataset = bigquery.Dataset(dataset_ref)
|
||||
dataset.location = location
|
||||
return client.create_dataset(dataset)
|
||||
|
||||
def _display_job_link(project_id, job_id):
|
||||
display.display(display.Link(
|
||||
href= 'https://console.cloud.google.com/bigquery?project={}'
|
||||
'&j={}&page=queryresults'.format(project_id, job_id),
|
||||
text='Query Details'
|
||||
))
|
||||
|
||||
def _dump_outputs(job, output_path, table_ref):
|
||||
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-job.json',
|
||||
json.dumps(job.to_api_repr()))
|
||||
if not output_path:
|
||||
output_path = '-' # Replace with empty string when we upgrade to Argo version which has the fix: https://github.com/argoproj/argo-workflows/pull/1653
|
||||
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-output-path.txt',
|
||||
output_path)
|
||||
(dataset_id, table_id) = (table_ref.dataset_id, table_ref.table_id) if table_ref else ('-', '-')
|
||||
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-dataset-id.txt',
|
||||
dataset_id)
|
||||
gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-table-id.txt',
|
||||
table_id)
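# --- Usage sketch (illustrative, not part of the original module) ---
# Running the component logic directly; the project and bucket below are
# hypothetical placeholders:
#
#   query('SELECT 1 AS x',
#         project_id='my-project',
#         dataset_id='tmp_dataset',
#         table_id='tmp_table',
#         output_gcs_path='gs://my-bucket/tmp/result.csv')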
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._utils import (normalize_name, dump_file,
|
||||
check_resource_changed, wait_operation_done, ClientWithRetries)
|
||||
|
|
@ -1,177 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import abc
|
||||
import logging
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, Optional, Tuple
|
||||
|
||||
def normalize_name(name,
|
||||
valid_first_char_pattern='a-zA-Z',
|
||||
valid_char_pattern='0-9a-zA-Z_',
|
||||
invalid_char_placeholder='_',
|
||||
prefix_placeholder='x_'):
|
||||
"""Normalize a name to a valid resource name.
|
||||
|
||||
Uses ``valid_first_char_pattern`` and ``valid_char_pattern`` regex pattern
|
||||
to find invalid characters from ``name`` and replaces them with
|
||||
``invalid_char_placeholder`` or prefix the name with ``prefix_placeholder``.
|
||||
|
||||
Args:
|
||||
name: The name to be normalized.
|
||||
valid_first_char_pattern: The regex pattern for the first character.
|
||||
valid_char_pattern: The regex pattern for all the characters in the name.
|
||||
invalid_char_placeholder: The placeholder to replace invalid characters.
|
||||
prefix_placeholder: The placeholder to prefix the name if the first char
|
||||
is invalid.
|
||||
|
||||
Returns:
|
||||
The normalized name. Unchanged if all characters are valid.
|
||||
"""
|
||||
if not name:
|
||||
return name
|
||||
normalized_name = re.sub('[^{}]+'.format(valid_char_pattern),
|
||||
invalid_char_placeholder, name)
|
||||
if not re.match('[{}]'.format(valid_first_char_pattern),
|
||||
normalized_name[0]):
|
||||
normalized_name = prefix_placeholder + normalized_name
|
||||
if name != normalized_name:
|
||||
logging.info('Normalize name from "{}" to "{}".'.format(
|
||||
name, normalized_name))
|
||||
return normalized_name
|
||||
|
||||
def dump_file(path, content):
|
||||
"""Dumps string into local file.
|
||||
|
||||
Args:
|
||||
path: the local path to the file.
|
||||
content: the string content to dump.
|
||||
"""
|
||||
directory = os.path.dirname(path)
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory)
|
||||
elif os.path.exists(path):
|
||||
logging.warning('The file {} will be overwritten.'.format(path))
|
||||
with open(path, 'w') as f:
|
||||
f.write(content)
|
||||
|
||||
def check_resource_changed(requested_resource,
|
||||
existing_resource, property_names):
|
||||
"""Check if a resource has been changed.
|
||||
|
||||
The function checks requested resource with existing resource
|
||||
by comparing specified property names. Check fails if any property
|
||||
name in the list is in ``requested_resource`` but its value is
|
||||
different with the value in ``existing_resource``.
|
||||
|
||||
Args:
|
||||
requested_resource: the user requested resource payload.
|
||||
existing_resource: the existing resource payload from data storage.
|
||||
property_names: a list of property names.
|
||||
|
||||
Return:
|
||||
True if ``requested_resource`` has been changed.
|
||||
"""
|
||||
for property_name in property_names:
|
||||
if not property_name in requested_resource:
|
||||
continue
|
||||
existing_value = existing_resource.get(property_name, None)
|
||||
if requested_resource[property_name] != existing_value:
|
||||
return True
|
||||
return False
|
||||
|
||||
def wait_operation_done(get_operation, wait_interval):
|
||||
"""Waits for an operation to be done.
|
||||
|
||||
Args:
|
||||
get_operation: a function that returns the latest operation payload.
|
||||
wait_interval: the wait interval between polls of the job
|
||||
status.
|
||||
|
||||
Returns:
|
||||
The completed operation.
|
||||
"""
|
||||
while True:
|
||||
operation = get_operation()
|
||||
operation_name = operation.get('name')
|
||||
done = operation.get('done', False)
|
||||
if not done:
|
||||
logging.info('Operation {} is not done. Wait for {}s.'.format(
|
||||
operation_name, wait_interval))
|
||||
time.sleep(wait_interval)
|
||||
continue
|
||||
error = operation.get('error', None)
|
||||
if error:
|
||||
raise RuntimeError('Failed to complete operation {}: {} {}'.format(
|
||||
operation_name,
|
||||
error.get('code', 'Unknown code'),
|
||||
error.get('message', 'Unknown message'),
|
||||
))
|
||||
return operation
|
||||
|
||||
|
||||
def with_retries(
|
||||
func: Callable,
|
||||
on_error: Optional[Callable[[], Any]] = None,
|
||||
errors: Tuple[Exception, ...] = Exception,
|
||||
number_of_retries: int = 5,
|
||||
delay: float = 1,
|
||||
):
|
||||
"""Retry decorator.
|
||||
|
||||
The decorator catches `errors`, calls `on_error` and retries after waiting `delay` seconds.
|
||||
|
||||
Args:
|
||||
number_of_retries (int): Total number of retries if error is raised.
|
||||
delay (float): Number of seconds to wait between consecutive retries.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(self, *args, **kwargs):
|
||||
remaining_retries = number_of_retries
|
||||
while remaining_retries:
|
||||
try:
|
||||
return func(self, *args, **kwargs)
|
||||
except errors as e:
|
||||
remaining_retries -= 1
|
||||
if not remaining_retries:
|
||||
raise
|
||||
|
||||
logging.warning(
|
||||
'Caught {}. Retrying in {} seconds...'.format(
|
||||
e.__class__.__name__, delay
|
||||
)
|
||||
)
|
||||
|
||||
time.sleep(delay)
|
||||
if on_error:
|
||||
on_error()
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
class ClientWithRetries:
|
||||
|
||||
def __init__(self):
|
||||
self._build_client()
|
||||
for name, member in self.__dict__.items():
|
||||
if callable(member) and not name.startswith("_"):
|
||||
self.__dict__[name] = with_retries(func=member, errors=(BrokenPipeError, IOError), on_error=self._build_client)
|
||||
|
||||
@abc.abstractmethod
|
||||
def _build_client(self):
|
||||
raise NotImplementedError()
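# --- Usage sketch (illustrative, not part of the original module) ---
#   normalize_name('my job! 2018')        # -> 'my_job_2018'
#   dump_file('/tmp/out/id.txt', 'abc')   # writes 'abc', creating /tmp/out if needed
#   wait_operation_done(poll_fn, wait_interval=10)  # poll_fn is a hypothetical
#                                                   # callable returning the operation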
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._launch_template import launch_template
|
||||
from ._launch_flex_template import launch_flex_template
|
||||
from ._launch_python import launch_python
|
||||
|
|
@ -1,79 +0,0 @@
|
|||
# Copyright 2021 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import googleapiclient.discovery as discovery
|
||||
from googleapiclient import errors
|
||||
from ..common import ClientWithRetries
|
||||
|
||||
|
||||
class DataflowClient(ClientWithRetries):
|
||||
def _build_client(self):
|
||||
self._df = discovery.build('dataflow', 'v1b3', cache_discovery=False)
|
||||
|
||||
def launch_template(
|
||||
self, project_id, gcs_path, location, validate_only, launch_parameters
|
||||
):
|
||||
return self._df.projects().locations().templates().launch(
|
||||
projectId=project_id,
|
||||
gcsPath=gcs_path,
|
||||
location=location,
|
||||
validateOnly=validate_only,
|
||||
body=launch_parameters
|
||||
).execute()
|
||||
|
||||
def launch_flex_template(self, project_id, request_body, location):
|
||||
return self._df.projects().locations().flexTemplates().launch(
|
||||
projectId=project_id, location=location, body=request_body
|
||||
).execute()
|
||||
|
||||
def get_job(self, project_id, job_id, location=None, view=None):
|
||||
return self._df.projects().locations().jobs().get(
|
||||
projectId=project_id,
|
||||
jobId=job_id,
|
||||
location=self._get_location(location),
|
||||
view=view
|
||||
).execute()
|
||||
|
||||
def cancel_job(self, project_id, job_id, location):
|
||||
return self._df.projects().locations().jobs().update(
|
||||
projectId=project_id,
|
||||
jobId=job_id,
|
||||
location=self._get_location(location),
|
||||
body={
|
||||
'requestedState': 'JOB_STATE_CANCELLED'
|
||||
}
|
||||
).execute()
|
||||
|
||||
def list_aggregated_jobs(
|
||||
self,
|
||||
project_id,
|
||||
filter=None,
|
||||
view=None,
|
||||
page_size=None,
|
||||
page_token=None,
|
||||
location=None
|
||||
):
|
||||
return self._df.projects().jobs().aggregated(
|
||||
projectId=project_id,
|
||||
filter=filter,
|
||||
view=view,
|
||||
pageSize=page_size,
|
||||
pageToken=page_token,
|
||||
location=location
|
||||
).execute()
|
||||
|
||||
def _get_location(self, location):
|
||||
if not location:
|
||||
location = 'us-central1'
|
||||
return location
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from kfp_component.core import display
|
||||
from .. import common as gcp_common
|
||||
from ..storage import download_blob, parse_blob_path, is_gcs_path
|
||||
|
||||
_JOB_SUCCESSFUL_STATES = ['JOB_STATE_DONE', 'JOB_STATE_UPDATED', 'JOB_STATE_DRAINED']
|
||||
_JOB_FAILED_STATES = ['JOB_STATE_STOPPED', 'JOB_STATE_FAILED', 'JOB_STATE_CANCELLED']
|
||||
_JOB_TERMINATED_STATES = _JOB_SUCCESSFUL_STATES + _JOB_FAILED_STATES
|
||||
|
||||
def wait_for_job_done(df_client, project_id, job_id, location=None, wait_interval=30):
|
||||
while True:
|
||||
job = df_client.get_job(project_id, job_id, location=location)
|
||||
state = job.get('currentState', None)
|
||||
if is_job_done(state):
|
||||
return job
|
||||
elif is_job_terminated(state):
|
||||
# Terminated with error state
|
||||
raise RuntimeError('Job {} failed with error state: {}.'.format(
|
||||
job_id,
|
||||
state
|
||||
))
|
||||
else:
|
||||
logging.info('Job {} is in pending state {}.'
|
||||
' Waiting for {} seconds for next poll.'.format(
|
||||
job_id,
|
||||
state,
|
||||
wait_interval
|
||||
))
|
||||
time.sleep(wait_interval)
|
||||
|
||||
def wait_and_dump_job(df_client, project_id, location, job,
|
||||
wait_interval,
|
||||
job_id_output_path,
|
||||
job_object_output_path,
|
||||
):
|
||||
display_job_link(project_id, job)
|
||||
job_id = job.get('id')
|
||||
job = wait_for_job_done(df_client, project_id, job_id,
|
||||
location, wait_interval)
|
||||
gcp_common.dump_file(job_object_output_path, json.dumps(job))
|
||||
gcp_common.dump_file(job_id_output_path, job.get('id'))
|
||||
return job
|
||||
|
||||
def is_job_terminated(job_state):
|
||||
return job_state in _JOB_TERMINATED_STATES
|
||||
|
||||
def is_job_done(job_state):
|
||||
return job_state in _JOB_SUCCESSFUL_STATES
|
||||
|
||||
def display_job_link(project_id, job):
|
||||
location = job.get('location')
|
||||
job_id = job.get('id')
|
||||
display.display(display.Link(
|
||||
href = 'https://console.cloud.google.com/dataflow/'
|
||||
'jobsDetail/locations/{}/jobs/{}?project={}'.format(
|
||||
location, job_id, project_id),
|
||||
text = 'Job Details'
|
||||
))
|
||||
|
||||
def stage_file(local_or_gcs_path):
|
||||
if not is_gcs_path(local_or_gcs_path):
|
||||
return local_or_gcs_path
|
||||
_, blob_path = parse_blob_path(local_or_gcs_path)
|
||||
file_name = os.path.basename(blob_path)
|
||||
local_file_path = os.path.join(tempfile.mkdtemp(), file_name)
|
||||
download_blob(local_or_gcs_path, local_file_path)
|
||||
return local_file_path
|
||||
|
||||
def get_staging_location(staging_dir, context_id):
|
||||
if not staging_dir:
|
||||
return None
|
||||
|
||||
staging_location = os.path.join(staging_dir, context_id)
|
||||
logging.info('staging_location: {}'.format(staging_location))
|
||||
return staging_location
|
||||
|
||||
def read_job_id_and_location(storage_client, staging_location):
|
||||
if staging_location:
|
||||
job_blob = _get_job_blob(storage_client, staging_location)
|
||||
if job_blob.exists():
|
||||
job_data = job_blob.download_as_bytes().decode().split(',')
|
||||
# Returns (job_id, location)
|
||||
logging.info('Found existing job {}.'.format(job_data))
|
||||
return (job_data[0], job_data[1])
|
||||
|
||||
return (None, None)
|
||||
|
||||
def upload_job_id_and_location(storage_client, staging_location, job_id, location):
|
||||
if not staging_location:
|
||||
return
|
||||
if not location:
|
||||
location = ''
|
||||
data = '{},{}'.format(job_id, location)
|
||||
job_blob = _get_job_blob(storage_client, staging_location)
|
||||
logging.info('Uploading {} to {}.'.format(data, job_blob))
|
||||
job_blob.upload_from_string(data)
|
||||
|
||||
def _get_job_blob(storage_client, staging_location):
|
||||
bucket_name, staging_blob_name = parse_blob_path(staging_location)
|
||||
job_blob_name = os.path.join(staging_blob_name, 'kfp/dataflow/launch_python/job.txt')
|
||||
bucket = storage_client.bucket(bucket_name)
|
||||
return bucket.blob(job_blob_name)
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
import logging
|
||||
|
||||
from google.cloud import storage
|
||||
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
from ._client import DataflowClient
|
||||
from ._common_ops import (
|
||||
wait_and_dump_job, get_staging_location, read_job_id_and_location,
|
||||
upload_job_id_and_location
|
||||
)
|
||||
|
||||
|
||||
def launch_flex_template(
|
||||
project_id,
|
||||
location,
|
||||
launch_parameters,
|
||||
validate_only=False,
|
||||
staging_dir=None,
|
||||
wait_interval=30,
|
||||
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
|
||||
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
|
||||
):
|
||||
"""Launches a dataflow job from a flex template.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Cloud Platform project that the job belongs to.
|
||||
location (str): The regional endpoint to which to direct the request.
|
||||
launch_parameters (dict): Parameters to provide to the template
|
||||
being launched. Schema defined in
|
||||
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.locations.flexTemplates/launch#LaunchFlexTemplateParameter.
|
||||
`jobName` will be replaced by a generated name.
|
||||
validate_only (boolean): If true, the request is validated but
|
||||
not actually executed. Defaults to false.
|
||||
staging_dir (str): Optional. The GCS directory for keeping staging files.
|
||||
A random subdirectory will be created under the directory to keep job info
|
||||
for resuming the job in case of failure.
|
||||
wait_interval (int): The wait seconds between polling.
|
||||
job_id_output_path (str): Optional. Output file to save job_id of execution
|
||||
job_object_output_path (str): Optional. Output file to save job details of execution
|
||||
|
||||
Returns:
|
||||
The completed job.
|
||||
"""
|
||||
storage_client = storage.Client()
|
||||
df_client = DataflowClient()
|
||||
job_id = None
|
||||
|
||||
def cancel():
|
||||
if job_id:
|
||||
df_client.cancel_job(project_id, job_id, location)
|
||||
|
||||
with KfpExecutionContext(on_cancel=cancel) as ctx:
|
||||
staging_location = get_staging_location(staging_dir, ctx.context_id())
|
||||
job_id, _ = read_job_id_and_location(storage_client, staging_location)
|
||||
# Continue waiting for the job if it has already been uploaded to the staging location.
|
||||
if job_id:
|
||||
job = df_client.get_job(project_id, job_id, location)
|
||||
job = wait_and_dump_job(
|
||||
df_client,
|
||||
project_id,
|
||||
location,
|
||||
job,
|
||||
wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_object_output_path=job_object_output_path,
|
||||
)
|
||||
logging.info(f'Skipped launch; found existing job: {job}')
|
||||
return job
|
||||
|
||||
if launch_parameters is None:
|
||||
launch_parameters = {}
|
||||
|
||||
request_body = {
|
||||
'launchParameter': launch_parameters,
|
||||
'validateOnly': validate_only
|
||||
}
|
||||
|
||||
request_body['launchParameter']['jobName'] = 'job-' + ctx.context_id()
|
||||
|
||||
response = df_client.launch_flex_template(
|
||||
project_id, request_body, location
|
||||
)
|
||||
|
||||
job = response.get('job', None)
|
||||
if not job:
|
||||
# Validate only mode
|
||||
return job
|
||||
|
||||
job_id = job.get('id')
|
||||
upload_job_id_and_location(
|
||||
storage_client, staging_location, job_id, location
|
||||
)
|
||||
job = wait_and_dump_job(
|
||||
df_client,
|
||||
project_id,
|
||||
location,
|
||||
job,
|
||||
wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_object_output_path=job_object_output_path,
|
||||
)
|
||||
logging.info(f'Completed job: {job}')
|
||||
return job
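# Illustrative call of launch_flex_template above (project, bucket and template names
# are hypothetical). launch_parameters follows the LaunchFlexTemplateParameter schema
# linked in the docstring; 'jobName' is overwritten with 'job-<context_id>' by the
# function itself, so it is omitted here.
def _launch_flex_template_example():
    return launch_flex_template(
        project_id='my-project',
        location='us-central1',
        launch_parameters={
            'containerSpecGcsPath': 'gs://my-bucket/templates/my-flex-template.json',
            'parameters': {'input': 'gs://my-bucket/input.csv'},
        },
        staging_dir='gs://my-bucket/staging',
    )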
|
||||
|
|
@ -1,119 +0,0 @@
|
|||
# Copyright 2021 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import subprocess
|
||||
import re
|
||||
import logging
|
||||
import os
|
||||
|
||||
from google.cloud import storage
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
from ._client import DataflowClient
|
||||
from ._common_ops import (wait_and_dump_job, stage_file, get_staging_location,
|
||||
read_job_id_and_location, upload_job_id_and_location)
|
||||
from ._process import Process
|
||||
from ..storage import parse_blob_path
|
||||
|
||||
def launch_python(python_file_path, project_id, region, staging_dir=None, requirements_file_path=None,
|
||||
args=[], wait_interval=30,
|
||||
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
|
||||
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
|
||||
):
|
||||
"""Launch a self-executing beam python file.
|
||||
|
||||
Args:
|
||||
python_file_path (str): The GCS or local path to the Python file to run.
|
||||
project_id (str): The ID of the GCP project to run the Dataflow job.
|
||||
region (str): The GCP region to run the Dataflow job.
|
||||
staging_dir (str): Optional. The GCS directory for keeping staging files.
|
||||
A random subdirectory will be created under the directory to keep job info
|
||||
for resuming the job in case of failure and it will be passed as
|
||||
`staging_location` and `temp_location` command line args of the beam code.
|
||||
requirements_file_path (str): Optional. The GCS or local path to the pip
|
||||
requirements file.
|
||||
args (list): The list of args to pass to the python file.
|
||||
wait_interval (int): The wait seconds between polling.
|
||||
Returns:
|
||||
The completed job.
|
||||
"""
|
||||
storage_client = storage.Client()
|
||||
df_client = DataflowClient()
|
||||
job_id = None
|
||||
location = None
|
||||
def cancel():
|
||||
if job_id:
|
||||
df_client.cancel_job(
|
||||
project_id,
|
||||
job_id,
|
||||
location
|
||||
)
|
||||
with KfpExecutionContext(on_cancel=cancel) as ctx:
|
||||
staging_location = get_staging_location(staging_dir, ctx.context_id())
|
||||
job_id, location = read_job_id_and_location(storage_client, staging_location)
|
||||
# Continue waiting for the job if it has already been uploaded to the staging location.
|
||||
if job_id:
|
||||
job = df_client.get_job(project_id, job_id, location)
|
||||
return wait_and_dump_job(df_client, project_id, location, job,
|
||||
wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_object_output_path=job_object_output_path,
|
||||
)
|
||||
|
||||
_install_requirements(requirements_file_path)
|
||||
python_file_path = stage_file(python_file_path)
|
||||
cmd = _prepare_cmd(project_id, region, python_file_path, args, staging_location)
|
||||
sub_process = Process(cmd)
|
||||
for line in sub_process.read_lines():
|
||||
job_id, location = _extract_job_id_and_location(line)
|
||||
if job_id:
|
||||
logging.info('Found job id {} and location {}.'.format(job_id, location))
|
||||
upload_job_id_and_location(storage_client, staging_location, job_id, location)
|
||||
break
|
||||
sub_process.wait_and_check()
|
||||
if not job_id:
|
||||
logging.warning('No dataflow job was found when '
|
||||
'running the python file.')
|
||||
return None
|
||||
job = df_client.get_job(project_id, job_id,
|
||||
location=location)
|
||||
return wait_and_dump_job(df_client, project_id, location, job,
|
||||
wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_object_output_path=job_object_output_path,
|
||||
)
|
||||
|
||||
def _prepare_cmd(project_id, region, python_file_path, args, staging_location):
|
||||
dataflow_args = [
|
||||
'--runner', 'DataflowRunner',
|
||||
'--project', project_id,
|
||||
'--region', region]
|
||||
if staging_location:
|
||||
dataflow_args += ['--staging_location', staging_location, '--temp_location', staging_location]
|
||||
return (['python', '-u', python_file_path] +
|
||||
dataflow_args + args)
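# Illustrative expansion (hypothetical paths) of the command list assembled above:
_example_cmd = _prepare_cmd(
    'my-project', 'us-central1', '/tmp/main.py',
    ['--input', 'gs://my-bucket/in.csv'], 'gs://my-bucket/staging/ctx-123')
# _example_cmd == ['python', '-u', '/tmp/main.py',
#                  '--runner', 'DataflowRunner', '--project', 'my-project',
#                  '--region', 'us-central1',
#                  '--staging_location', 'gs://my-bucket/staging/ctx-123',
#                  '--temp_location', 'gs://my-bucket/staging/ctx-123',
#                  '--input', 'gs://my-bucket/in.csv']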
|
||||
|
||||
def _extract_job_id_and_location(line):
|
||||
"""Returns (job_id, location) from matched log.
|
||||
"""
|
||||
job_id_pattern = re.compile(
|
||||
br'.*console.cloud.google.com/dataflow/jobs/(?P<location>[a-z|0-9|A-Z|\-|\_]+)/(?P<job_id>[a-z|0-9|A-Z|\-|\_]+).*')
|
||||
matched_job_id = job_id_pattern.search(line or b'')  # the pattern is bytes, so the fallback must be bytes too
|
||||
if matched_job_id:
|
||||
return (matched_job_id.group('job_id').decode(), matched_job_id.group('location').decode())
|
||||
return (None, None)
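# Example of a worker log line the pattern above matches. Process.read_lines() yields
# raw bytes, hence the bytes literal; the URL, job id and project are made up for
# illustration only.
_example_line = (b'INFO:root:To access the Dataflow monitoring console, please navigate to '
                 b'https://console.cloud.google.com/dataflow/jobs/us-central1/'
                 b'2019-01-01_00_00_00-1234567890123456789?project=my-project')
_example_result = _extract_job_id_and_location(_example_line)
# _example_result == ('2019-01-01_00_00_00-1234567890123456789', 'us-central1')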
|
||||
|
||||
def _install_requirements(requirements_file_path):
|
||||
if not requirements_file_path:
|
||||
return
|
||||
requirements_file_path = stage_file(requirements_file_path)
|
||||
subprocess.check_call(['pip', 'install', '-r', requirements_file_path])
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
from google.cloud import storage
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
from ._client import DataflowClient
|
||||
from ._common_ops import (wait_and_dump_job, get_staging_location,
|
||||
read_job_id_and_location, upload_job_id_and_location)
|
||||
|
||||
def launch_template(project_id, gcs_path, launch_parameters,
|
||||
location=None, validate_only=None, staging_dir=None,
|
||||
wait_interval=30,
|
||||
job_id_output_path='/tmp/kfp/output/dataflow/job_id.txt',
|
||||
job_object_output_path='/tmp/kfp/output/dataflow/job.json',
|
||||
):
|
||||
"""Launchs a dataflow job from template.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Cloud Platform project
|
||||
that the job belongs to.
|
||||
gcs_path (str): Required. A Cloud Storage path to the template
|
||||
from which to create the job. Must be valid Cloud
|
||||
Storage URL, beginning with 'gs://'.
|
||||
launch_parameters (dict): Parameters to provide to the template
|
||||
being launched. Schema defined in
|
||||
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters.
|
||||
`jobName` will be replaced by a generated name.
|
||||
location (str): The regional endpoint to which to direct the
|
||||
request.
|
||||
validate_only (boolean): If true, the request is validated but
|
||||
not actually executed. Defaults to false.
|
||||
staging_dir (str): Optional. The GCS directory for keeping staging files.
|
||||
A random subdirectory will be created under the directory to keep job info
|
||||
for resuming the job in case of failure.
|
||||
wait_interval (int): The wait seconds between polling.
|
||||
|
||||
Returns:
|
||||
The completed job.
|
||||
"""
|
||||
storage_client = storage.Client()
|
||||
df_client = DataflowClient()
|
||||
job_id = None
|
||||
def cancel():
|
||||
if job_id:
|
||||
df_client.cancel_job(
|
||||
project_id,
|
||||
job_id,
|
||||
location
|
||||
)
|
||||
with KfpExecutionContext(on_cancel=cancel) as ctx:
|
||||
staging_location = get_staging_location(staging_dir, ctx.context_id())
|
||||
job_id, _ = read_job_id_and_location(storage_client, staging_location)
|
||||
# Continue waiting for the job if it has already been uploaded to the staging location.
|
||||
if job_id:
|
||||
job = df_client.get_job(project_id, job_id, location)
|
||||
return wait_and_dump_job(df_client, project_id, location, job,
|
||||
wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_object_output_path=job_object_output_path,
|
||||
)
|
||||
|
||||
if not launch_parameters:
|
||||
launch_parameters = {}
|
||||
launch_parameters['jobName'] = 'job-' + ctx.context_id()
|
||||
response = df_client.launch_template(project_id, gcs_path,
|
||||
location, validate_only, launch_parameters)
|
||||
job = response.get('job', None)
|
||||
if not job:
|
||||
# Validate only mode
|
||||
return job
|
||||
job_id = job.get('id')
|
||||
upload_job_id_and_location(storage_client, staging_location, job_id, location)
|
||||
return wait_and_dump_job(df_client, project_id, location, job,
|
||||
wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_object_output_path=job_object_output_path,
|
||||
)
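# Illustrative call of launch_template above (all names hypothetical). launch_parameters
# follows the LaunchTemplateParameters schema linked in the docstring; 'jobName' is
# replaced with 'job-<context_id>' by the function, so it is not set here.
def _launch_template_example():
    return launch_template(
        project_id='my-project',
        gcs_path='gs://my-bucket/templates/wordcount',
        launch_parameters={
            'parameters': {'inputFile': 'gs://my-bucket/input.txt',
                           'output': 'gs://my-bucket/output'},
        },
        location='us-central1',
        staging_dir='gs://my-bucket/staging',
    )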
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import subprocess
|
||||
import logging
|
||||
|
||||
class Process:
|
||||
def __init__(self, cmd):
|
||||
self._cmd = cmd
|
||||
self.process = subprocess.Popen(cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
close_fds=True,
|
||||
shell=False)
|
||||
|
||||
def read_lines(self):
|
||||
# stdout will end with empty bytes when process exits.
|
||||
for line in iter(self.process.stdout.readline, b''):
|
||||
logging.info('subprocess: {}'.format(line))
|
||||
yield line
|
||||
|
||||
def wait_and_check(self):
|
||||
for _ in self.read_lines():
|
||||
pass
|
||||
self.process.stdout.close()
|
||||
return_code = self.process.wait()
|
||||
logging.info('Subprocess exit with code {}.'.format(
|
||||
return_code))
|
||||
if return_code:
|
||||
raise subprocess.CalledProcessError(return_code, self._cmd)
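# Minimal usage sketch of the Process helper above: stream the combined stdout/stderr
# of a child process line by line, then raise if it exited with a non-zero code.
def _process_example():
    proc = Process(['python', '-c', 'print("hello")'])
    for raw_line in proc.read_lines():
        pass  # each raw_line is a bytes object; read_lines() already logs it
    proc.wait_and_check()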
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._create_cluster import create_cluster
|
||||
from ._delete_cluster import delete_cluster
|
||||
from ._submit_job import submit_job
|
||||
from ._submit_pyspark_job import submit_pyspark_job
|
||||
from ._submit_spark_job import submit_spark_job
|
||||
from ._submit_sparksql_job import submit_sparksql_job
|
||||
from ._submit_hadoop_job import submit_hadoop_job
|
||||
from ._submit_hive_job import submit_hive_job
|
||||
from ._submit_pig_job import submit_pig_job
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
import googleapiclient.discovery as discovery
|
||||
from ..common import wait_operation_done, ClientWithRetries
|
||||
|
||||
|
||||
class DataprocClient(ClientWithRetries):
|
||||
""" Internal client for calling Dataproc APIs.
|
||||
"""
|
||||
|
||||
def _build_client(self):
|
||||
self._dataproc = discovery.build('dataproc', 'v1', cache_discovery=False)
|
||||
|
||||
def create_cluster(self, project_id, region, cluster, request_id):
|
||||
"""Creates a new dataproc cluster.
|
||||
"""
|
||||
return self._dataproc.projects().regions().clusters().create(
|
||||
projectId = project_id,
|
||||
region = region,
|
||||
requestId = request_id,
|
||||
body = cluster
|
||||
).execute()
|
||||
|
||||
def get_cluster(self, project_id, region, name):
|
||||
"""Gets the resource representation for a cluster in a project.
|
||||
"""
|
||||
return self._dataproc.projects().regions().clusters().get(
|
||||
projectId = project_id,
|
||||
region = region,
|
||||
clusterName = name
|
||||
).execute()
|
||||
|
||||
def delete_cluster(self, project_id, region, name, request_id):
|
||||
"""Deletes a cluster in a project.
|
||||
"""
|
||||
return self._dataproc.projects().regions().clusters().delete(
|
||||
projectId = project_id,
|
||||
region = region,
|
||||
clusterName = name,
|
||||
requestId = request_id
|
||||
).execute()
|
||||
|
||||
def submit_job(self, project_id, region, job, request_id):
|
||||
"""Submits a job to a cluster.
|
||||
"""
|
||||
return self._dataproc.projects().regions().jobs().submit(
|
||||
projectId = project_id,
|
||||
region = region,
|
||||
body = {
|
||||
'job': job,
|
||||
'requestId': request_id
|
||||
}
|
||||
).execute()
|
||||
|
||||
def get_job(self, project_id, region, job_id):
|
||||
"""Gets a job details
|
||||
"""
|
||||
return self._dataproc.projects().regions().jobs().get(
|
||||
projectId = project_id,
|
||||
region = region,
|
||||
jobId = job_id
|
||||
).execute()
|
||||
|
||||
def cancel_job(self, project_id, region, job_id):
|
||||
"""Cancels a job
|
||||
"""
|
||||
return self._dataproc.projects().regions().jobs().cancel(
|
||||
projectId = project_id,
|
||||
region = region,
|
||||
jobId = job_id
|
||||
).execute()
|
||||
|
||||
def get_operation(self, operation_name):
|
||||
"""Gets a operation by name.
|
||||
"""
|
||||
return self._dataproc.projects().regions().operations().get(
|
||||
name = operation_name
|
||||
).execute()
|
||||
|
||||
def wait_for_operation_done(self, operation_name, wait_interval):
|
||||
"""Waits for an operation to be done.
|
||||
|
||||
Args:
|
||||
operation_name: the name of the operation.
|
||||
wait_interval: the wait interval between polling the job
|
||||
status.
|
||||
|
||||
Returns:
|
||||
The completed operation.
|
||||
"""
|
||||
return wait_operation_done(
|
||||
lambda: self.get_operation(operation_name), wait_interval)
|
||||
|
||||
def cancel_operation(self, operation_name):
|
||||
"""Cancels an operation.
|
||||
|
||||
Args:
|
||||
operation_name: the name of the operation.
|
||||
"""
|
||||
if not operation_name:
|
||||
return
|
||||
|
||||
self._dataproc.projects().regions().operations().cancel(
|
||||
name = operation_name
|
||||
).execute()
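# wait_operation_done (used by wait_for_operation_done above) is imported from
# ..common and is not shown in this diff. The sketch below is an assumed, rough
# equivalent: poll the long-running Operation resource until 'done' is set, then
# surface any error. Illustrative only, not the actual shared implementation.
def _wait_operation_done_sketch(get_operation, wait_interval):
    while True:
        operation = get_operation()
        if operation.get('done'):
            break
        time.sleep(wait_interval)
    if 'error' in operation:
        raise RuntimeError(operation['error'].get('message', 'Operation failed'))
    return operation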
|
||||
|
|
@ -1,104 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
|
||||
from fire import decorators
|
||||
from ._client import DataprocClient
|
||||
from kfp_component.core import KfpExecutionContext, display
|
||||
from .. import common as gcp_common
|
||||
|
||||
@decorators.SetParseFns(image_version=str)
|
||||
def create_cluster(project_id, region, name=None, name_prefix=None,
|
||||
initialization_actions=None, config_bucket=None, image_version=None,
|
||||
cluster=None, wait_interval=30,
|
||||
cluster_name_output_path='/tmp/kfp/output/dataproc/cluster_name.txt',
|
||||
cluster_object_output_path='/tmp/kfp/output/dataproc/cluster.json',
|
||||
):
|
||||
"""Creates a DataProc cluster under a project.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
name (str): Optional. The cluster name. Cluster names within a project
|
||||
must be unique. Names of deleted clusters can be reused.
|
||||
name_prefix (str): Optional. The prefix of the cluster name.
|
||||
initialization_actions (list): Optional. List of GCS URIs of executables
|
||||
to execute on each node after config is completed. By default,
|
||||
executables are run on master and all worker nodes.
|
||||
config_bucket (str): Optional. A Google Cloud Storage bucket used to
|
||||
stage job dependencies, config files, and job driver console output.
|
||||
image_version (str): Optional. The version of software inside the cluster.
|
||||
cluster (dict): Optional. The full cluster config. See [full details](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#Cluster)
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
|
||||
Returns:
|
||||
The created cluster object.
|
||||
|
||||
Output Files:
|
||||
$KFP_OUTPUT_PATH/dataproc/cluster_name.txt: The cluster name of the
|
||||
created cluster.
|
||||
"""
|
||||
if not cluster:
|
||||
cluster = {}
|
||||
cluster['projectId'] = project_id
|
||||
if 'config' not in cluster:
|
||||
cluster['config'] = {}
|
||||
if name:
|
||||
cluster['clusterName'] = name
|
||||
if initialization_actions:
|
||||
cluster['config']['initializationActions'] = list(
|
||||
map(lambda file: {
|
||||
'executableFile': file
|
||||
}, initialization_actions)
|
||||
)
|
||||
if config_bucket:
|
||||
cluster['config']['configBucket'] = config_bucket
|
||||
if image_version:
|
||||
if 'softwareConfig' not in cluster['config']:
|
||||
cluster['config']['softwareConfig'] = {}
|
||||
cluster['config']['softwareConfig']['imageVersion'] = image_version
|
||||
|
||||
client = DataprocClient()
|
||||
operation_name = None
|
||||
with KfpExecutionContext(
|
||||
on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
|
||||
_set_cluster_name(cluster, ctx.context_id(), name_prefix)
|
||||
_dump_metadata(cluster, region)
|
||||
operation = client.create_cluster(project_id, region, cluster,
|
||||
request_id=ctx.context_id())
|
||||
operation_name = operation.get('name')
|
||||
operation = client.wait_for_operation_done(operation_name,
|
||||
wait_interval)
|
||||
cluster = operation.get('response')
|
||||
gcp_common.dump_file(cluster_object_output_path, json.dumps(cluster))
|
||||
gcp_common.dump_file(cluster_name_output_path, cluster.get('clusterName'))
|
||||
return cluster
|
||||
|
||||
def _set_cluster_name(cluster, context_id, name_prefix):
|
||||
if 'clusterName' in cluster:
|
||||
return
|
||||
if not name_prefix:
|
||||
name_prefix = 'cluster'
|
||||
cluster['clusterName'] = name_prefix + '-' + context_id
|
||||
|
||||
def _dump_metadata(cluster, region):
|
||||
display.display(display.Link(
|
||||
'https://console.cloud.google.com/dataproc/clusters/{}?project={}®ion={}'.format(
|
||||
cluster.get('clusterName'), cluster.get('projectId'), region),
|
||||
'Cluster Details'
|
||||
))
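# Illustrative result of the payload assembly above (names hypothetical): a call like
#   create_cluster('my-project', 'us-central1', name_prefix='demo', image_version='2.0')
# inside a context whose context_id() is 'ctx-123' sends a Cluster body of roughly:
_example_cluster_body = {
    'projectId': 'my-project',
    'clusterName': 'demo-ctx-123',
    'config': {'softwareConfig': {'imageVersion': '2.0'}},
}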
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from googleapiclient import errors
|
||||
from ._client import DataprocClient
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
|
||||
def delete_cluster(project_id, region, name, wait_interval=30):
|
||||
"""Deletes a DataProc cluster.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
name (str): Required. The cluster name to delete.
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
|
||||
"""
|
||||
client = DataprocClient()
|
||||
operation_name = None
|
||||
with KfpExecutionContext(
|
||||
on_cancel=lambda: client.cancel_operation(operation_name)) as ctx:
|
||||
try:
|
||||
operation = client.delete_cluster(project_id, region, name,
|
||||
request_id=ctx.context_id())
|
||||
except errors.HttpError as e:
|
||||
if e.resp.status == 404:
|
||||
logging.info('Cluster {} is not found.'.format(name))
|
||||
return
|
||||
raise e
|
||||
operation_name = operation.get('name')
|
||||
return client.wait_for_operation_done(operation_name,
|
||||
wait_interval)
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._submit_job import submit_job
|
||||
|
||||
def submit_hadoop_job(project_id, region, cluster_name, job_id_output_path,
|
||||
main_jar_file_uri=None, main_class=None, args=[], hadoop_job={}, job={},
|
||||
wait_interval=30):
|
||||
"""Submits a Cloud Dataproc job for running Apache Hadoop MapReduce jobs
|
||||
on Apache Hadoop YARN.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
main_jar_file_uri (str): The HCFS URI of the jar file containing the main
|
||||
class. Examples:
|
||||
`gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar`
|
||||
`hdfs:/tmp/test-samples/custom-wordcount.jar`
|
||||
`file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar`
|
||||
main_class (str): The name of the driver's main class. The jar file
|
||||
containing the class must be in the default CLASSPATH or specified
|
||||
in `jarFileUris`.
|
||||
args (list): Optional. The arguments to pass to the driver. Do not include
|
||||
arguments, such as -libjars or -Dfoo=bar, that can be set as job properties,
|
||||
since a collision may occur that causes an incorrect job submission.
|
||||
hadoop_job (dict): Optional. The full payload of a [hadoop job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/HadoopJob).
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if not hadoop_job:
|
||||
hadoop_job = {}
|
||||
if not job:
|
||||
job = {}
|
||||
if main_jar_file_uri:
|
||||
hadoop_job['mainJarFileUri'] = main_jar_file_uri
|
||||
if main_class:
|
||||
hadoop_job['mainClass'] = main_class
|
||||
if args:
|
||||
hadoop_job['args'] = args
|
||||
job['hadoopJob'] = hadoop_job
|
||||
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
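# Illustrative mapping performed above (names hypothetical): a call like
#   submit_hadoop_job('my-project', 'us-central1', 'my-cluster',
#                     '/tmp/kfp/output/dataproc/job_id.txt',
#                     main_class='org.apache.hadoop.examples.WordCount',
#                     args=['gs://my-bucket/in', 'gs://my-bucket/out'])
# forwards a job payload whose Hadoop section is:
_example_hadoop_section = {
    'hadoopJob': {
        'mainClass': 'org.apache.hadoop.examples.WordCount',
        'args': ['gs://my-bucket/in', 'gs://my-bucket/out'],
    }
}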
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._submit_job import submit_job
|
||||
|
||||
def submit_hive_job(project_id, region, cluster_name, job_id_output_path,
|
||||
queries=[], query_file_uri=None, script_variables={}, hive_job={},
|
||||
job={}, wait_interval=30):
|
||||
"""Submits a Cloud Dataproc job for running Apache Hive queries on YARN.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
queries (list): Required. The queries to execute. You do not need to
|
||||
terminate a query with a semicolon. Multiple queries can be specified
|
||||
in one string by separating each with a semicolon.
|
||||
query_file_uri (str): The HCFS URI of the script that contains Hive queries.
|
||||
script_variables (dict): Optional. Mapping of query variable names to
|
||||
values (equivalent to the Hive command: SET name="value";).
|
||||
hive_job (dict): Optional. The full payload of a [Hive job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/HiveJob)
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if not hive_job:
|
||||
hive_job = {}
|
||||
if not job:
|
||||
job = {}
|
||||
if queries:
|
||||
hive_job['queryList'] = { 'queries': queries }
|
||||
if query_file_uri:
|
||||
hive_job['queryFileUri'] = query_file_uri
|
||||
if script_variables:
|
||||
hive_job['scriptVariables'] = script_variables
|
||||
job['hiveJob'] = hive_job
|
||||
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import time
|
||||
|
||||
from ._client import DataprocClient
|
||||
from kfp_component.core import KfpExecutionContext, display
|
||||
from .. import common as gcp_common
|
||||
|
||||
def submit_job(project_id, region, cluster_name, job, wait_interval=30,
|
||||
job_id_output_path='/tmp/kfp/output/dataproc/job_id.txt',
|
||||
job_object_output_path='/tmp/kfp/output/dataproc/job.json',
|
||||
):
|
||||
"""Submits a Cloud Dataproc job.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
job_object_output_path (str): Path for the created job object
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if 'reference' not in job:
|
||||
job['reference'] = {}
|
||||
job['reference']['projectId'] = project_id
|
||||
if 'placement' not in job:
|
||||
job['placement'] = {}
|
||||
job['placement']['clusterName'] = cluster_name
|
||||
client = DataprocClient()
|
||||
job_id = None
|
||||
with KfpExecutionContext(
|
||||
on_cancel=lambda: client.cancel_job(
|
||||
project_id, region, job_id)) as ctx:
|
||||
submitted_job = client.submit_job(project_id, region, job,
|
||||
request_id=ctx.context_id())
|
||||
job_id = submitted_job['reference']['jobId']
|
||||
_dump_metadata(submitted_job, region)
|
||||
submitted_job = _wait_for_job_done(client, project_id, region,
|
||||
job_id, wait_interval)
|
||||
gcp_common.dump_file(job_object_output_path, json.dumps(submitted_job))
|
||||
gcp_common.dump_file(job_id_output_path, submitted_job.get('reference').get('jobId'))
|
||||
return submitted_job
|
||||
|
||||
def _wait_for_job_done(client, project_id, region, job_id, wait_interval):
|
||||
while True:
|
||||
job = client.get_job(project_id, region, job_id)
|
||||
state = job['status']['state']
|
||||
if state == 'DONE':
|
||||
return job
|
||||
if state == 'ERROR':
|
||||
raise RuntimeError(job['status']['details'])
|
||||
time.sleep(wait_interval)
|
||||
|
||||
def _dump_metadata(job, region):
|
||||
display.display(display.Link(
|
||||
'https://console.cloud.google.com/dataproc/jobs/{}?project={}®ion={}'.format(
|
||||
job.get('reference').get('jobId'),
|
||||
job.get('reference').get('projectId'),
|
||||
region),
|
||||
'Job Details'
|
||||
))
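# Continuing the hypothetical Hadoop example from _submit_hadoop_job.py above: after
# submit_job fills in the 'reference' and 'placement' sections, the submitted payload
# is roughly the following (the request id comes from the KFP execution context):
_example_submitted_job = {
    'reference': {'projectId': 'my-project'},
    'placement': {'clusterName': 'my-cluster'},
    'hadoopJob': {
        'mainClass': 'org.apache.hadoop.examples.WordCount',
        'args': ['gs://my-bucket/in', 'gs://my-bucket/out'],
    },
}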
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._submit_job import submit_job
|
||||
|
||||
def submit_pig_job(project_id, region, cluster_name, job_id_output_path,
|
||||
queries=[], query_file_uri=None, script_variables={}, pig_job={},
|
||||
job={}, wait_interval=30):
|
||||
"""Submits a Cloud Dataproc job for running Apache Pig queries on YARN.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
queries (list): Required. The queries to execute. You do not need to
|
||||
terminate a query with a semicolon. Multiple queries can be specified
|
||||
in one string by separating each with a semicolon.
|
||||
query_file_uri (str): The HCFS URI of the script that contains Pig queries.
|
||||
script_variables (dict): Optional. Mapping of query variable names to values
|
||||
(equivalent to the Pig command: name=[value]).
|
||||
pig_job (dict): Optional. The full payload of a [Pig job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/PigJob)
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if not pig_job:
|
||||
pig_job = {}
|
||||
if not job:
|
||||
job = {}
|
||||
if queries:
|
||||
pig_job['queryList'] = { 'queries': queries }
|
||||
if query_file_uri:
|
||||
pig_job['queryFileUri'] = query_file_uri
|
||||
if script_variables:
|
||||
pig_job['scriptVariables'] = script_variables
|
||||
job['pigJob'] = pig_job
|
||||
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
|
||||
|
|
@ -1,53 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._submit_job import submit_job
|
||||
|
||||
def submit_pyspark_job(project_id, region, cluster_name, job_id_output_path,
|
||||
main_python_file_uri=None, args=[], pyspark_job={}, job={},
|
||||
wait_interval=30):
|
||||
"""Submits a Cloud Dataproc job for running Apache PySpark applications on YARN.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
main_python_file_uri (str): Required. The HCFS URI of the main Python file to
|
||||
use as the driver. Must be a .py file.
|
||||
args (list): Optional. The arguments to pass to the driver. Do not include
|
||||
arguments, such as --conf, that can be set as job properties, since a
|
||||
collision may occur that causes an incorrect job submission.
|
||||
pyspark_job (dict): Optional. The full payload of a [PySparkJob](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/PySparkJob).
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if not pyspark_job:
|
||||
pyspark_job = {}
|
||||
if not job:
|
||||
job = {}
|
||||
if main_python_file_uri:
|
||||
pyspark_job['mainPythonFileUri'] = main_python_file_uri
|
||||
if args:
|
||||
pyspark_job['args'] = args
|
||||
job['pysparkJob'] = pyspark_job
|
||||
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._submit_job import submit_job
|
||||
|
||||
def submit_spark_job(project_id, region, cluster_name, job_id_output_path,
|
||||
main_jar_file_uri=None, main_class=None, args=[], spark_job={}, job={},
|
||||
wait_interval=30):
|
||||
"""Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
main_jar_file_uri (str): The HCFS URI of the jar file that contains the main class.
|
||||
main_class (str): The name of the driver's main class. The jar file that
|
||||
contains the class must be in the default CLASSPATH or specified in
|
||||
jarFileUris.
|
||||
args (list): Optional. The arguments to pass to the driver. Do not include
|
||||
arguments, such as --conf, that can be set as job properties, since a
|
||||
collision may occur that causes an incorrect job submission.
|
||||
spark_job (dict): Optional. The full payload of a [SparkJob](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if not spark_job:
|
||||
spark_job = {}
|
||||
if not job:
|
||||
job = {}
|
||||
if main_jar_file_uri:
|
||||
spark_job['mainJarFileUri'] = main_jar_file_uri
|
||||
if main_class:
|
||||
spark_job['mainClass'] = main_class
|
||||
if args:
|
||||
spark_job['args'] = args
|
||||
job['sparkJob'] = spark_job
|
||||
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._submit_job import submit_job
|
||||
|
||||
def submit_sparksql_job(project_id, region, cluster_name, job_id_output_path,
|
||||
queries=[], query_file_uri=None, script_variables={}, sparksql_job={},
|
||||
job={}, wait_interval=30):
|
||||
"""Submits a Cloud Dataproc job for running Apache Spark SQL queries.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the Google Cloud Platform project
|
||||
that the cluster belongs to.
|
||||
region (str): Required. The Cloud Dataproc region in which to handle the
|
||||
request.
|
||||
cluster_name (str): Required. The cluster to run the job.
|
||||
queries (list): Required. The queries to execute. You do not need to
|
||||
terminate a query with a semicolon. Multiple queries can be specified
|
||||
in one string by separating each with a semicolon.
|
||||
query_file_uri (str): The HCFS URI of the script that contains SQL queries.
|
||||
script_variables (dict): Optional. Mapping of query variable names to values
|
||||
(equivalent to the Spark SQL command: SET name="value";).
|
||||
sparksql_job (dict): Optional. The full payload of a [Spark SQL job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkSqlJob)
|
||||
job (dict): Optional. The full payload of a [Dataproc job](
|
||||
https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
|
||||
wait_interval (int): The wait seconds between polling the operation.
|
||||
Defaults to 30s.
|
||||
job_id_output_path (str): Path for the ID of the created job
|
||||
|
||||
Returns:
|
||||
The created job payload.
|
||||
"""
|
||||
if not sparksql_job:
|
||||
sparksql_job = {}
|
||||
if not job:
|
||||
job = {}
|
||||
if queries:
|
||||
sparksql_job['queryList'] = { 'queries': queries }
|
||||
if query_file_uri:
|
||||
sparksql_job['queryFileUri'] = query_file_uri
|
||||
if script_variables:
|
||||
sparksql_job['scriptVariables'] = script_variables
|
||||
job['sparkSqlJob'] = sparksql_job
|
||||
return submit_job(project_id, region, cluster_name, job, wait_interval, job_id_output_path=job_id_output_path)
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Module that contains a set of commands to call ML Engine APIs
|
||||
|
||||
The commands are aware of KFP execution context and can work under
|
||||
retry and cancellation context. The currently supported commands
|
||||
are: train, batch_prediction, create_model, create_version and
|
||||
delete_version.
|
||||
|
||||
TODO(hongyes): Provides full ML Engine API support.
|
||||
"""
|
||||
|
||||
from ._create_job import create_job
|
||||
from ._create_model import create_model
|
||||
from ._create_version import create_version
|
||||
from ._delete_version import delete_version
|
||||
from ._train import train
|
||||
from ._batch_predict import batch_predict
|
||||
from ._deploy import deploy
|
||||
from ._set_default_version import set_default_version
|
||||
from ._wait_job import wait_job
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
|
||||
from ._create_job import create_job
|
||||
|
||||
def batch_predict(project_id, model_path, input_paths, input_data_format,
|
||||
output_path, region, job_id_output_path, output_data_format=None, prediction_input=None, job_id_prefix=None,
|
||||
wait_interval=30):
|
||||
"""Creates a MLEngine batch prediction job.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the parent project of the job.
|
||||
model_path (str): Required. The path to the model. It can be either:
|
||||
`projects/[PROJECT_ID]/models/[MODEL_ID]` or
|
||||
`projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]`
|
||||
or a GCS path of a model file.
|
||||
input_paths (list): Required. The Google Cloud Storage location of
|
||||
the input data files. May contain wildcards.
|
||||
input_data_format (str): Required. The format of the input data files.
|
||||
See https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat.
|
||||
output_path (str): Required. The output Google Cloud Storage location.
|
||||
region (str): Required. The Google Compute Engine region to run the
|
||||
prediction job in.
|
||||
output_data_format (str): Optional. Format of the output data files,
|
||||
defaults to JSON.
|
||||
prediction_input (dict): Input parameters to create a prediction job.
|
||||
job_id_prefix (str): the prefix of the generated job id.
|
||||
wait_interval (int): optional wait interval between calls
|
||||
to get job status. Defaults to 30.
|
||||
"""
|
||||
if not prediction_input:
|
||||
prediction_input = {}
|
||||
if not model_path:
|
||||
raise ValueError('model_path must be provided.')
|
||||
if _is_model_name(model_path):
|
||||
prediction_input['modelName'] = model_path
|
||||
elif _is_model_version_name(model_path):
|
||||
prediction_input['versionName'] = model_path
|
||||
elif _is_gcs_path(model_path):
|
||||
prediction_input['uri'] = model_path
|
||||
else:
|
||||
raise ValueError('model_path value is invalid.')
|
||||
|
||||
if input_paths:
|
||||
prediction_input['inputPaths'] = input_paths
|
||||
if input_data_format:
|
||||
prediction_input['dataFormat'] = input_data_format
|
||||
if output_path:
|
||||
prediction_input['outputPath'] = output_path
|
||||
if output_data_format:
|
||||
prediction_input['outputDataFormat'] = output_data_format
|
||||
if region:
|
||||
prediction_input['region'] = region
|
||||
job = {
|
||||
'predictionInput': prediction_input
|
||||
}
|
||||
create_job(
|
||||
project_id=project_id,
|
||||
job=job,
|
||||
job_id_prefix=job_id_prefix,
|
||||
wait_interval=wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
)
|
||||
|
||||
def _is_model_name(name):
|
||||
return re.match(r'/?projects/[^/]+/models/[^/]+$', name)  # leading slash optional; the docstring paths omit it
|
||||
|
||||
def _is_model_version_name(name):
|
||||
return re.match(r'/?projects/[^/]+/models/[^/]+/versions/[^/]+$', name)  # leading slash optional, as above
|
||||
|
||||
def _is_gcs_path(name):
|
||||
return name.startswith('gs://')
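# Illustrative call of batch_predict above (all names hypothetical). The GCS model
# path takes the 'uri' branch in the dispatch above and ends up in
# predictionInput['uri'].
def _batch_predict_example():
    batch_predict(
        project_id='my-project',
        model_path='gs://my-bucket/saved_model/',
        input_paths=['gs://my-bucket/batch/input-*.json'],
        input_data_format='JSON',
        output_path='gs://my-bucket/batch/output',
        region='us-central1',
        job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
    )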
|
||||
|
|
@ -1,192 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from functools import wraps
|
||||
import logging
|
||||
import time
|
||||
|
||||
import googleapiclient.discovery as discovery
|
||||
from googleapiclient import errors
|
||||
from ..common import wait_operation_done, ClientWithRetries
|
||||
|
||||
|
||||
class MLEngineClient(ClientWithRetries):
|
||||
""" Client for calling MLEngine APIs.
|
||||
"""
|
||||
|
||||
def _build_client(self):
|
||||
self._ml_client = discovery.build('ml', 'v1', cache_discovery=False)
|
||||
|
||||
def create_job(self, project_id, job):
|
||||
"""Create a new job.
|
||||
|
||||
Args:
|
||||
project_id: the ID of the parent project.
|
||||
job: the payload of the job.
|
||||
|
||||
Returns:
|
||||
The created job.
|
||||
"""
|
||||
return self._ml_client.projects().jobs().create(
|
||||
parent = 'projects/{}'.format(project_id),
|
||||
body = job
|
||||
).execute()
|
||||
|
||||
def cancel_job(self, project_id, job_id):
|
||||
"""Cancel the specified job.
|
||||
|
||||
Args:
|
||||
project_id: the parent project ID of the job.
|
||||
job_id: the ID of the job.
|
||||
"""
|
||||
job_name = 'projects/{}/jobs/{}'.format(project_id, job_id)
|
||||
self._ml_client.projects().jobs().cancel(
|
||||
name = job_name,
|
||||
body = {
|
||||
'name': job_name
|
||||
},
|
||||
).execute()
|
||||
|
||||
def get_job(self, project_id, job_id):
|
||||
"""Gets the job by ID.
|
||||
|
||||
Args:
|
||||
project_id: the ID of the parent project.
|
||||
job_id: the ID of the job to retrieve.
|
||||
Returns:
|
||||
The retrieved job payload.
|
||||
"""
|
||||
job_name = 'projects/{}/jobs/{}'.format(project_id, job_id)
|
||||
return self._ml_client.projects().jobs().get(
|
||||
name=job_name).execute()
|
||||
|
||||
def create_model(self, project_id, model):
|
||||
"""Creates a new model.
|
||||
|
||||
Args:
|
||||
project_id: the ID of the parent project.
|
||||
model: the payload of the model.
|
||||
Returns:
|
||||
The created model.
|
||||
"""
|
||||
return self._ml_client.projects().models().create(
|
||||
parent = 'projects/{}'.format(project_id),
|
||||
body = model
|
||||
).execute()
|
||||
|
||||
def get_model(self, model_name):
|
||||
"""Gets a model.
|
||||
|
||||
Args:
|
||||
model_name: the name of the model.
|
||||
Returns:
|
||||
The retrieved model.
|
||||
"""
|
||||
return self._ml_client.projects().models().get(
|
||||
name = model_name
|
||||
).execute()
|
||||
|
||||
def create_version(self, model_name, version):
|
||||
"""Creates a new version.
|
||||
|
||||
Args:
|
||||
model_name: the name of the parent model.
|
||||
version: the payload of the version.
|
||||
|
||||
Returns:
|
||||
The created version.
|
||||
"""
|
||||
return self._ml_client.projects().models().versions().create(
|
||||
parent = model_name,
|
||||
body = version
|
||||
).execute()
|
||||
|
||||
def get_version(self, version_name):
|
||||
"""Gets a version.
|
||||
|
||||
Args:
|
||||
version_name: the name of the version.
|
||||
|
||||
Returns:
|
||||
The retrieved version. None if the version is not found.
|
||||
"""
|
||||
try:
|
||||
return self._ml_client.projects().models().versions().get(
|
||||
name = version_name
|
||||
).execute()
|
||||
except errors.HttpError as e:
|
||||
if e.resp.status == 404:
|
||||
return None
|
||||
raise
|
||||
|
||||
def delete_version(self, version_name):
|
||||
"""Deletes a version.
|
||||
|
||||
Args:
|
||||
version_name: the name of the version.
|
||||
|
||||
Returns:
|
||||
The delete operation. None if the version is not found.
|
||||
"""
|
||||
try:
|
||||
return self._ml_client.projects().models().versions().delete(
|
||||
name = version_name
|
||||
).execute()
|
||||
except errors.HttpError as e:
|
||||
if e.resp.status == 404:
|
||||
logging.info('The version has already been deleted.')
|
||||
return None
|
||||
raise
|
||||
|
||||
def set_default_version(self, version_name):
|
||||
return self._ml_client.projects().models().versions().setDefault(
|
||||
name = version_name
|
||||
).execute()
|
||||
|
||||
def get_operation(self, operation_name):
|
||||
"""Gets an operation.
|
||||
|
||||
Args:
|
||||
operation_name: the name of the operation.
|
||||
|
||||
Returns:
|
||||
The retrieved operation.
|
||||
"""
|
||||
return self._ml_client.projects().operations().get(
|
||||
name = operation_name
|
||||
).execute()
|
||||
|
||||
def wait_for_operation_done(self, operation_name, wait_interval):
|
||||
"""Waits for an operation to be done.
|
||||
|
||||
Args:
|
||||
operation_name: the name of the operation.
|
||||
wait_interval: the wait interval between polling the job
|
||||
status.
|
||||
|
||||
Returns:
|
||||
The completed operation.
|
||||
"""
|
||||
return wait_operation_done(
|
||||
lambda: self.get_operation(operation_name), wait_interval)
|
||||
|
||||
def cancel_operation(self, operation_name):
|
||||
"""Cancels an operation.
|
||||
|
||||
Args:
|
||||
operation_name: the name of the operation.
|
||||
"""
|
||||
self._ml_client.projects().operations().cancel(
|
||||
name = operation_name
|
||||
).execute()
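# Illustrative resource names consumed by the model/version methods above (values
# hypothetical): models are addressed as 'projects/<project>/models/<model>' and
# versions as 'projects/<project>/models/<model>/versions/<version>'.
def _mlengine_client_example():
    client = MLEngineClient()
    model = client.get_model('projects/my-project/models/my_model')
    version = client.get_version('projects/my-project/models/my_model/versions/v1')
    return model, version  # get_version returns None if the version does not exist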
|
||||
|
|
@ -1,161 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import time
|
||||
import json
|
||||
|
||||
from googleapiclient import errors
|
||||
|
||||
from kfp_component.core import display
|
||||
from ._client import MLEngineClient
|
||||
from .. import common as gcp_common
|
||||
|
||||
def wait_existing_version(ml_client, version_name, wait_interval):
|
||||
while True:
|
||||
existing_version = ml_client.get_version(version_name)
|
||||
if not existing_version:
|
||||
return None
|
||||
state = existing_version.get('state', None)
|
||||
if state not in ['CREATING', 'DELETING', 'UPDATING']:
|
||||
return existing_version
|
||||
logging.info('Version is in {} state. Wait for {}s'.format(
|
||||
state, wait_interval
|
||||
))
|
||||
time.sleep(wait_interval)
|
||||
|
||||
def wait_for_operation_done(ml_client, operation_name, action, wait_interval):
|
||||
"""Waits for an operation to be done.
|
||||
|
||||
Args:
|
||||
operation_name: the name of the operation.
|
||||
action: the action name of the operation.
|
||||
wait_interval: the wait interval between polling the job
|
||||
status.
|
||||
|
||||
Returns:
|
||||
The completed operation.
|
||||
|
||||
Raises:
|
||||
RuntimeError if the operation completes with an error.
|
||||
"""
|
||||
operation = None
|
||||
while True:
|
||||
operation = ml_client.get_operation(operation_name)
|
||||
done = operation.get('done', False)
|
||||
if done:
|
||||
break
|
||||
logging.info('Operation {} is not done. Wait for {}s.'.format(operation_name, wait_interval))
|
||||
time.sleep(wait_interval)
|
||||
error = operation.get('error', None)
|
||||
if error:
|
||||
raise RuntimeError('Failed to complete {} operation {}: {} {}'.format(
|
||||
action,
|
||||
operation_name,
|
||||
error.get('code', 'Unknown code'),
|
||||
error.get('message', 'Unknown message'),
|
||||
))
|
||||
return operation
|
||||
|
||||
def wait_for_job_done(ml_client, project_id, job_id, wait_interval, show_tensorboard=True,
|
||||
job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
|
||||
job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
|
||||
job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
|
||||
):
|
||||
"""Waits for a CMLE job done.
|
||||
|
||||
Args:
|
||||
ml_client: CMLE google api client
|
||||
project_id: the ID of the project which has the job
|
||||
job_id: the ID of the job to wait for
|
||||
wait_interval: the interval in seconds to wait between polls.
|
||||
show_tensorboard: True to dump Tensorboard metadata.
|
||||
|
||||
Returns:
|
||||
The completed job.
|
||||
|
||||
Raises:
|
||||
RuntimeError if the job finishes in a failed or cancelled state.
|
||||
"""
|
||||
metadata_dumped = False
|
||||
while True:
|
||||
job = ml_client.get_job(project_id, job_id)
|
||||
print(job)
|
||||
if not metadata_dumped:
|
||||
_dump_job_metadata(project_id, job_id, job, show_tensorboard=show_tensorboard)
|
||||
metadata_dumped = True
|
||||
if job.get('state', None) in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
|
||||
break
|
||||
# Move to config from flag
|
||||
logging.info('job status is {}, wait for {}s'.format(
|
||||
job.get('state', None), wait_interval))
|
||||
time.sleep(wait_interval)
|
||||
|
||||
_dump_job(
|
||||
job=job,
|
||||
job_object_output_path=job_object_output_path,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_dir_output_path=job_dir_output_path,
|
||||
)
|
||||
|
||||
if job['state'] != 'SUCCEEDED':
|
||||
raise RuntimeError('Job failed with state {}. Error: {}'.format(
|
||||
job['state'], job.get('errorMessage', '')))
|
||||
return job
|
||||
|
||||
def _dump_job_metadata(project_id, job_id, job, show_tensorboard=True):
|
||||
display.display(display.Link(
|
||||
'https://console.cloud.google.com/mlengine/jobs/{}?project={}'.format(
|
||||
job_id, project_id),
|
||||
'Job Details'
|
||||
))
|
||||
display.display(display.Link(
|
||||
'https://console.cloud.google.com/logs/viewer?project={}&resource=ml_job/job_id/{}&interval=NO_LIMIT'.format(
|
||||
project_id, job_id),
|
||||
'Logs'
|
||||
))
|
||||
if show_tensorboard and 'trainingInput' in job and 'jobDir' in job['trainingInput']:
|
||||
display.display(display.Tensorboard(
|
||||
job['trainingInput']['jobDir']))
|
||||
|
||||
def _dump_job(
|
||||
job,
|
||||
job_object_output_path,
|
||||
job_id_output_path,
|
||||
job_dir_output_path,
|
||||
):
|
||||
logging.info('Dumping job: {}'.format(job))
|
||||
gcp_common.dump_file(job_object_output_path, json.dumps(job))
|
||||
gcp_common.dump_file(job_id_output_path, job['jobId'])
|
||||
job_dir = ''
|
||||
if 'trainingInput' in job and 'jobDir' in job['trainingInput']:
|
||||
job_dir = job['trainingInput']['jobDir']
|
||||
gcp_common.dump_file(job_dir_output_path, job_dir)
|
||||
|
||||
def cancel_job(ml_client, project_id, job_id):
|
||||
"""Cancels a CMLE job.
|
||||
|
||||
Args:
|
||||
ml_client: CMLE google api client
|
||||
project_id: the ID of the project which has the job
|
||||
job_id: the ID of the job to cancel
|
||||
"""
|
||||
try:
|
||||
logging.info('Cancelling job {}.'.format(job_id))
|
||||
ml_client.cancel_job(project_id, job_id)
|
||||
logging.info('Cancelled job {}.'.format(job_id))
|
||||
except errors.HttpError as e:
|
||||
# Best effort to cancel the job
|
||||
logging.error('Failed to cancel the job: {}'.format(e))
|
||||
pass
|
||||
|
|
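As an illustration of how the helpers above were meant to be combined, here is a hedged sketch that polls a job and cancels it on interruption. The module path and the project/job IDs are assumptions, not part of this diff.

```python
# Hypothetical usage of wait_for_job_done / cancel_job (IDs are placeholders).
from kfp_component.google.ml_engine._client import MLEngineClient
from kfp_component.google.ml_engine._common_ops import wait_for_job_done, cancel_job

ml_client = MLEngineClient()
try:
    # Blocks until the job reaches SUCCEEDED, FAILED or CANCELLED; raises on failure.
    job = wait_for_job_done(ml_client, 'my-project', 'my_job_id', wait_interval=30)
except KeyboardInterrupt:
    # Best-effort cancellation, mirroring what the KFP cancel hook does.
    cancel_job(ml_client, 'my-project', 'my_job_id')
    raise
```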
@ -1,122 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
from googleapiclient import errors
|
||||
|
||||
from ._common_ops import wait_for_job_done, cancel_job
|
||||
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
from ._client import MLEngineClient
|
||||
from .. import common as gcp_common
|
||||
|
||||
def create_job(
|
||||
project_id,
|
||||
job,
|
||||
job_id_prefix=None,
|
||||
job_id=None,
|
||||
wait_interval=30,
|
||||
job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
|
||||
job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
|
||||
job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
|
||||
):
|
||||
"""Creates a MLEngine job.
|
||||
|
||||
Args:
|
||||
project_id: the ID of the parent project of the job.
|
||||
job: the payload of the job. Must have ``jobId``
|
||||
and ``trainingInput`` or ``predictionInput``.
|
||||
job_id_prefix: the prefix of the generated job id.
|
||||
job_id: the created job_id, takes precedence over generated job
|
||||
id if set.
|
||||
wait_interval: optional wait interval between calls
|
||||
to get job status. Defaults to 30.
|
||||
job_object_output_path: Path for the json payload of the create job.
|
||||
job_id_output_path: Path for the ID of the created job.
|
||||
job_dir_output_path: Path for the `jobDir` of the training job.
|
||||
"""
|
||||
return CreateJobOp(
|
||||
project_id=project_id,
|
||||
job=job,
|
||||
job_id_prefix=job_id_prefix,
|
||||
job_id=job_id,
|
||||
wait_interval=wait_interval,
|
||||
job_object_output_path=job_object_output_path,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_dir_output_path=job_dir_output_path,
|
||||
).execute_and_wait()
|
||||
|
||||
class CreateJobOp:
|
||||
def __init__(self, project_id, job, job_id_prefix=None, job_id=None,
|
||||
wait_interval=30,
|
||||
job_object_output_path=None,
|
||||
job_id_output_path=None,
|
||||
job_dir_output_path=None,
|
||||
):
|
||||
self._ml = MLEngineClient()
|
||||
self._project_id = project_id
|
||||
self._job_id_prefix = job_id_prefix
|
||||
self._job_id = job_id
|
||||
self._job = job
|
||||
self._wait_interval = wait_interval
|
||||
self._job_object_output_path = job_object_output_path
|
||||
self._job_id_output_path = job_id_output_path
|
||||
self._job_dir_output_path = job_dir_output_path
|
||||
|
||||
def execute_and_wait(self):
|
||||
with KfpExecutionContext(on_cancel=lambda: cancel_job(self._ml, self._project_id, self._job_id)) as ctx:
|
||||
self._set_job_id(ctx.context_id())
|
||||
self._create_job()
|
||||
return wait_for_job_done(self._ml, self._project_id, self._job_id, self._wait_interval,
|
||||
job_object_output_path=self._job_object_output_path,
|
||||
job_id_output_path=self._job_id_output_path,
|
||||
job_dir_output_path=self._job_dir_output_path,
|
||||
)
|
||||
|
||||
def _set_job_id(self, context_id):
|
||||
if self._job_id:
|
||||
job_id = self._job_id
|
||||
elif self._job_id_prefix:
|
||||
job_id = self._job_id_prefix + context_id[:16]
|
||||
else:
|
||||
job_id = 'job_' + context_id
|
||||
job_id = gcp_common.normalize_name(job_id)
|
||||
self._job_id = job_id
|
||||
self._job['jobId'] = job_id
|
||||
|
||||
def _create_job(self):
|
||||
try:
|
||||
self._ml.create_job(
|
||||
project_id = self._project_id,
|
||||
job = self._job
|
||||
)
|
||||
except errors.HttpError as e:
|
||||
if e.resp.status == 409:
|
||||
if not self._is_dup_job():
|
||||
logging.error('Another job has been created with the same name before: {}'.format(self._job_id))
|
||||
raise
|
||||
logging.info('The job {} has been submitted before. Continue waiting.'.format(self._job_id))
|
||||
else:
|
||||
logging.error('Failed to create job.\nPayload: {}\nError: {}'.format(self._job, e))
|
||||
raise
|
||||
|
||||
def _is_dup_job(self):
|
||||
existing_job = self._ml.get_job(self._project_id, self._job_id)
|
||||
return existing_job.get('trainingInput', None) == self._job.get('trainingInput', None) \
|
||||
and existing_job.get('predictionInput', None) == self._job.get('predictionInput', None)
|
||||
|
|
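A minimal sketch of calling the create_job helper above directly, assuming the deprecated package is installed and credentials are configured; the project, bucket and prefix values are placeholders.

```python
# Hypothetical direct call to the deprecated create_job helper.
from kfp_component.google.ml_engine._create_job import create_job

job = create_job(
    project_id='my-project',
    job={'trainingInput': {
        'pythonModule': 'trainer.task',
        'packageUris': ['gs://my-bucket/trainer-0.1.tar.gz'],
        'region': 'us-central1',
    }},
    # The final job id becomes the prefix plus the first 16 chars of the KFP context id.
    job_id_prefix='train_',
    wait_interval=30,
)
print(job['state'])  # 'SUCCEEDED' when the call returns without raising
```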
@ -1,105 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
from googleapiclient import errors
|
||||
|
||||
from kfp_component.core import KfpExecutionContext, display
|
||||
from ._client import MLEngineClient
|
||||
from .. import common as gcp_common
|
||||
|
||||
def create_model(project_id, model_id=None, model=None,
|
||||
model_name_output_path='/tmp/kfp/output/ml_engine/model_name.txt',
|
||||
model_object_output_path='/tmp/kfp/output/ml_engine/model.json',
|
||||
):
|
||||
"""Creates a MLEngine model.
|
||||
|
||||
Args:
|
||||
project_id (str): the ID of the parent project of the model.
|
||||
model_id (str): optional, the name of the model. If absent, a new name will
|
||||
be generated.
|
||||
model (dict): the payload of the model.
|
||||
"""
|
||||
return CreateModelOp(project_id, model_id, model,
|
||||
model_name_output_path=model_name_output_path,
|
||||
model_object_output_path=model_object_output_path,
|
||||
).execute()
|
||||
|
||||
class CreateModelOp:
|
||||
def __init__(self, project_id, model_id, model,
|
||||
model_name_output_path,
|
||||
model_object_output_path,
|
||||
):
|
||||
self._ml = MLEngineClient()
|
||||
self._project_id = project_id
|
||||
self._model_id = model_id
|
||||
self._model_name = None
|
||||
if model:
|
||||
self._model = model
|
||||
else:
|
||||
self._model = {}
|
||||
self._model_name_output_path = model_name_output_path
|
||||
self._model_object_output_path = model_object_output_path
|
||||
|
||||
def execute(self):
|
||||
with KfpExecutionContext() as ctx:
|
||||
self._set_model_name(ctx.context_id())
|
||||
self._dump_metadata()
|
||||
try:
|
||||
created_model = self._ml.create_model(
|
||||
project_id = self._project_id,
|
||||
model = self._model)
|
||||
except errors.HttpError as e:
|
||||
if e.resp.status == 409:
|
||||
existing_model = self._ml.get_model(self._model_name)
|
||||
if not self._is_dup_model(existing_model):
|
||||
raise
|
||||
logging.info('The same model {} has been submitted'
|
||||
' before. Continue the operation.'.format(
|
||||
self._model_name))
|
||||
created_model = existing_model
|
||||
else:
|
||||
raise
|
||||
self._dump_model(created_model)
|
||||
return created_model
|
||||
|
||||
def _set_model_name(self, context_id):
|
||||
if not self._model_id:
|
||||
self._model_id = 'model_' + context_id
|
||||
self._model['name'] = gcp_common.normalize_name(self._model_id)
|
||||
self._model_name = 'projects/{}/models/{}'.format(
|
||||
self._project_id, self._model_id)
|
||||
|
||||
|
||||
def _is_dup_model(self, existing_model):
|
||||
return not gcp_common.check_resource_changed(
|
||||
self._model,
|
||||
existing_model,
|
||||
['description', 'regions',
|
||||
'onlinePredictionLogging', 'labels'])
|
||||
|
||||
def _dump_metadata(self):
|
||||
display.display(display.Link(
|
||||
'https://console.cloud.google.com/mlengine/models/{}?project={}'.format(
|
||||
self._model_id, self._project_id),
|
||||
'Model Details'
|
||||
))
|
||||
|
||||
def _dump_model(self, model):
|
||||
logging.info('Dumping model: {}'.format(model))
|
||||
gcp_common.dump_file(self._model_object_output_path, json.dumps(model))
|
||||
gcp_common.dump_file(self._model_name_output_path, self._model_name)
|
||||
|
|
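For reference, a hedged sketch of the create_model helper above; thanks to the 409 handling it is effectively idempotent for an unchanged payload. The module path and resource names are assumptions.

```python
# Hypothetical direct call to the deprecated create_model helper.
from kfp_component.google.ml_engine._create_model import create_model

model = create_model(
    project_id='my-project',
    model_id='my_model',
    model={'regions': ['us-central1'], 'description': 'demo model'},
)
print(model.get('name'))
```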
@ -1,213 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import re
|
||||
|
||||
from googleapiclient import errors
|
||||
from fire import decorators
|
||||
|
||||
from kfp_component.core import KfpExecutionContext, display
|
||||
from ._client import MLEngineClient
|
||||
from .. import common as gcp_common
|
||||
from ._common_ops import wait_existing_version, wait_for_operation_done
|
||||
|
||||
@decorators.SetParseFns(python_version=str, runtime_version=str)
|
||||
def create_version(model_name, deployment_uri=None, version_id=None,
|
||||
runtime_version=None, python_version=None, version=None,
|
||||
replace_existing=False, wait_interval=30,
|
||||
version_name_output_path='/tmp/kfp/output/ml_engine/version_name.txt',
|
||||
version_object_output_path='/tmp/kfp/output/ml_engine/version.json',
|
||||
):
|
||||
"""Creates a MLEngine version and wait for the operation to be done.
|
||||
|
||||
Args:
|
||||
model_name (str): required, the name of the parent model.
|
||||
deployment_uri (str): optional, the Google Cloud Storage location of
|
||||
the trained model used to create the version.
|
||||
version_id (str): optional, the user provided short name of
|
||||
the version. If it is not provided, the operation uses a random name.
|
||||
runtime_version (str): optional, the Cloud ML Engine runtime version
|
||||
to use for this deployment. If not set, Cloud ML Engine uses
|
||||
the default stable version, 1.0.
|
||||
python_version (str): optional, the version of Python used in prediction.
|
||||
If not set, the default version is '2.7'. Python '3.5' is available
|
||||
when runtimeVersion is set to '1.4' and above. Python '2.7' works
|
||||
with all supported runtime versions.
|
||||
version (dict): optional, the payload of the new version.
|
||||
replace_existing (boolean): boolean flag indicates whether to replace
|
||||
existing version in case of conflict.
|
||||
wait_interval (int): the interval to wait for a long running operation.
|
||||
"""
|
||||
if not version:
|
||||
version = {}
|
||||
if deployment_uri:
|
||||
version['deploymentUri'] = deployment_uri
|
||||
if version_id:
|
||||
version['name'] = version_id
|
||||
if runtime_version:
|
||||
version['runtimeVersion'] = runtime_version
|
||||
if python_version:
|
||||
version['pythonVersion'] = python_version
|
||||
|
||||
return CreateVersionOp(model_name, version,
|
||||
replace_existing, wait_interval,
|
||||
version_name_output_path=version_name_output_path,
|
||||
version_object_output_path=version_object_output_path,
|
||||
).execute_and_wait()
|
||||
|
||||
class CreateVersionOp:
|
||||
def __init__(self, model_name, version,
|
||||
replace_existing, wait_interval,
|
||||
version_name_output_path,
|
||||
version_object_output_path,
|
||||
):
|
||||
self._ml = MLEngineClient()
|
||||
self._model_name = model_name
|
||||
self._project_id, self._model_id = self._parse_model_name(model_name)
|
||||
# The name of the version resource, which is in the format
|
||||
# of projects/*/models/*/versions/*
|
||||
self._version_name = None
|
||||
# The user-provided short name of the version.
|
||||
self._version_id = None
|
||||
# The full payload of the version resource.
|
||||
self._version = version
|
||||
self._replace_existing = replace_existing
|
||||
self._wait_interval = wait_interval
|
||||
self._create_operation_name = None
|
||||
self._delete_operation_name = None
|
||||
self._version_name_output_path = version_name_output_path
|
||||
self._version_object_output_path = version_object_output_path
|
||||
|
||||
def execute_and_wait(self):
|
||||
with KfpExecutionContext(on_cancel=self._cancel) as ctx:
|
||||
self._set_version_name(ctx.context_id())
|
||||
self._dump_metadata()
|
||||
existing_version = wait_existing_version(self._ml,
|
||||
self._version_name,
|
||||
self._wait_interval)
|
||||
if existing_version and self._is_dup_version(existing_version):
|
||||
return self._handle_completed_version(existing_version)
|
||||
|
||||
if existing_version and self._replace_existing:
|
||||
logging.info('Deleting existing version...')
|
||||
self._delete_version_and_wait()
|
||||
elif existing_version:
|
||||
raise RuntimeError(
|
||||
'Existing version conflicts with the name of the new version.')
|
||||
|
||||
created_version = self._create_version_and_wait()
|
||||
return self._handle_completed_version(created_version)
|
||||
|
||||
def _parse_model_name(self, model_name):
|
||||
match = re.search(r'^projects/([^/]+)/models/([^/]+)$', model_name)
|
||||
if not match:
|
||||
raise ValueError('model name "{}" is not in the expected format "projects/*/models/*".'.format(model_name))
|
||||
return (match.group(1), match.group(2))
|
||||
|
||||
def _set_version_name(self, context_id):
|
||||
name = self._version.get('name', None)
|
||||
if not name:
|
||||
name = 'ver_' + context_id
|
||||
name = gcp_common.normalize_name(name)
|
||||
self._version_id = name
|
||||
self._version['name'] = name
|
||||
self._version_name = '{}/versions/{}'.format(self._model_name, name)
|
||||
|
||||
def _cancel(self):
|
||||
if self._delete_operation_name:
|
||||
self._ml.cancel_operation(self._delete_operation_name)
|
||||
|
||||
if self._create_operation_name:
|
||||
self._ml.cancel_operation(self._create_operation_name)
|
||||
|
||||
def _create_version_and_wait(self):
|
||||
operation = self._ml.create_version(self._model_name, self._version)
|
||||
# Cache operation name for cancellation.
|
||||
self._create_operation_name = operation.get('name')
|
||||
try:
|
||||
operation = wait_for_operation_done(
|
||||
self._ml,
|
||||
self._create_operation_name,
|
||||
'create version',
|
||||
self._wait_interval)
|
||||
finally:
|
||||
self._create_operation_name = None
|
||||
return operation.get('response', None)
|
||||
|
||||
def _delete_version_and_wait(self):
|
||||
operation = self._ml.delete_version(self._version_name)
|
||||
# Cache operation name for cancellation.
|
||||
self._delete_operation_name = operation.get('name')
|
||||
try:
|
||||
wait_for_operation_done(
|
||||
self._ml,
|
||||
self._delete_operation_name,
|
||||
'delete version',
|
||||
self._wait_interval)
|
||||
finally:
|
||||
self._delete_operation_name = None
|
||||
|
||||
def _handle_completed_version(self, version):
|
||||
state = version.get('state', None)
|
||||
if state == 'FAILED':
|
||||
error_message = version.get('errorMessage', 'Unknown failure')
|
||||
raise RuntimeError('Version is in failed state: {}'.format(
|
||||
error_message))
|
||||
# Workaround issue that CMLE doesn't return the full version name.
|
||||
version['name'] = self._version_name
|
||||
self._dump_version(version)
|
||||
return version
|
||||
|
||||
def _dump_metadata(self):
|
||||
display.display(display.Link(
|
||||
'https://console.cloud.google.com/mlengine/models/{}/versions/{}?project={}'.format(
|
||||
self._model_id, self._version_id, self._project_id),
|
||||
'Version Details'
|
||||
))
|
||||
display.display(display.Markdown('''
|
||||
## Online Prediction
|
||||
|
||||
### REST endpoint
|
||||
The REST endpoint for online prediction is as follows:
|
||||
```
|
||||
POST https://ml.googleapis.com/v1/{}:predict
|
||||
```
|
||||
Try the REST endpoint in [Google OAuth 2.0 Playground](https://developers.google.com/oauthplayground/#step3\
|
||||
&apisSelect=https://www.googleapis.com/auth/cloud-platform&postData={{"instances":[]}}\
|
||||
&url=https://ml.googleapis.com/v1/{}:predict&content_type=application/json&http_method=POST).
|
||||
|
||||
### GCloud command
|
||||
```bash
|
||||
gcloud ai-platform predict --model {} \
|
||||
--version {} \
|
||||
--json-instances instances.json
|
||||
```
|
||||
'''.format(self._version_name, self._version_name, self._model_id, self._version_id)))
|
||||
|
||||
def _dump_version(self, version):
|
||||
logging.info('Dumping version: {}'.format(version))
|
||||
gcp_common.dump_file(self._version_object_output_path, json.dumps(version))
|
||||
gcp_common.dump_file(self._version_name_output_path, version['name'])
|
||||
|
||||
def _is_dup_version(self, existing_version):
|
||||
return not gcp_common.check_resource_changed(
|
||||
self._version,
|
||||
existing_version,
|
||||
['description', 'deploymentUri',
|
||||
'runtimeVersion', 'machineType', 'labels',
|
||||
'framework', 'pythonVersion', 'autoScaling',
|
||||
'manualScaling'])
|
||||
|
|
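A minimal sketch of the create_version helper above, with the deployment URI passed positionally; the model name, bucket path and runtime version are placeholders.

```python
# Hypothetical direct call to the deprecated create_version helper.
from kfp_component.google.ml_engine._create_version import create_version

version = create_version(
    'projects/my-project/models/my_model',   # parent model name
    'gs://my-bucket/export/1602000000/',     # SavedModel location
    version_id='v1',
    runtime_version='1.15',
    replace_existing=True,   # delete and recreate on a name conflict
    wait_interval=30,
)
```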
@ -1,66 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
from googleapiclient import errors
|
||||
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
from ._client import MLEngineClient
|
||||
from .. import common as gcp_common
|
||||
from ._common_ops import wait_existing_version, wait_for_operation_done
|
||||
|
||||
def delete_version(version_name, wait_interval=30):
|
||||
"""Deletes a MLEngine version and wait.
|
||||
|
||||
Args:
|
||||
version_name (str): required, the name of the version.
|
||||
wait_interval (int): the interval to wait for a long running operation.
|
||||
"""
|
||||
DeleteVersionOp(version_name, wait_interval).execute_and_wait()
|
||||
|
||||
class DeleteVersionOp:
|
||||
def __init__(self, version_name, wait_interval):
|
||||
self._ml = MLEngineClient()
|
||||
self._version_name = version_name
|
||||
self._wait_interval = wait_interval
|
||||
self._delete_operation_name = None
|
||||
|
||||
def execute_and_wait(self):
|
||||
with KfpExecutionContext(on_cancel=self._cancel):
|
||||
existing_version = wait_existing_version(self._ml,
|
||||
self._version_name,
|
||||
self._wait_interval)
|
||||
if not existing_version:
|
||||
logging.info('The version has already been deleted.')
|
||||
return None
|
||||
|
||||
logging.info('Deleting existing version...')
|
||||
operation = self._ml.delete_version(self._version_name)
|
||||
# Cache operation name for cancellation.
|
||||
self._delete_operation_name = operation.get('name')
|
||||
try:
|
||||
wait_for_operation_done(
|
||||
self._ml,
|
||||
self._delete_operation_name,
|
||||
'delete version',
|
||||
self._wait_interval)
|
||||
finally:
|
||||
self._delete_operation_name = None
|
||||
return None
|
||||
|
||||
def _cancel(self):
|
||||
if self._delete_operation_name:
|
||||
self._ml.cancel_operation(self._delete_operation_name)
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import logging
|
||||
import os
|
||||
|
||||
from fire import decorators
|
||||
|
||||
from google.cloud import storage
|
||||
from .. import common as gcp_common
|
||||
from ..storage import parse_blob_path
|
||||
from ._create_model import create_model
|
||||
from ._create_version import create_version
|
||||
from ._set_default_version import set_default_version
|
||||
|
||||
KNOWN_MODEL_NAMES = ['saved_model.pb', 'saved_model.pbtext', 'model.pkl']
|
||||
|
||||
@decorators.SetParseFns(python_version=str, runtime_version=str)
|
||||
def deploy(model_uri, project_id,
|
||||
model_uri_output_path, model_name_output_path, version_name_output_path,
|
||||
model_id=None, version_id=None,
|
||||
runtime_version=None, python_version=None, model=None, version=None,
|
||||
replace_existing_version=False, set_default=False, wait_interval=30):
|
||||
"""Deploy a model to MLEngine from GCS URI
|
||||
|
||||
Args:
|
||||
model_uri (str): Required, the GCS URI which contains a model file.
|
||||
If no model file is found, the same path will be treated as an export
|
||||
base directory of a TF Estimator. The last time-stamped sub-directory
|
||||
will be chosen as model URI.
|
||||
project_id (str): required, the ID of the parent project.
|
||||
model_id (str): optional, the user provided name of the model.
|
||||
version_id (str): optional, the user provided name of the version.
|
||||
If it is not provided, the operation uses a random name.
|
||||
runtime_version (str): optional, the Cloud ML Engine runtime version
|
||||
to use for this deployment. If not set, Cloud ML Engine uses
|
||||
the default stable version, 1.0.
|
||||
python_version (str): optional, the version of Python used in prediction.
|
||||
If not set, the default version is '2.7'. Python '3.5' is available
|
||||
when runtimeVersion is set to '1.4' and above. Python '2.7' works
|
||||
with all supported runtime versions.
|
||||
model (dict): Optional, the JSON payload of the new model. The schema follows
|
||||
[REST Model resource](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models).
|
||||
version (dict): Optional, the JSON payload of the new version. The schema follows
|
||||
the [REST Version resource](https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions)
|
||||
replace_existing_version (boolean): boolean flag indicates whether to replace
|
||||
existing version in case of conflict.
|
||||
set_default (boolean): boolean flag indicates whether to set the new
|
||||
version as default version in the model.
|
||||
wait_interval (int): the interval to wait for a long running operation.
|
||||
"""
|
||||
storage_client = storage.Client()
|
||||
model_uri = _search_dir_with_model(storage_client, model_uri)
|
||||
gcp_common.dump_file(model_uri_output_path, model_uri)
|
||||
model = create_model(project_id, model_id, model,
|
||||
model_name_output_path=model_name_output_path,
|
||||
)
|
||||
model_name = model.get('name')
|
||||
version = create_version(model_name, model_uri, version_id,
|
||||
runtime_version, python_version, version, replace_existing_version,
|
||||
wait_interval, version_name_output_path=version_name_output_path,
|
||||
)
|
||||
if set_default:
|
||||
version_name = version.get('name')
|
||||
version = set_default_version(version_name)
|
||||
return version
|
||||
|
||||
def _search_dir_with_model(storage_client, model_root_uri):
|
||||
bucket_name, blob_name = parse_blob_path(model_root_uri)
|
||||
bucket = storage_client.bucket(bucket_name)
|
||||
if not blob_name.endswith('/'):
|
||||
blob_name += '/'
|
||||
it = bucket.list_blobs(prefix=blob_name, delimiter='/')
|
||||
for resource in it:
|
||||
basename = os.path.basename(resource.name)
|
||||
if basename in KNOWN_MODEL_NAMES:
|
||||
logging.info('Found model file under {}.'.format(model_root_uri))
|
||||
return model_root_uri
|
||||
model_dir = _search_tf_export_dir_base(storage_client, bucket, blob_name)
|
||||
if not model_dir:
|
||||
model_dir = model_root_uri
|
||||
return model_dir
|
||||
|
||||
def _search_tf_export_dir_base(storage_client, bucket, export_dir_base):
|
||||
logging.info('Searching model under export base dir: {}.'.format(export_dir_base))
|
||||
it = bucket.list_blobs(prefix=export_dir_base, delimiter='/')
|
||||
for _ in it.pages:
|
||||
# Iterate to the last page to get the full prefixes.
|
||||
pass
|
||||
timestamped_dirs = []
|
||||
for sub_dir in it.prefixes:
|
||||
dir_name = os.path.basename(os.path.normpath(sub_dir))
|
||||
if dir_name.isdigit():
|
||||
timestamped_dirs.append(sub_dir)
|
||||
|
||||
if not timestamped_dirs:
|
||||
logging.info('No timestamped sub-directory is found under {}'.format(export_dir_base))
|
||||
return None
|
||||
|
||||
last_timestamped_dir = max(timestamped_dirs)
|
||||
logging.info('Found timestamped sub-directory: {}.'.format(last_timestamped_dir))
|
||||
return 'gs://{}/{}'.format(bucket.name, last_timestamped_dir)
|
||||
|
|
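A hedged sketch of the deploy helper above, which chains create_model, create_version and, optionally, set_default_version; the module path, project, bucket and output paths below are placeholders.

```python
# Hypothetical direct call to the deprecated deploy helper.
from kfp_component.google.ml_engine._deploy import deploy

version = deploy(
    model_uri='gs://my-bucket/export/',   # export base dir; latest timestamped subdir is picked
    project_id='my-project',
    model_uri_output_path='/tmp/kfp/output/ml_engine/model_uri.txt',
    model_name_output_path='/tmp/kfp/output/ml_engine/model_name.txt',
    version_name_output_path='/tmp/kfp/output/ml_engine/version_name.txt',
    model_id='my_model',
    runtime_version='1.15',
    set_default=True,
)
```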
@ -1,20 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._client import MLEngineClient
|
||||
|
||||
def set_default_version(version_name):
|
||||
"""Set specified version as default version.
|
||||
"""
|
||||
return MLEngineClient().set_default_version(version_name)
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from fire import decorators
|
||||
from ._create_job import create_job
|
||||
|
||||
|
||||
@decorators.SetParseFns(python_version=str, runtime_version=str)
|
||||
def train(project_id,
|
||||
job_id_output_path,
|
||||
job_dir_output_path,
|
||||
python_module=None,
|
||||
package_uris=None,
|
||||
region=None,
|
||||
args=None,
|
||||
job_dir=None,
|
||||
python_version=None,
|
||||
runtime_version=None,
|
||||
master_image_uri=None,
|
||||
worker_image_uri=None,
|
||||
training_input=None,
|
||||
job_id_prefix=None,
|
||||
job_id=None,
|
||||
wait_interval=30):
|
||||
"""Creates a MLEngine training job.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the parent project of the job.
|
||||
job_id_output_path (str): Required. Path for the ID of the created job.
|
||||
job_dir_output_path (str): Required. Path for the directory of the job.
|
||||
python_module (str): Required. The Python module name to run after
|
||||
installing the packages.
|
||||
package_uris (list): Required. The Google Cloud Storage location of
|
||||
the packages with the training program and any additional
|
||||
dependencies. The maximum number of package URIs is 100.
|
||||
region (str): Required. The Google Compute Engine region to run the
|
||||
training job in.
|
||||
args (list): Command line arguments to pass to the program.
|
||||
job_dir (str): A Google Cloud Storage path in which to store training
|
||||
outputs and other data needed for training. This path is passed to
|
||||
your TensorFlow program as the '--job-dir' command-line argument.
|
||||
The benefit of specifying this field is that Cloud ML validates the
|
||||
path for use in training.
|
||||
python_version (str): Optional. The version of Python used in
|
||||
training. If not set, the default version is '2.7'. Python '3.5' is
|
||||
available when runtimeVersion is set to '1.4' and above. Python
|
||||
'2.7' works with all supported runtime versions.
|
||||
runtime_version (str): Optional. The Cloud ML Engine runtime version
|
||||
to use for training. If not set, Cloud ML Engine uses the default
|
||||
stable version, 1.0.
|
||||
master_image_uri (str): The Docker image to run on the master replica.
|
||||
This image must be in Container Registry.
|
||||
worker_image_uri (str): The Docker image to run on the worker replica.
|
||||
This image must be in Container Registry.
|
||||
training_input (dict): Input parameters to create a training job.
|
||||
job_id_prefix (str): the prefix of the generated job id.
|
||||
job_id (str): the created job_id, takes precedence over generated job
|
||||
id if set.
|
||||
wait_interval (int): optional wait interval between calls to get job
|
||||
status. Defaults to 30.
|
||||
"""
|
||||
if not training_input:
|
||||
training_input = {}
|
||||
if python_module:
|
||||
training_input['pythonModule'] = python_module
|
||||
if package_uris:
|
||||
training_input['packageUris'] = package_uris
|
||||
if region:
|
||||
training_input['region'] = region
|
||||
if args:
|
||||
training_input['args'] = args
|
||||
if job_dir:
|
||||
training_input['jobDir'] = job_dir
|
||||
if python_version:
|
||||
training_input['pythonVersion'] = python_version
|
||||
if runtime_version:
|
||||
training_input['runtimeVersion'] = runtime_version
|
||||
if master_image_uri:
|
||||
if 'masterConfig' not in training_input:
|
||||
training_input['masterConfig'] = {}
|
||||
training_input['masterConfig']['imageUri'] = master_image_uri
|
||||
if worker_image_uri:
|
||||
if 'workerConfig' not in training_input:
|
||||
training_input['workerConfig'] = {}
|
||||
training_input['workerConfig']['imageUri'] = worker_image_uri
|
||||
job = {'trainingInput': training_input}
|
||||
return create_job(
|
||||
project_id=project_id,
|
||||
job=job,
|
||||
job_id_prefix=job_id_prefix,
|
||||
job_id=job_id,
|
||||
wait_interval=wait_interval,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_dir_output_path=job_dir_output_path)
|
||||
|
|
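A minimal sketch of the train helper above (a thin wrapper over create_job); the module path, project, bucket and job values are assumptions.

```python
# Hypothetical direct call to the deprecated train helper.
from kfp_component.google.ml_engine._train import train

job = train(
    project_id='my-project',
    job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
    job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
    python_module='trainer.task',
    package_uris=['gs://my-bucket/trainer-0.1.tar.gz'],
    region='us-central1',
    job_dir='gs://my-bucket/jobs/run-001/',
    runtime_version='1.15',
    job_id_prefix='train_',
)
```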
@ -1,53 +0,0 @@
|
|||
# Copyright 2019 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._common_ops import wait_for_job_done, cancel_job
|
||||
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
from ._client import MLEngineClient
|
||||
from .. import common as gcp_common
|
||||
|
||||
def wait_job(
|
||||
project_id,
|
||||
job_id,
|
||||
wait_interval=30,
|
||||
show_tensorboard=True,
|
||||
job_object_output_path='/tmp/kfp/output/ml_engine/job.json',
|
||||
job_id_output_path='/tmp/kfp/output/ml_engine/job_id.txt',
|
||||
job_dir_output_path='/tmp/kfp/output/ml_engine/job_dir.txt',
|
||||
):
|
||||
"""Waits a MLEngine job.
|
||||
|
||||
Args:
|
||||
project_id (str): Required. The ID of the parent project of the job.
|
||||
job_id (str): Required. The ID of the job to wait.
|
||||
wait_interval (int): optional wait interval between calls
|
||||
to get job status. Defaults to 30.
|
||||
show_tensorboard (bool): optional. True to dump Tensorboard metadata.
|
||||
job_object_output_path: Path for the json payload of the waiting job.
|
||||
job_id_output_path: Path for the ID of the waiting job.
|
||||
job_dir_output_path: Path for the `jobDir` of the waiting job.
|
||||
"""
|
||||
ml_client = MLEngineClient()
|
||||
with KfpExecutionContext(on_cancel=lambda: cancel_job(ml_client, project_id, job_id)):
|
||||
return wait_for_job_done(
|
||||
ml_client=ml_client,
|
||||
project_id=project_id,
|
||||
job_id=job_id,
|
||||
wait_interval=wait_interval,
|
||||
show_tensorboard=show_tensorboard,
|
||||
job_object_output_path=job_object_output_path,
|
||||
job_id_output_path=job_id_output_path,
|
||||
job_dir_output_path=job_dir_output_path,
|
||||
)
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ._download_blob import download_blob
|
||||
from ._common_ops import parse_blob_path, is_gcs_path
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import re
|
||||
|
||||
def is_gcs_path(path):
|
||||
"""Check if the path is a gcs path"""
|
||||
return path.startswith('gs://')
|
||||
|
||||
def parse_blob_path(path):
|
||||
"""Parse a gcs path into bucket name and blob name
|
||||
|
||||
Args:
|
||||
path (str): the path to parse.
|
||||
|
||||
Returns:
|
||||
(bucket name in the path, blob name in the path)
|
||||
|
||||
Raises:
|
||||
ValueError if the path is not a valid gcs blob path.
|
||||
|
||||
Example:
|
||||
|
||||
`bucket_name, blob_name = parse_blob_path('gs://foo/bar')`
|
||||
`bucket_name` is `foo` and `blob_name` is `bar`
|
||||
"""
|
||||
match = re.match('gs://([^/]+)/(.+)$', path)
|
||||
if match:
|
||||
return match.group(1), match.group(2)
|
||||
raise ValueError('Path {} is not a valid blob path.'.format(
|
||||
path))
|
||||
|
|
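The helpers above are pure string handling, so a quick self-contained check (module path assumed) captures their contract:

```python
# Sanity check for is_gcs_path / parse_blob_path.
from kfp_component.google.storage._common_ops import is_gcs_path, parse_blob_path

assert is_gcs_path('gs://foo/bar')

bucket_name, blob_name = parse_blob_path('gs://foo/bar')
assert (bucket_name, blob_name) == ('foo', 'bar')

# Everything after the bucket stays in the blob name.
assert parse_blob_path('gs://foo/dir/file.txt') == ('foo', 'dir/file.txt')
```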
@ -1,42 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from google.cloud import storage
|
||||
from ._common_ops import parse_blob_path
|
||||
|
||||
def download_blob(source_blob_path, destination_file_path):
|
||||
"""Downloads a blob from the bucket.
|
||||
|
||||
Args:
|
||||
source_blob_path (str): the source blob path to download from.
|
||||
destination_file_path (str): the local file path to download to.
|
||||
"""
|
||||
bucket_name, blob_name = parse_blob_path(source_blob_path)
|
||||
storage_client = storage.Client()
|
||||
bucket = storage_client.bucket(bucket_name)
|
||||
blob = bucket.blob(blob_name)
|
||||
|
||||
dirname = os.path.dirname(destination_file_path)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
|
||||
with open(destination_file_path, 'wb+') as f:
|
||||
blob.download_to_file(f)
|
||||
|
||||
logging.info('Blob {} downloaded to {}.'.format(
|
||||
source_blob_path,
|
||||
destination_file_path))
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Entrypoint module to launch python module or file dynamically.
|
||||
|
||||
This module makes it easier to build a kfp component with python code
|
||||
by defining a dynamic entrypoint and generating a command line arg parser
|
||||
via the python-fire module. It can be used as an entrypoint in the
|
||||
container spec to run an arbitrary python module or file in the local
|
||||
image.
|
||||
"""
|
||||
|
||||
from .launcher import launch
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import fire
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from .launcher import launch
|
||||
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='launcher',
|
||||
description='Launch a python module or file.')
|
||||
parser.add_argument('file_or_module', type=str,
|
||||
help='Either a python file path or a module name.')
|
||||
parser.add_argument(
|
||||
'--ui_metadata_path',
|
||||
type=str,
|
||||
default='/mlpipeline-ui-metadata.json',
|
||||
help='Path for the file where the mlpipeline-ui-metadata.json data '
|
||||
'should be written.')
|
||||
parser.add_argument('args', nargs=argparse.REMAINDER)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.ui_metadata_path:
|
||||
os.environ['KFP_UI_METADATA_PATH'] = args.ui_metadata_path
|
||||
|
||||
launch(args.file_or_module, args.args)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import fire
|
||||
import importlib
|
||||
import sys
|
||||
import logging
|
||||
|
||||
def launch(file_or_module, args):
|
||||
"""Launches a python file or module as a command entrypoint.
|
||||
|
||||
Args:
|
||||
file_or_module: either a file path to a python file
|
||||
or a module path.
|
||||
args: the args passed to the entrypoint function.
|
||||
|
||||
Returns:
|
||||
The return value from the launched function.
|
||||
"""
|
||||
try:
|
||||
module = importlib.import_module(file_or_module)
|
||||
except Exception:
|
||||
try:
|
||||
if sys.version_info.major > 2:
|
||||
spec = importlib.util.spec_from_file_location('module', file_or_module)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
else:
|
||||
import imp
|
||||
module = imp.load_source('module', file_or_module)
|
||||
except Exception:
|
||||
logging.error('Failed to find the module or file: {}'.format(file_or_module))
|
||||
sys.exit(1)
|
||||
return fire.Fire(module, command=args, name=module.__name__)
|
||||
|
|
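A hedged sketch of how the launcher above was used: it turns a python module or file into a fire-generated CLI. The package path and the target module/function names are assumptions.

```python
# Hypothetical usage of the deprecated launcher.
from kfp_component.launcher import launch

# Run function `greet` from an importable module, passing fire-style args.
launch('my_package.my_module', ['greet', '--name', 'kfp'])

# Or point at a file path instead of a module name.
launch('/workspace/tools/cleanup.py', ['run', '--dry_run'])
```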
@ -1,8 +0,0 @@
|
|||
--- http.py 2019-05-03 15:07:52.591411824 -0700
|
||||
+++ http_new.py 2019-05-03 15:09:23.470304022 -0700
|
||||
@@ -1784,4 +1784,4 @@
|
||||
http_timeout = socket.getdefaulttimeout()
|
||||
else:
|
||||
http_timeout = DEFAULT_HTTP_TIMEOUT_SEC
|
||||
- return httplib2.Http(timeout=http_timeout)
|
||||
+ return set_user_agent(httplib2.Http(timeout=http_timeout), '-kfpipeline-')
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
pip install -U tox virtualenv
|
||||
tox "$@"
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
PACKAGE_NAME = 'kfp-component'
|
||||
VERSION = '1.8.0-alpha.0'
|
||||
|
||||
setup(
|
||||
name=PACKAGE_NAME,
|
||||
version=VERSION,
|
||||
description='KubeFlow Pipelines Component SDK',
|
||||
author='google',
|
||||
install_requires=[
|
||||
'kubernetes >= 8.0.1', 'urllib3>=1.15,<1.25', 'fire == 0.1.3',
|
||||
'google-api-python-client == 1.7.8', 'google-cloud-storage == 1.14.0',
|
||||
'google-cloud-bigquery == 1.9.0'
|
||||
],
|
||||
packages=[
|
||||
'kfp_component',
|
||||
],
|
||||
classifiers=[
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: Education',
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Programming Language :: Python',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Topic :: Scientific/Engineering',
|
||||
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
||||
'Topic :: Software Development',
|
||||
'Topic :: Software Development :: Libraries',
|
||||
'Topic :: Software Development :: Libraries :: Python Modules',
|
||||
],
|
||||
include_package_data=True,
|
||||
)
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
.
|
||||
flake8
|
||||
pytest
|
||||
mock
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
|
@ -1,92 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from kfp_component.core import display
|
||||
|
||||
import mock
|
||||
import unittest
|
||||
|
||||
@mock.patch('kfp_component.core._display.json')
|
||||
@mock.patch('kfp_component.core._display.os')
|
||||
@mock.patch('kfp_component.core._display.open')
|
||||
class DisplayTest(unittest.TestCase):
|
||||
|
||||
def test_display_markdown(self, mock_open, mock_os, mock_json):
|
||||
mock_os.path.isfile.return_value = False
|
||||
|
||||
display.display(display.Markdown('# test'))
|
||||
|
||||
mock_json.dump.assert_called_with({
|
||||
'outputs': [{
|
||||
'type': 'markdown',
|
||||
'source': '# test',
|
||||
'storage': 'inline'
|
||||
}]
|
||||
}, mock.ANY)
|
||||
|
||||
def test_display_markdown_append(self, mock_open, mock_os, mock_json):
|
||||
mock_os.path.isfile.return_value = True
|
||||
mock_json.load.return_value = {
|
||||
'outputs': [{
|
||||
'type': 'markdown',
|
||||
'source': '# test 1',
|
||||
'storage': 'inline'
|
||||
}]
|
||||
}
|
||||
|
||||
display.display(display.Markdown('# test 2'))
|
||||
|
||||
mock_json.dump.assert_called_with({
|
||||
'outputs': [{
|
||||
'type': 'markdown',
|
||||
'source': '# test 1',
|
||||
'storage': 'inline'
|
||||
},{
|
||||
'type': 'markdown',
|
||||
'source': '# test 2',
|
||||
'storage': 'inline'
|
||||
}]
|
||||
}, mock.ANY)
|
||||
|
||||
def test_display_tensorboard(self, mock_open, mock_os, mock_json):
|
||||
mock_os.path.isfile.return_value = False
|
||||
|
||||
display.display(display.Tensorboard('gs://job/dir'))
|
||||
|
||||
mock_json.dump.assert_called_with({
|
||||
'outputs': [{
|
||||
'type': 'tensorboard',
|
||||
'source': 'gs://job/dir'
|
||||
}]
|
||||
}, mock.ANY)
|
||||
|
||||
def test_display_link(self, mock_open, mock_os, mock_json):
|
||||
mock_os.path.isfile.return_value = False
|
||||
|
||||
display.display(display.Link('https://test/link', 'Test Link'))
|
||||
|
||||
mock_json.dump.assert_called_with({
|
||||
'outputs': [{
|
||||
'type': 'markdown',
|
||||
'source': '## [Test Link](https://test/link)',
|
||||
'storage': 'inline'
|
||||
}]
|
||||
}, mock.ANY)
|
||||
|
||||
def test___repr__(self, mock_open, mock_os, mock_json):
|
||||
self.assertEqual('# Title', str(display.Markdown('# Title')))
|
||||
self.assertEqual('Open Tensorboard at: gs://trained/model/',
|
||||
str(display.Tensorboard('gs://trained/model/')))
|
||||
self.assertEqual('title: https://test/uri',
|
||||
str(display.Link('https://test/uri', 'title')))
|
||||
|
|
@ -1,139 +0,0 @@
|
|||
# Copyright 2018 The Kubeflow Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from kfp_component.core import KfpExecutionContext
|
||||
|
||||
from kubernetes import client, config
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
import mock
|
||||
import unittest
|
||||
|
||||
@mock.patch('kubernetes.config.load_incluster_config')
|
||||
@mock.patch('kubernetes.client.CoreV1Api')
|
||||
class KfpExecutionContextTest(unittest.TestCase):
|
||||
|
||||
def test_init_succeed_without_pod_name(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
with KfpExecutionContext() as ctx:
|
||||
self.assertFalse(ctx.under_kfp_environment())
|
||||
pass
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test_init_succeed_when_load_k8s_config_fail(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_load_config.side_effect = Exception()
|
||||
|
||||
with KfpExecutionContext() as ctx:
|
||||
self.assertFalse(ctx.under_kfp_environment())
|
||||
pass
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test_init_succeed_when_load_k8s_client_fail(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_k8s_client.side_effect = Exception()
|
||||
|
||||
with KfpExecutionContext() as ctx:
|
||||
self.assertFalse(ctx.under_kfp_environment())
|
||||
pass
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test_init_succeed_when_load_pod_fail(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_k8s_client().read_namespaced_pod.side_effect = Exception()
|
||||
|
||||
with KfpExecutionContext() as ctx:
|
||||
self.assertFalse(ctx.under_kfp_environment())
|
||||
pass
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test_init_succeed_no_argo_node_name(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
|
||||
mock_pod.metadata.annotations = {}
|
||||
with KfpExecutionContext() as ctx:
|
||||
self.assertFalse(ctx.under_kfp_environment())
|
||||
pass
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id',
|
||||
'KFP_NAMESPACE': 'mock-namespace'
|
||||
})
|
||||
def test_init_succeed(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
|
||||
mock_pod.metadata.annotations = {
|
||||
'workflows.argoproj.io/node-name': 'node-1'
|
||||
}
|
||||
with KfpExecutionContext() as ctx:
|
||||
self.assertTrue(ctx.under_kfp_environment())
|
||||
pass
|
||||
mock_k8s_client().read_namespaced_pod.assert_called_with('mock-pod-id', 'mock-namespace')
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test__exit_gracefully_cancel(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
|
||||
mock_pod.metadata.annotations = {
|
||||
'workflows.argoproj.io/node-name': 'node-1',
|
||||
'workflows.argoproj.io/execution': '{"deadline": "1970-01-01T00:00:00Z"}'
|
||||
}
|
||||
cancel_handler = mock.Mock()
|
||||
context = KfpExecutionContext(on_cancel=cancel_handler)
|
||||
|
||||
context._exit_gracefully(0, 0)
|
||||
|
||||
cancel_handler.assert_called_once()
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test__exit_gracefully_no_cancel(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
|
||||
mock_pod.metadata.annotations = {
|
||||
'workflows.argoproj.io/node-name': 'node-1'
|
||||
}
|
||||
cancel_handler = mock.Mock()
|
||||
context = KfpExecutionContext(on_cancel=cancel_handler)
|
||||
|
||||
context._exit_gracefully(0, 0)
|
||||
|
||||
cancel_handler.assert_not_called()
|
||||
|
||||
@mock.patch.dict('os.environ', {
|
||||
'KFP_POD_NAME': 'mock-pod-id'
|
||||
})
|
||||
def test_context_id_stable_across_retries(self,
|
||||
mock_k8s_client, mock_load_config):
|
||||
mock_pod = mock_k8s_client().read_namespaced_pod.return_value
|
||||
mock_pod.metadata.annotations = {
|
||||
'workflows.argoproj.io/node-name': 'node-1'
|
||||
}
|
||||
ctx1 = KfpExecutionContext()
|
||||
ctx2 = KfpExecutionContext()
|
||||
|
||||
self.assertEqual(ctx1.context_id(), ctx2.context_id())
|
||||