# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging

from google.cloud import bigquery
from google.api_core import exceptions

from kfp_component.core import KfpExecutionContext, display
from .. import common as gcp_common

# TODO(hongyes): make this path configurable as an environment variable
KFP_OUTPUT_PATH = '/tmp/kfp/output/'
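
# A minimal sketch of the TODO above (not wired in): read the output root from
# an environment variable, falling back to the current default. This would
# also require `import os` alongside the imports above.
#
#   KFP_OUTPUT_PATH = os.environ.get('KFP_OUTPUT_PATH', '/tmp/kfp/output/')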

def query(query, project_id, dataset_id=None, table_id=None,
          output_gcs_path=None, dataset_location='US', job_config=None):
    """Submits a query to the BigQuery service and dumps the results to a
    BigQuery table and/or a GCS blob.

    Args:
        query (str): The query used by the BigQuery service to fetch the
            results.
        project_id (str): The project to execute the query job in.
        dataset_id (str): The ID of the persistent dataset to keep the results
            of the query. If the dataset does not exist, the operation will
            create a new one.
        table_id (str): The ID of the table to keep the results of the query.
            If absent, the operation will use the query job ID as the table
            ID.
        output_gcs_path (str): The GCS blob path to dump the query results to.
            The extract job writes CSV by default.
        dataset_location (str): The location to create the dataset in.
            Defaults to `US`.
        job_config (bigquery.QueryJobConfig): The full config spec for the
            query job.

    Returns:
        The API representation of the completed query job.
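
    Example:
        An illustrative call; the project, dataset, table, and bucket names
        below are placeholders, not values taken from this repo::

            query(
                'SELECT country, COUNT(*) AS n FROM `my_dataset.events` '
                'GROUP BY country',
                project_id='my-project',
                dataset_id='my_dataset',
                table_id='events_by_country',
                output_gcs_path='gs://my-bucket/bigquery/events.csv')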
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
    # The dispositions are set unconditionally (even on a caller-supplied
    # job_config): create the destination table if needed and overwrite any
    # previous contents, so re-runs produce a consistent result.
    job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    job_id = None
    def cancel():
        if job_id:
            client.cancel_job(job_id)
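    # The job ID is derived from the KFP execution context, so a retried
    # pipeline step reattaches to the job it already submitted instead of
    # running the query twice; `cancel` above uses the same ID to stop the
    # in-flight job if the step is cancelled.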
    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path,
                dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
            query_job = client.query(query, job_config=job_config, job_id=job_id)
        else:
            # The job already exists (e.g. the step was retried); recover its
            # destination table so the extract step below is not handed a
            # `None` table_ref.
            table_ref = query_job.destination
        _display_job_link(project_id, job_id)
        query_job.result()  # Wait for query to finish
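        # Optionally export the destination table to GCS as a second,
        # separately tracked job. `extract_table` writes CSV by default.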
        if output_gcs_path:
            job_id = 'extract_' + ctx.context_id()
            extract_job = _get_job(client, job_id)
            logging.info('Extracting data from table {} to {}.'.format(
                str(table_ref), output_gcs_path))
            if not extract_job:
                # Pass the deterministic job ID so that retries find this job
                # and `cancel` can stop it.
                extract_job = client.extract_table(table_ref, output_gcs_path,
                    job_id=job_id)
            extract_job.result()  # Wait for export to finish
        _dump_outputs(query_job, output_gcs_path, table_ref)
    return query_job.to_api_repr()

def _get_job(client, job_id):
    try:
        return client.get_job(job_id)
    except exceptions.NotFound:
        return None

def _prepare_dataset_ref(client, dataset_id, output_gcs_path, dataset_location):
    if not output_gcs_path and not dataset_id:
        return None

    if not dataset_id:
        dataset_id = 'kfp_tmp_dataset'
    dataset_ref = client.dataset(dataset_id)
    dataset = _get_dataset(client, dataset_ref)
    if not dataset:
        logging.info('Creating dataset {}'.format(dataset_id))
        dataset = _create_dataset(client, dataset_ref, dataset_location)
    return dataset_ref

def _get_dataset(client, dataset_ref):
    try:
        return client.get_dataset(dataset_ref)
    except exceptions.NotFound:
        return None

def _create_dataset(client, dataset_ref, location):
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    return client.create_dataset(dataset)

def _display_job_link(project_id, job_id):
    display.display(display.Link(
        href='https://console.cloud.google.com/bigquery?project={}'
             '&j={}&page=queryresults'.format(project_id, job_id),
        text='Query Details'
    ))
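
# `_dump_outputs` writes each value to a well-known path under KFP_OUTPUT_PATH;
# the assumption here is that the surrounding component definition declares
# these files as outputs so the pipeline system can pick them up.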
def _dump_outputs(job, output_path, table_ref):
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-job.json',
        json.dumps(job.to_api_repr()))
    if not output_path:
        output_path = ''
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-output-path.txt',
        output_path)
    (dataset_id, table_id) = (table_ref.dataset_id, table_ref.table_id) if table_ref else ('', '')
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-dataset-id.txt',
        dataset_id)
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-table-id.txt',
        table_id)