pipelines/component_sdk/python/kfp_component/google/bigquery/_query.py

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging

from google.cloud import bigquery
from google.api_core import exceptions

from kfp_component.core import KfpExecutionContext, display
from .. import common as gcp_common

# TODO(hongyes): make this path configurable as an environment variable
KFP_OUTPUT_PATH = '/tmp/kfp/output/'


def query(query, project_id, dataset_id=None, table_id=None,
          output_gcs_path=None, dataset_location='US', job_config=None):
    """Submits a query to the BigQuery service and dumps the results to a
    BigQuery table or a GCS blob.

    Args:
        query (str): The query used by the BigQuery service to fetch the results.
        project_id (str): The project in which to execute the query job.
        dataset_id (str): The ID of the persistent dataset that keeps the results
            of the query. If the dataset does not exist, the operation will
            create a new one.
        table_id (str): The ID of the table that keeps the results of the query.
            If absent, the operation will generate a random ID for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location in which to create the dataset.
            Defaults to `US`.
        job_config (dict): The full config spec for the query job.

    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    job_id = None
    def cancel():
        if job_id:
            client.cancel_job(job_id)
    with KfpExecutionContext(on_cancel=cancel) as ctx:
        # The job ID is derived from the KFP execution context, so a retried
        # step finds the job it already submitted instead of running it twice.
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path,
                dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)
        query_job.result()  # Wait for query to finish
        if output_gcs_path:
            job_id = 'extract_' + ctx.context_id()
            extract_job = _get_job(client, job_id)
            logging.info('Extracting data from table {} to {}.'.format(
                str(table_ref), output_gcs_path))
            if not extract_job:
                extract_job = client.extract_table(table_ref, output_gcs_path)
            extract_job.result()  # Wait for export to finish
        _dump_outputs(query_job, output_gcs_path, table_ref)
        return query_job.to_api_repr()
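
# A minimal usage sketch (the project, dataset, table, and bucket names below
# are hypothetical examples, not part of this module):
#
#   result = query(
#       'SELECT year, COUNT(*) AS n FROM `my_dataset.my_table` GROUP BY year',
#       project_id='my-project',
#       dataset_id='my_dataset',
#       table_id='query_results',
#       output_gcs_path='gs://my-bucket/bigquery/results-*.csv',
#       dataset_location='US')
#   # `result` is the API representation of the completed query job; the rows
#   # land in `my_dataset.query_results` and are extracted to the GCS path.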


def _get_job(client, job_id):
    try:
        return client.get_job(job_id)
    except exceptions.NotFound:
        return None


def _prepare_dataset_ref(client, dataset_id, output_gcs_path, dataset_location):
    if not output_gcs_path and not dataset_id:
        return None

    if not dataset_id:
        dataset_id = 'kfp_tmp_dataset'
    dataset_ref = client.dataset(dataset_id)
    dataset = _get_dataset(client, dataset_ref)
    if not dataset:
        logging.info('Creating dataset {}'.format(dataset_id))
        dataset = _create_dataset(client, dataset_ref, dataset_location)
    return dataset_ref


def _get_dataset(client, dataset_ref):
    try:
        return client.get_dataset(dataset_ref)
    except exceptions.NotFound:
        return None


def _create_dataset(client, dataset_ref, location):
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    return client.create_dataset(dataset)


def _display_job_link(project_id, job_id):
    display.display(display.Link(
        href='https://console.cloud.google.com/bigquery?project={}'
             '&j={}&page=queryresults'.format(project_id, job_id),
        text='Query Details'
    ))


def _dump_outputs(job, output_path, table_ref):
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-job.json',
        json.dumps(job.to_api_repr()))
    if not output_path:
        output_path = ''
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-output-path.txt',
        output_path)
    (dataset_id, table_id) = (table_ref.dataset_id, table_ref.table_id) if table_ref else ('', '')
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-dataset-id.txt',
        dataset_id)
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-table-id.txt',
        table_id)
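
# Note: a successful run writes four files under KFP_OUTPUT_PATH
# ('/tmp/kfp/output/bigquery/'): query-job.json (the job's API
# representation), query-output-path.txt, query-dataset-id.txt, and
# query-table-id.txt, so downstream pipeline steps can consume them as
# file outputs.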