pipelines/component_sdk/python/kfp_component/google/bigquery/_query.py

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging

from google.cloud import bigquery
from google.api_core import exceptions

from kfp_component.core import KfpExecutionContext, display
from .. import common as gcp_common

# TODO(hongyes): make this path configurable as an environment variable
KFP_OUTPUT_PATH = '/tmp/kfp/output/'


def query(query, project_id, dataset_id=None, table_id=None,
          output_gcs_path=None, dataset_location='US', job_config=None):
    """Submits a query to the BigQuery service and dumps the results to a
    BigQuery table or a GCS blob.

    Args:
        query (str): The query used by the BigQuery service to fetch the results.
        project_id (str): The project in which to execute the query job.
        dataset_id (str): The ID of the persistent dataset that keeps the results
            of the query. If the dataset does not exist, the operation will
            create a new one.
        table_id (str): The ID of the table that keeps the results of the query.
            If absent, the operation will generate a random ID for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location in which to create the dataset.
            Defaults to `US`.
        job_config (dict): The full config spec for the query job.

    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    job_id = None
    def cancel():
        if job_id:
            client.cancel_job(job_id)
    with KfpExecutionContext(on_cancel=cancel) as ctx:
        # The job ID is derived from the KFP execution context, so a retried
        # step finds the job it already submitted instead of running it twice.
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path,
                dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)
        query_job.result()  # Wait for query to finish
        if output_gcs_path:
            job_id = 'extract_' + ctx.context_id()
            extract_job = _get_job(client, job_id)
            logging.info('Extracting data from table {} to {}.'.format(
                str(table_ref), output_gcs_path))
            if not extract_job:
                extract_job = client.extract_table(table_ref, output_gcs_path)
            extract_job.result()  # Wait for export to finish
        _dump_outputs(query_job, output_gcs_path, table_ref)
        return query_job.to_api_repr()
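
# A minimal usage sketch (the project, dataset, table, and bucket names below
# are hypothetical examples, not part of this module):
#
#   result = query(
#       'SELECT year, COUNT(*) AS n FROM `my_dataset.my_table` GROUP BY year',
#       project_id='my-project',
#       dataset_id='my_dataset',
#       table_id='query_results',
#       output_gcs_path='gs://my-bucket/bigquery/results-*.csv',
#       dataset_location='US')
#   # `result` is the API representation of the completed query job; the rows
#   # land in `my_dataset.query_results` and are extracted to the GCS path.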


def _get_job(client, job_id):
    try:
        return client.get_job(job_id)
    except exceptions.NotFound:
        return None


def _prepare_dataset_ref(client, dataset_id, output_gcs_path, dataset_location):
    if not output_gcs_path and not dataset_id:
        return None

    if not dataset_id:
        dataset_id = 'kfp_tmp_dataset'
    dataset_ref = client.dataset(dataset_id)
    dataset = _get_dataset(client, dataset_ref)
    if not dataset:
        logging.info('Creating dataset {}'.format(dataset_id))
        dataset = _create_dataset(client, dataset_ref, dataset_location)
    return dataset_ref


def _get_dataset(client, dataset_ref):
    try:
        return client.get_dataset(dataset_ref)
    except exceptions.NotFound:
        return None


def _create_dataset(client, dataset_ref, location):
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    return client.create_dataset(dataset)


def _display_job_link(project_id, job_id):
    display.display(display.Link(
        href='https://console.cloud.google.com/bigquery?project={}'
             '&j={}&page=queryresults'.format(project_id, job_id),
        text='Query Details'
    ))


def _dump_outputs(job, output_path, table_ref):
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-job.json',
        json.dumps(job.to_api_repr()))
    if not output_path:
        output_path = ''
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-output-path.txt',
        output_path)
    (dataset_id, table_id) = (table_ref.dataset_id, table_ref.table_id) if table_ref else ('', '')
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-dataset-id.txt',
        dataset_id)
    gcp_common.dump_file(KFP_OUTPUT_PATH + 'bigquery/query-table-id.txt',
        table_id)
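
# Note: a successful run writes four files under KFP_OUTPUT_PATH
# ('/tmp/kfp/output/bigquery/'): query-job.json (the job's API
# representation), query-output-path.txt, query-dataset-id.txt, and
# query-table-id.txt, so downstream pipeline steps can consume them as
# file outputs.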