Refactor dataflow pipelines (#197)

* Update to a new dataflow package

* [WIP] updating docstrings, fixing redundancies

* Limit the scope of Github Transform pipeline, make everything unicode

* Add ability to start github pipelines from transformed bigquery dataset

* Upgrade batch prediction pipeline to be modular

* Fix lint errors

* Add write disposition to BigQuery transform

* Update documentation format

* Nicer names for modules

* Add unicode encoding to parsed function docstring tuples

* Use Apache Beam options parser to expose all CLI arguments
Sanyam Kapoor 2018-07-27 06:26:56 -07:00 committed by k8s-ci-robot
parent 1746820f8f
commit 767c90ff20
26 changed files with 848 additions and 635 deletions

View File

@ -1,2 +1 @@
include requirements.txt
include files/*

View File

@ -1,120 +0,0 @@
"""Entrypoint for Dataflow jobs"""
from __future__ import print_function
import argparse
import os
import apache_beam as beam
import apache_beam.options.pipeline_options as pipeline_options
import code_search.transforms.process_github_files as process_github_files
import code_search.transforms.code_embed as code_embed
def create_pipeline_opts(args):
"""Create standard Pipeline Options for Beam"""
options = pipeline_options.PipelineOptions()
options.view_as(pipeline_options.StandardOptions).runner = args.runner
google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
google_cloud_options.project = args.project
if args.runner == 'DataflowRunner':
google_cloud_options.job_name = args.job_name
google_cloud_options.temp_location = '{}/temp'.format(args.storage_bucket)
google_cloud_options.staging_location = '{}/staging'.format(args.storage_bucket)
worker_options = options.view_as(pipeline_options.WorkerOptions)
worker_options.num_workers = args.num_workers
worker_options.max_num_workers = args.max_num_workers
worker_options.machine_type = args.machine_type
setup_options = options.view_as(pipeline_options.SetupOptions)
setup_options.setup_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'setup.py')
return options
def parse_arguments(argv):
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-r', '--runner', metavar='', type=str, default='DirectRunner',
help='Type of runner - DirectRunner or DataflowRunner')
parser.add_argument('-i', '--input', metavar='', type=str, default='',
help='Path to input file')
parser.add_argument('-o', '--output', metavar='', type=str,
help='Output string of the format <dataset>:<table>')
predict_args_parser = parser.add_argument_group('Batch Prediction Arguments')
predict_args_parser.add_argument('--problem', metavar='', type=str,
help='Name of the T2T problem')
predict_args_parser.add_argument('--data-dir', metavar='', type=str,
help='Path to directory of the T2T problem data')
predict_args_parser.add_argument('--saved-model-dir', metavar='', type=str,
help='Path to directory containing Tensorflow SavedModel')
# Dataflow related arguments
dataflow_args_parser = parser.add_argument_group('Dataflow Runner Arguments')
dataflow_args_parser.add_argument('-p', '--project', metavar='', type=str, default='Project',
help='Project ID')
dataflow_args_parser.add_argument('-j', '--job-name', metavar='', type=str, default='Beam Job',
help='Job name')
dataflow_args_parser.add_argument('--storage-bucket', metavar='', type=str, default='gs://bucket',
help='Path to Google Storage Bucket')
dataflow_args_parser.add_argument('--num-workers', metavar='', type=int, default=1,
help='Number of workers')
dataflow_args_parser.add_argument('--max-num-workers', metavar='', type=int, default=1,
help='Maximum number of workers')
dataflow_args_parser.add_argument('--machine-type', metavar='', type=str, default='n1-standard-1',
help='Google Cloud Machine Type to use')
parsed_args = parser.parse_args(argv)
return parsed_args
def create_github_pipeline(argv=None):
"""Creates the Github source code pre-processing pipeline.
This pipeline takes an SQL file for BigQuery as an input
and puts the results in a file and a new BigQuery table.
An SQL file is included with the module.
"""
args = parse_arguments(argv)
default_sql_file = os.path.abspath('{}/../../files/select_github_archive.sql'.format(__file__))
args.input = args.input or default_sql_file
pipeline_opts = create_pipeline_opts(args)
with open(args.input, 'r') as f:
query_string = f.read()
pipeline = beam.Pipeline(options=pipeline_opts)
(pipeline #pylint: disable=expression-not-assigned
| process_github_files.ProcessGithubFiles(args.project, query_string,
args.output, args.storage_bucket)
)
result = pipeline.run()
if args.runner == 'DirectRunner':
result.wait_until_finish()
def create_batch_predict_pipeline(argv=None):
"""Creates Batch Prediction Pipeline using trained model.
This pipeline takes in a collection of CSV files returned
by the Github Pipeline, embeds the code text using the
trained model in a given model directory.
"""
args = parse_arguments(argv)
pipeline_opts = create_pipeline_opts(args)
pipeline = beam.Pipeline(options=pipeline_opts)
(pipeline #pylint: disable=expression-not-assigned
| code_embed.GithubBatchPredict(args.project, args.problem,
args.data_dir, args.saved_model_dir)
)
result = pipeline.run()
if args.runner == 'DirectRunner':
result.wait_until_finish()
if __name__ == '__main__':
create_batch_predict_pipeline()

View File

@ -0,0 +1,68 @@
import os
import sys
import apache_beam.options.pipeline_options as pipeline_options
class PipelineCLIOptions(pipeline_options.StandardOptions,
pipeline_options.WorkerOptions,
pipeline_options.SetupOptions,
pipeline_options.GoogleCloudOptions):
"""A unified arguments parser.
This parser directly exposes all the underlying Beam
options available to the user (along with some custom
arguments). To use, simply pass the arguments list as
`PipelineCLIOptions(argv)`.
Args:
argv: A list of strings representing CLI options.
"""
@classmethod
def _add_argparse_args(cls, parser):
add_parser_arguments(parser)
def add_parser_arguments(parser):
additional_args_parser = parser.add_argument_group('Custom Arguments')
additional_args_parser.add_argument('--target_dataset', metavar='', type=str,
help='BigQuery dataset for output results')
additional_args_parser.add_argument('--pre_transformed', action='store_true',
help='Use a pre-transformed BigQuery dataset')
predict_args_parser = parser.add_argument_group('Batch Prediction Arguments')
predict_args_parser.add_argument('--problem', metavar='', type=str,
help='Name of the T2T problem')
predict_args_parser.add_argument('--data_dir', metavar='', type=str,
help='Path to directory of the T2T problem data')
predict_args_parser.add_argument('--saved_model_dir', metavar='', type=str,
help='Path to directory containing Tensorflow SavedModel')
def prepare_pipeline_opts(argv=None):
"""Prepare pipeline options from CLI arguments.
This builds on the unified PipelineCLIOptions parser
and implicitly appends a `setup_file` argument so that
dependencies can be installed on Dataflow workers.
Use the `-h` CLI argument to see the list of all
available arguments.
Args:
argv: A list of strings representing the CLI arguments.
Returns:
A PipelineCLIOptions object whose `_visible_options`
contains the parsed Namespace object.
"""
argv = argv or sys.argv[1:]
argv.extend([
'--setup_file',
os.path.abspath(os.path.join(__file__, '../../../../setup.py')),
])
pipeline_opts = PipelineCLIOptions(flags=argv)
return pipeline_opts
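A minimal usage sketch of the unified parser, assuming the module path `code_search.dataflow.cli.arguments` used by the pipelines in this commit; the flag values below are placeholders, not a tested configuration:
# Illustrative only: drive the unified parser directly from Python.
import code_search.dataflow.cli.arguments as arguments

pipeline_opts = arguments.prepare_pipeline_opts([
    '--runner', 'DirectRunner',
    '--project', 'my-gcp-project',       # hypothetical GCP project id
    '--target_dataset', 'code_search',   # custom argument defined above
    '--data_dir', '/tmp/t2t_data',       # consumed by the prediction stages
])
args = pipeline_opts._visible_options   # parsed Namespace, as used by the pipelines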

View File

@ -0,0 +1,49 @@
import apache_beam as beam
import code_search.dataflow.cli.arguments as arguments
import code_search.dataflow.transforms.github_bigquery as gh_bq
import code_search.dataflow.transforms.function_embeddings as func_embed
import code_search.dataflow.do_fns.dict_to_csv as dict_to_csv
def create_function_embeddings(argv=None):
"""Creates Batch Prediction Pipeline using trained model.
At a high level, this pipeline does the following things:
- Read the Processed Github Dataset from BigQuery
- Encode the functions using T2T problem
- Get function embeddings using `kubeflow_batch_predict.dataflow.batch_prediction`
- All results are stored in a BigQuery dataset (`args.target_dataset`)
- See `transforms.github_dataset.GithubBatchPredict` for details of tables created
- Additionally, store CSV of docstring, original functions and other metadata for
reverse index lookup during search engine queries.
"""
pipeline_opts = arguments.prepare_pipeline_opts(argv)
args = pipeline_opts._visible_options # pylint: disable=protected-access
pipeline = beam.Pipeline(options=pipeline_opts)
token_pairs = (pipeline
| "Read Transformed Github Dataset" >> gh_bq.ReadTransformedGithubDataset(
args.project, dataset=args.target_dataset)
| "Compute Function Embeddings" >> func_embed.FunctionEmbeddings(args.project,
args.target_dataset,
args.problem,
args.data_dir,
args.saved_model_dir)
)
(token_pairs # pylint: disable=expression-not-assigned
| "Format for CSV Write" >> beam.ParDo(dict_to_csv.DictToCSVString(
['nwo', 'path', 'function_name', 'lineno', 'original_function', 'function_embedding']))
| "Write Embeddings to CSV" >> beam.io.WriteToText('{}/func-index'.format(args.data_dir),
file_name_suffix='.csv')
)
result = pipeline.run()
if args.runner == 'DirectRunner':
result.wait_until_finish()
if __name__ == '__main__':
create_function_embeddings()

View File

@ -0,0 +1,52 @@
import apache_beam as beam
import code_search.dataflow.cli.arguments as arguments
import code_search.dataflow.transforms.github_bigquery as gh_bq
import code_search.dataflow.transforms.github_dataset as github_dataset
import code_search.dataflow.do_fns.dict_to_csv as dict_to_csv
def preprocess_github_dataset(argv=None):
"""Apache Beam pipeline for pre-processing Github dataset.
At a high level, this pipeline does the following things:
- Read Github Python files from BigQuery
- If the Github Python files have already been processed, use the
pre-processed table instead (using the flag `--pre_transformed`)
- Tokenize files into pairs of function definitions and docstrings
- All results are stored in a BigQuery dataset (`args.target_dataset`)
- See `transforms.github_dataset.TransformGithubDataset` for details of tables created
- Additionally, store pairs of docstring and function tokens in a CSV file
for training
"""
pipeline_opts = arguments.prepare_pipeline_opts(argv)
args = pipeline_opts._visible_options # pylint: disable=protected-access
pipeline = beam.Pipeline(options=pipeline_opts)
if args.pre_transformed:
token_pairs = (pipeline
| "Read Transformed Github Dataset" >> gh_bq.ReadTransformedGithubDataset(
args.project, dataset=args.target_dataset)
)
else:
token_pairs = (pipeline
| "Read Github Dataset" >> gh_bq.ReadGithubDataset(args.project)
| "Transform Github Dataset" >> github_dataset.TransformGithubDataset(args.project,
args.target_dataset)
)
(token_pairs # pylint: disable=expression-not-assigned
| "Format for CSV Write" >> beam.ParDo(dict_to_csv.DictToCSVString(
['docstring_tokens', 'function_tokens']))
| "Write CSV" >> beam.io.WriteToText('{}/func-doc-pairs'.format(args.data_dir),
file_name_suffix='.csv')
)
result = pipeline.run()
if args.runner == 'DirectRunner':
result.wait_until_finish()
if __name__ == '__main__':
preprocess_github_dataset()

View File

@ -0,0 +1,50 @@
import csv
import io
import apache_beam as beam
class DictToCSVString(beam.DoFn):
"""Convert incoming dict to a CSV string.
This DoFn converts a Python dict into
a CSV string.
Args:
fieldnames: A list of strings representing keys of a dict.
"""
def __init__(self, fieldnames):
super(DictToCSVString, self).__init__()
self.fieldnames = fieldnames
def process(self, element, *_args, **_kwargs):
"""Convert a Python dict instance into CSV string.
This routine uses the Python CSV DictWriter to
robustly convert an input dict to a comma-separated
CSV string. This also handles appropriate escaping of
characters like the delimiter ",". The dict values
must be serializable into a string.
Args:
element: A dict mapping string keys to string values.
{
"key1": "STRING",
"key2": "STRING"
}
Yields:
A string representing the row in CSV format.
"""
fieldnames = self.fieldnames
filtered_element = {
key: value.encode('utf-8')
for (key, value) in element.iteritems()
if key in fieldnames
}
with io.BytesIO() as stream:
writer = csv.DictWriter(stream, fieldnames)
writer.writerow(filtered_element)
csv_string = stream.getvalue().strip('\r\n')
yield csv_string
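A minimal sketch of this DoFn inside a tiny in-memory pipeline, assuming Python 2 (matching the `iteritems`/`BytesIO` usage above); the sample element and field names are placeholders:
# Illustrative only: keys not listed in fieldnames (here 'extra') are dropped.
from __future__ import print_function
import apache_beam as beam

sample = {u'nwo': u'owner/repo', u'path': u'pkg/module.py', u'extra': u'dropped'}

with beam.Pipeline() as p:  # DirectRunner by default
    (p  # pylint: disable=expression-not-assigned
     | 'Create' >> beam.Create([sample])
     | 'To CSV' >> beam.ParDo(DictToCSVString([u'nwo', u'path']))
     | 'Print' >> beam.Map(print))
# prints: owner/repo,pkg/module.py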

View File

@ -0,0 +1,164 @@
"""Beam DoFns specific to `code_search.dataflow.transforms.function_embeddings`."""
import apache_beam as beam
from code_search.t2t.query import get_encoder, encode_query
class EncodeFunctionTokens(beam.DoFn):
"""Encode function tokens.
This DoFn prepares the function tokens for
inference by a SavedModel estimator downstream.
Args:
problem: A string representing the registered Tensor2Tensor Problem.
data_dir: A string representing the path to data directory.
"""
def __init__(self, problem, data_dir):
super(EncodeFunctionTokens, self).__init__()
self.problem = problem
self.data_dir = data_dir
@property
def function_tokens_key(self):
return u'function_tokens'
@property
def instances_key(self):
return u'instances'
def process(self, element, *_args, **_kwargs):
"""Encode the function instance.
This DoFn takes a tokenized function string and
encodes it into a base64 string of TFExample
binary format. The "function_tokens" value is encoded
and stored under the "instances" key in a format
ready for consumption by TensorFlow SavedModel
estimators. The encoder comes from the Tensor2Tensor
problem given in the constructor.
Args:
element: A Python dict of the form,
{
"nwo": "STRING",
"path": "STRING",
"function_name": "STRING",
"lineno": "STRING",
"original_function": "STRING",
"function_tokens": "STRING",
"docstring_tokens": "STRING",
}
Yields:
An updated Python dict of the form
{
"nwo": "STRING",
"path": "STRING",
"function_name": "STRING",
"lineno": "STRING",
"original_function": "STRING",
"function_tokens": "STRING",
"docstring_tokens": "STRING",
"instances": [
{
"input": {
"b64": "STRING",
}
}
]
}
"""
encoder = get_encoder(self.problem, self.data_dir)
encoded_function = encode_query(encoder, element.get(self.function_tokens_key))
element[self.instances_key] = [{'input': {'b64': encoded_function}}]
yield element
class ProcessFunctionEmbedding(beam.DoFn):
"""Process results from PredictionDoFn.
This is a DoFn for post-processing on inference
results from a SavedModel estimator which are
returned by the PredictionDoFn.
"""
@property
def function_embedding_key(self):
return 'function_embedding'
@property
def predictions_key(self):
return 'predictions'
@property
def pop_keys(self):
return [
'predictions',
'docstring_tokens',
'function_tokens',
'instances',
]
def process(self, element, *_args, **_kwargs):
"""Post-Process Function embedding.
This converts the incoming function instance
embedding into a serializable string for downstream
tasks. It also pops any extraneous keys which are
no longer required. The "lineno" key is also converted
to a string for serializability downstream.
Args:
element: A Python dict of the form,
{
"nwo": "STRING",
"path": "STRING",
"function_name": "STRING",
"lineno": "STRING",
"original_function": "STRING",
"function_tokens": "STRING",
"docstring_tokens": "STRING",
"instances": [
{
"input": {
"b64": "STRING",
}
}
],
"predictions": [
{
"outputs": [
FLOAT,
FLOAT,
...
]
}
],
}
Yields:
An updated Python dict of the form,
{
"nwo": "STRING",
"path": "STRING",
"function_name": "STRING",
"lineno": "STRING",
"original_function": "STRING",
"function_embedding": "STRING",
}
"""
prediction = element.get(self.predictions_key)[0]['outputs']
element[self.function_embedding_key] = ','.join([
str(val).decode('utf-8') for val in prediction
])
element['lineno'] = str(element['lineno']).decode('utf-8')
for key in self.pop_keys:
element.pop(key)
yield element
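A minimal sketch of the post-processing step in isolation; the element below is hand-built to mimic the PredictionDoFn output shape documented above, and all values are placeholders:
# Illustrative only: the prediction vector is flattened to a comma-separated
# string and the intermediate keys are popped before the BigQuery write.
element = {
    'nwo': u'owner/repo', 'path': u'pkg/module.py',
    'function_name': u'foo', 'lineno': 42,
    'original_function': u'def foo(): pass',
    'function_tokens': u'def foo pass', 'docstring_tokens': u'',
    'instances': [{'input': {'b64': u'...'}}],
    'predictions': [{'outputs': [0.12, -0.5, 3.0]}],
}
processed = next(ProcessFunctionEmbedding().process(element))
# processed['function_embedding'] == u'0.12,-0.5,3.0', processed['lineno'] == u'42';
# 'predictions', 'docstring_tokens', 'function_tokens' and 'instances' are gone.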

View File

@ -0,0 +1,126 @@
"""Beam DoFns specific to `code_search.dataflow.transforms.github_dataset`."""
import logging
import apache_beam as beam
from apache_beam import pvalue
class SplitRepoPath(beam.DoFn):
"""Update element keys to separate repo path and file path.
This DoFn's only purpose is to be used after
`code_search.dataflow.transforms.github_bigquery.ReadGithubDataset`
to split the source dictionary key into two target keys.
"""
@property
def source_key(self):
return u'repo_path'
@property
def target_keys(self):
return [u'nwo', u'path']
def process(self, element, *_args, **_kwargs):
"""Process Python file attributes.
This simple DoFn splits the `repo_path` into
independent properties of owner (`nwo`) and
relative file path (`path`). The source value is
space-delimited, so splitting on the first space
is enough.
Args:
element: A Python dict of the form,
{
"repo_path": "STRING",
"content": "STRING",
}
Yields:
An updated Python dict of the form,
{
"nwo": "STRING",
"path": "STRING",
"content": "STRING",
}
"""
values = element.pop(self.source_key).split(' ', 1)
for key, value in zip(self.target_keys, values):
element[key] = value
yield element
class TokenizeFunctionDocstrings(beam.DoFn):
"""Tokenize function and docstrings.
This DoFn takes in rows from BigQuery and tokenizes
the file content present in the `content` key. It
yields a list of dicts, one per extracted
function/docstring pair.
"""
@property
def content_key(self):
return 'content'
@property
def info_keys(self):
return [
u'function_name',
u'lineno',
u'original_function',
u'function_tokens',
u'docstring_tokens',
]
def process(self, element, *_args, **_kwargs):
"""Get list of Function-Docstring tokens
This processes each Python file's content
and returns a list of metadata for each extracted
pair. These contain the tokenized functions and
docstrings. In cases where the tokenization fails,
a side output is returned. All values are unicode
for serialization.
Args:
element: A Python dict of the form,
{
"nwo": "STRING",
"path": "STRING",
"content": "STRING",
}
Yields:
A Python list of the form,
[
{
"nwo": "STRING",
"path": "STRING",
"function_name": "STRING",
"lineno": "STRING",
"original_function": "STRING",
"function_tokens": "STRING",
"docstring_tokens": "STRING",
},
...
]
"""
try:
import code_search.dataflow.utils as utils
content_blob = element.pop(self.content_key)
pairs = utils.get_function_docstring_pairs(content_blob)
result = [
dict(zip(self.info_keys, pair_tuple), **element)
for pair_tuple in pairs
]
yield result
except Exception as e: # pylint: disable=broad-except
logging.warning('Tokenization failed, %s', e.message)
yield pvalue.TaggedOutput('err', element)
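A minimal sketch of the key split in isolation; the row values are placeholders:
# Illustrative only: 'repo_path' is split on the first space into 'nwo' and 'path'.
row = {u'repo_path': u'owner/repo dir/module.py', u'content': u'def foo(): pass'}
split_row = next(SplitRepoPath().process(row))
# split_row == {u'nwo': u'owner/repo', u'path': u'dir/module.py',
#               u'content': u'def foo(): pass'}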

View File

@ -1,4 +1,5 @@
import apache_beam as beam
import apache_beam.io.gcp.bigquery as bigquery
import apache_beam.io.gcp.internal.clients as clients
@ -10,10 +11,12 @@ class BigQueryRead(beam.PTransform):
string.
"""
def __init__(self, project):
def __init__(self, project, dataset=None, table=None):
super(BigQueryRead, self).__init__()
self.project = project
self.dataset = dataset
self.table = table
@property
def limit(self):
@ -47,12 +50,14 @@ class BigQueryWrite(beam.PTransform):
]
"""
def __init__(self, project, dataset, table, batch_size=500):
def __init__(self, project, dataset, table, batch_size=500,
write_disposition=bigquery.BigQueryDisposition.WRITE_TRUNCATE):
super(BigQueryWrite, self).__init__()
self.project = project
self.dataset = dataset
self.table = table
self.write_disposition = write_disposition
self.batch_size = batch_size
@property
@ -69,7 +74,8 @@ class BigQueryWrite(beam.PTransform):
dataset=self.dataset,
table=self.table,
schema=self.output_schema,
batch_size=self.batch_size)
batch_size=self.batch_size,
write_disposition=self.write_disposition)
)
@staticmethod

View File

@ -0,0 +1,46 @@
import apache_beam as beam
import kubeflow_batch_predict.dataflow.batch_prediction as batch_prediction
import code_search.dataflow.do_fns.function_embeddings as func_embeddings
import code_search.dataflow.transforms.github_bigquery as github_bigquery
class FunctionEmbeddings(beam.PTransform):
"""Batch Prediction for Github dataset.
This PTransform takes in the transformed dataset,
prepares each element's function tokens for prediction
by encoding them into base64 format, and returns an updated
dictionary element with the embedding for further processing.
The resulting embeddings are also written to BigQuery.
"""
def __init__(self, project, target_dataset, problem, data_dir, saved_model_dir):
super(FunctionEmbeddings, self).__init__()
self.project = project
self.target_dataset = target_dataset
self.problem = problem
self.data_dir = data_dir
self.saved_model_dir = saved_model_dir
def expand(self, input_or_inputs):
batch_predict = (input_or_inputs
| "Encoded Function Tokens" >> beam.ParDo(func_embeddings.EncodeFunctionTokens(
self.problem, self.data_dir))
| "Compute Function Embeddings" >> beam.ParDo(batch_prediction.PredictionDoFn(),
self.saved_model_dir).with_outputs('err',
main='main')
)
predictions = batch_predict.main
formatted_predictions = (predictions
| "Process Function Embeddings" >> beam.ParDo(func_embeddings.ProcessFunctionEmbedding())
)
(formatted_predictions # pylint: disable=expression-not-assigned
| "Save Function Embeddings" >> github_bigquery.WriteGithubFunctionEmbeddings(
self.project, self.target_dataset)
)
return formatted_predictions

View File

@ -0,0 +1,143 @@
import apache_beam.io.gcp.bigquery as bigquery
import code_search.dataflow.transforms.bigquery as bq_transform
# Default internal table names
PAIRS_TABLE = 'token_pairs'
FAILED_TOKENIZE_TABLE = 'failed_tokenize'
FUNCTION_EMBEDDINGS_TABLE = 'function_embeddings'
class ReadGithubDataset(bq_transform.BigQueryRead):
"""Read original Github files from BigQuery.
This utility transform reads Python files
from the BigQuery public Github dump that are smaller
than 15,000 bytes, contain at least one
function definition, and whose repository has been
watched at least twice since 2017.
NOTE: Make sure to modify the `self.limit` property
as desired.
"""
@property
def limit(self):
# return 500
return None
@property
def query_string(self):
query = """
SELECT
MAX(CONCAT(f.repo_name, ' ', f.path)) AS repo_path,
c.content
FROM
`bigquery-public-data.github_repos.files` AS f
JOIN
`bigquery-public-data.github_repos.contents` AS c
ON
f.id = c.id
JOIN (
--this part of the query makes sure repo is watched at least twice since 2017
SELECT
repo
FROM (
SELECT
repo.name AS repo
FROM
`githubarchive.year.2017`
WHERE
type="WatchEvent"
UNION ALL
SELECT
repo.name AS repo
FROM
`githubarchive.month.2018*`
WHERE
type="WatchEvent" )
GROUP BY
1
HAVING
COUNT(*) >= 2 ) AS r
ON
f.repo_name = r.repo
WHERE
f.path LIKE '%.py' AND --with python extension
c.size < 15000 AND --get rid of ridiculously long files
REGEXP_CONTAINS(c.content, r'def ') --contains function definition
GROUP BY
c.content
"""
if self.limit:
query += '\nLIMIT {}'.format(self.limit)
return query
class WriteFailedTokenizedData(bq_transform.BigQueryWrite):
@property
def column_list(self):
return [
('nwo', 'STRING'),
('path', 'STRING'),
('content', 'STRING')
]
class WriteTokenizedData(bq_transform.BigQueryWrite):
@property
def column_list(self):
return [
('nwo', 'STRING'),
('path', 'STRING'),
('function_name', 'STRING'),
('lineno', 'STRING'),
('original_function', 'STRING'),
('function_tokens', 'STRING'),
('docstring_tokens', 'STRING'),
]
class ReadTransformedGithubDataset(bq_transform.BigQueryRead):
def __init__(self, project, dataset=None, table=PAIRS_TABLE):
super(ReadTransformedGithubDataset, self).__init__(project, dataset=dataset, table=table)
@property
def limit(self):
# return 500
return None
@property
def query_string(self):
query = """
SELECT
nwo, path, function_name, lineno, original_function, function_tokens, docstring_tokens
FROM
{}.{}
""".format(self.dataset, self.table)
if self.limit:
query += '\nLIMIT {}'.format(self.limit)
return query
class WriteGithubFunctionEmbeddings(bq_transform.BigQueryWrite):
def __init__(self, project, dataset, table=FUNCTION_EMBEDDINGS_TABLE, batch_size=500,
write_disposition=bigquery.BigQueryDisposition.WRITE_TRUNCATE):
super(WriteGithubFunctionEmbeddings, self).__init__(project, dataset, table,
batch_size=batch_size,
write_disposition=write_disposition)
@property
def column_list(self):
return [
('nwo', 'STRING'),
('path', 'STRING'),
('function_name', 'STRING'),
('lineno', 'STRING'),
('original_function', 'STRING'),
('function_embedding', 'STRING')
]

View File

@ -0,0 +1,61 @@
import apache_beam as beam
import code_search.dataflow.do_fns.github_dataset as gh_do_fns
import code_search.dataflow.transforms.github_bigquery as gh_bq
class TransformGithubDataset(beam.PTransform):
"""Transform the BigQuery Github Dataset.
This Beam PTransform tokenizes functions and docstrings in
Python files from the Github dataset and writes the results
into a new BigQuery dataset for further processing.
All pairs whose docstrings have `self.min_docstring_tokens`
tokens or fewer are filtered out.
This transform creates the following tables in the `target_dataset`,
whose names are configurable via constructor arguments.
- `self.failed_tokenize_table`
- `self.pairs_table`
"""
def __init__(self, project, target_dataset,
pairs_table=gh_bq.PAIRS_TABLE,
failed_tokenize_table=gh_bq.FAILED_TOKENIZE_TABLE):
super(TransformGithubDataset, self).__init__()
self.project = project
self.target_dataset = target_dataset
self.pairs_table = pairs_table
self.failed_tokenize_table = failed_tokenize_table
@property
def min_docstring_tokens(self):
return 5
def expand(self, input_or_inputs):
tokenize_result = (input_or_inputs
| "Split 'repo_path'" >> beam.ParDo(gh_do_fns.SplitRepoPath())
| "Tokenize Code/Docstring Pairs" >> beam.ParDo(
gh_do_fns.TokenizeFunctionDocstrings()).with_outputs('err', main='rows')
)
pairs, tokenize_errors = tokenize_result.rows, tokenize_result.err
(tokenize_errors # pylint: disable=expression-not-assigned
| "Failed Tokenization" >> gh_bq.WriteFailedTokenizedData(self.project, self.target_dataset,
self.failed_tokenize_table)
)
flat_rows = (pairs
| "Flatten Rows" >> beam.FlatMap(lambda x: x)
| "Filter Tiny Docstrings" >> beam.Filter(
lambda row: len(row['docstring_tokens'].split(' ')) > self.min_docstring_tokens)
)
(flat_rows # pylint: disable=expression-not-assigned
| "Save Tokens" >> gh_bq.WriteTokenizedData(self.project, self.target_dataset,
self.pairs_table)
)
return flat_rows

View File

@ -0,0 +1,80 @@
import ast
import astor
import nltk.tokenize as tokenize
import spacy
def tokenize_docstring(text):
"""Tokenize docstrings.
Args:
text: A docstring to be tokenized.
Returns:
A list of strings representing the tokens in the docstring.
"""
en = spacy.load('en')
tokens = en.tokenizer(text.decode('utf8'))
return [token.text.lower() for token in tokens if not token.is_space]
def tokenize_code(text):
"""Tokenize code strings.
This applies a simple regular-expression tokenizer that
keeps runs of word characters and drops whitespace and punctuation.
Args:
text: A code string to be tokenized.
Returns:
A list of strings representing the tokens in the code.
"""
return tokenize.RegexpTokenizer(r'\w+').tokenize(text)
def get_function_docstring_pairs(blob):
"""Extract (function/method, docstring) pairs from a given code blob.
This method reads a string representing a Python file, builds an
abstract syntax tree (AST) and returns a list of Docstring and Function
pairs along with supporting metadata.
Args:
blob: A string representing the Python file contents.
Returns:
A list of tuples of the form:
[
(
function_name,
lineno,
original_function,
function_tokens,
docstring_tokens
),
...
]
"""
pairs = []
try:
module = ast.parse(blob)
classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
for _class in classes:
functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)])
for f in functions:
source = astor.to_source(f)
docstring = ast.get_docstring(f) if ast.get_docstring(f) else ''
func = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source
pair_tuple = (
f.name.decode('utf-8'),
str(f.lineno).decode('utf-8'),
source.decode('utf-8'),
' '.join(tokenize_code(func)).decode('utf-8'),
' '.join(tokenize_docstring(docstring.split('\n\n')[0])).decode('utf-8'),
)
pairs.append(pair_tuple)
except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):
pass
return pairs
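A minimal sketch of the extraction on a tiny blob, assuming the spaCy `en` model is installed; the expected values in the comments are indicative, not an exact transcript:
# Illustrative only: extract (function, docstring) metadata from a small module.
blob = '\n'.join([
    'def add(a, b):',
    '    """Add two numbers."""',
    '    return a + b',
])
pairs = get_function_docstring_pairs(blob)
# pairs[0] is a 5-tuple of unicode strings, roughly:
# (u'add', u'1', <original source>, u'def add a b return a b', u'add two numbers .')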

View File

@ -1,3 +0,0 @@
from code_search.do_fns.github_files import ExtractFuncInfo
from code_search.do_fns.github_files import TokenizeCodeDocstring
from code_search.do_fns.github_files import SplitRepoPath

View File

@ -1,76 +0,0 @@
"""Beam DoFns for prediction related tasks"""
import io
import csv
from cStringIO import StringIO
import apache_beam as beam
from code_search.transforms.process_github_files import ProcessGithubFiles
from code_search.t2t.query import get_encoder, encode_query
class GithubCSVToDict(beam.DoFn):
"""Split a text row and convert into a dict."""
def process(self, element): # pylint: disable=no-self-use
element = element.encode('utf-8')
row = StringIO(element)
reader = csv.reader(row, delimiter=',')
keys = ProcessGithubFiles.get_key_list()
values = next(reader) # pylint: disable=stop-iteration-return
result = dict(zip(keys, values))
yield result
class GithubDictToCSV(beam.DoFn):
"""Convert dictionary to writable CSV string."""
def process(self, element): # pylint: disable=no-self-use
element['function_embedding'] = ','.join(str(val) for val in element['function_embedding'])
target_keys = ['nwo', 'path', 'function_name', 'function_embedding']
target_values = [element[key].encode('utf-8') for key in target_keys]
with io.BytesIO() as fs:
cw = csv.writer(fs)
cw.writerow(target_values)
result_str = fs.getvalue().strip('\r\n')
return result_str
class EncodeExample(beam.DoFn):
"""Encode string to integer tokens.
This is needed so that the data can be sent in
for prediction
"""
def __init__(self, problem, data_dir):
super(EncodeExample, self).__init__()
self.problem = problem
self.data_dir = data_dir
def process(self, element):
encoder = get_encoder(self.problem, self.data_dir)
encoded_function = encode_query(encoder, element['function_tokens'])
element['instances'] = [{'input': {'b64': encoded_function}}]
yield element
class ProcessPrediction(beam.DoFn):
"""Process results from PredictionDoFn.
This class processes predictions from another
DoFn, to make sure it is a correctly formatted dict.
"""
def process(self, element): # pylint: disable=no-self-use
element['function_embedding'] = ','.join([
str(val) for val in element['predictions'][0]['outputs']
])
element.pop('function_tokens')
element.pop('instances')
element.pop('predictions')
yield element

View File

@ -1,79 +0,0 @@
"""Beam DoFns for Github related tasks"""
import time
import logging
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.metrics import Metrics
class SplitRepoPath(beam.DoFn):
# pylint: disable=abstract-method
"""Split the space-delimited file `repo_path` into owner repository (`nwo`)
and file path (`path`)"""
def process(self, element): # pylint: disable=no-self-use
nwo, path = element.pop('repo_path').split(' ', 1)
element['nwo'] = nwo
element['path'] = path
yield element
class TokenizeCodeDocstring(beam.DoFn):
# pylint: disable=abstract-method
"""Compute code/docstring pairs from incoming BigQuery row dict"""
def __init__(self):
super(TokenizeCodeDocstring, self).__init__()
self.tokenization_time_ms = Metrics.counter(self.__class__, 'tokenization_time_ms')
def process(self, element): # pylint: disable=no-self-use
try:
import code_search.utils as utils
start_time = time.time()
element['pairs'] = utils.get_function_docstring_pairs(element.pop('content'))
self.tokenization_time_ms.inc(int((time.time() - start_time) * 1000.0))
yield element
except Exception as e: #pylint: disable=broad-except
logging.warning('Tokenization failed, %s', e.message)
yield pvalue.TaggedOutput('err_rows', element)
class ExtractFuncInfo(beam.DoFn):
# pylint: disable=abstract-method
"""Convert pair tuples to dict.
This takes a list of values from `TokenizeCodeDocstring`
and converts into a dictionary so that values can be
indexed by names instead of indices. `info_keys` is the
list of names of those values in order which will become
the keys of each new dict.
"""
def __init__(self, info_keys):
super(ExtractFuncInfo, self).__init__()
self.info_keys = info_keys
def process(self, element):
try:
info_rows = [dict(zip(self.info_keys, pair)) for pair in element.pop('pairs')]
info_rows = [self.merge_two_dicts(info_dict, element) for info_dict in info_rows]
info_rows = map(self.dict_to_unicode, info_rows)
yield info_rows
except Exception as e: #pylint: disable=broad-except
logging.warning('Function Info extraction failed, %s', e.message)
yield pvalue.TaggedOutput('err_rows', element)
@staticmethod
def merge_two_dicts(dict_a, dict_b):
result = dict_a.copy()
result.update(dict_b)
return result
@staticmethod
def dict_to_unicode(data_dict):
for k, v in data_dict.items():
if isinstance(v, str):
data_dict[k] = v.decode('utf-8', 'ignore')
return data_dict

View File

@ -1,54 +0,0 @@
import apache_beam as beam
import kubeflow_batch_predict.dataflow.batch_prediction as batch_prediction
import code_search.do_fns.embeddings as embeddings
import code_search.transforms.github_bigquery as github_bigquery
class GithubBatchPredict(beam.PTransform):
"""Batch Prediction for Github dataset"""
def __init__(self, project, problem, data_dir, saved_model_dir):
super(GithubBatchPredict, self).__init__()
self.project = project
self.problem = problem
self.data_dir = data_dir
self.saved_model_dir = saved_model_dir
##
# Target dataset and table to store prediction outputs.
# Non-configurable for now.
#
self.index_dataset = 'code_search'
self.index_table = 'search_index'
self.batch_size = 100
def expand(self, input_or_inputs):
rows = (input_or_inputs
| "Read Processed Github Dataset" >> github_bigquery.ReadProcessedGithubData(self.project)
)
batch_predict = (rows
| "Prepare Encoded Input" >> beam.ParDo(embeddings.EncodeExample(self.problem,
self.data_dir))
| "Execute Predictions" >> beam.ParDo(batch_prediction.PredictionDoFn(),
self.saved_model_dir).with_outputs("errors",
main="main")
)
predictions = batch_predict.main
formatted_predictions = (predictions
| "Process Predictions" >> beam.ParDo(embeddings.ProcessPrediction())
)
(formatted_predictions # pylint: disable=expression-not-assigned
| "Save Index Data" >> github_bigquery.WriteGithubIndexData(self.project,
self.index_dataset,
self.index_table,
batch_size=self.batch_size)
)
return formatted_predictions

View File

@ -1,87 +0,0 @@
import code_search.transforms.bigquery as bigquery
class ReadOriginalGithubPythonData(bigquery.BigQueryRead):
@property
def limit(self):
return None
@property
def query_string(self):
query = """
SELECT
MAX(CONCAT(f.repo_name, ' ', f.path)) AS repo_path,
c.content
FROM
`bigquery-public-data.github_repos.files` AS f
JOIN
`bigquery-public-data.github_repos.contents` AS c
ON
f.id = c.id
JOIN (
--this part of the query makes sure repo is watched at least twice since 2017
SELECT
repo
FROM (
SELECT
repo.name AS repo
FROM
`githubarchive.year.2017`
WHERE
type="WatchEvent"
UNION ALL
SELECT
repo.name AS repo
FROM
`githubarchive.month.2018*`
WHERE
type="WatchEvent" )
GROUP BY
1
HAVING
COUNT(*) >= 2 ) AS r
ON
f.repo_name = r.repo
WHERE
f.path LIKE '%.py' AND --with python extension
c.size < 15000 AND --get rid of ridiculously long files
REGEXP_CONTAINS(c.content, r'def ') --contains function definition
GROUP BY
c.content
"""
if self.limit:
query += '\nLIMIT {}'.format(self.limit)
return query
class ReadProcessedGithubData(bigquery.BigQueryRead):
@property
def limit(self):
return 100
@property
def query_string(self):
query = """
SELECT
nwo, path, function_name, lineno, original_function, function_tokens
FROM
code_search.function_docstrings
"""
if self.limit:
query += '\nLIMIT {}'.format(self.limit)
return query
class WriteGithubIndexData(bigquery.BigQueryWrite):
@property
def column_list(self):
return [
('nwo', 'STRING'),
('path', 'STRING'),
('function_name', 'STRING'),
('lineno', 'INTEGER'),
('original_function', 'STRING'),
('function_embedding', 'STRING')
]

View File

@ -1,133 +0,0 @@
import io
import csv
import apache_beam as beam
import apache_beam.io.gcp.internal.clients as clients
import code_search.do_fns as do_fns
class ProcessGithubFiles(beam.PTransform):
# pylint: disable=too-many-instance-attributes
"""A collection of `DoFn`s for Pipeline Transform. Reads the Github dataset from BigQuery
and writes back the processed code-docstring pairs in a query-friendly format back to BigQuery
table.
"""
data_columns = ['nwo', 'path', 'function_name', 'lineno', 'original_function',
'function_tokens', 'docstring_tokens']
data_types = ['STRING', 'STRING', 'STRING', 'INTEGER', 'STRING', 'STRING', 'STRING']
def __init__(self, project, query_string, output_string, storage_bucket):
super(ProcessGithubFiles, self).__init__()
self.project = project
self.query_string = query_string
self.output_dataset, self.output_table = output_string.split(':')
self.storage_bucket = storage_bucket
self.num_shards = 10
def expand(self, input_or_inputs):
tokenize_result = (input_or_inputs
| "Read Github Dataset" >> beam.io.Read(beam.io.BigQuerySource(query=self.query_string,
use_standard_sql=True))
| "Split 'repo_path'" >> beam.ParDo(do_fns.SplitRepoPath())
| "Tokenize Code/Docstring Pairs" >> beam.ParDo(do_fns.TokenizeCodeDocstring())
.with_outputs('err_rows', main='rows')
)
#pylint: disable=expression-not-assigned
(tokenize_result.err_rows
| "Failed Row Tokenization" >> beam.io.WriteToBigQuery(project=self.project,
dataset=self.output_dataset,
table=self.output_table + '_failed',
schema=self.create_failed_output_schema())
)
# pylint: enable=expression-not-assigned
info_result = (tokenize_result.rows
| "Extract Function Info" >> beam.ParDo(do_fns.ExtractFuncInfo(self.data_columns[2:]))
.with_outputs('err_rows', main='rows')
)
#pylint: disable=expression-not-assigned
(info_result.err_rows
| "Failed Function Info" >> beam.io.WriteToBigQuery(project=self.project,
dataset=self.output_dataset,
table=self.output_table + '_failed',
schema=self.create_failed_output_schema())
)
# pylint: enable=expression-not-assigned
processed_rows = (info_result.rows | "Flatten Rows" >> beam.FlatMap(lambda x: x))
# pylint: disable=expression-not-assigned
(processed_rows
| "Filter Tiny Docstrings" >> beam.Filter(
lambda row: len(row['docstring_tokens'].split(' ')) > 5)
| "Format For Write" >> beam.Map(self.format_for_write)
| "Write To File" >> beam.io.WriteToText('{}/data/pairs'.format(self.storage_bucket),
file_name_suffix='.csv',
num_shards=self.num_shards))
# pylint: enable=expression-not-assigned
return (processed_rows
| "Save Tokens" >> beam.io.WriteToBigQuery(project=self.project,
dataset=self.output_dataset,
table=self.output_table,
schema=self.create_output_schema())
)
@staticmethod
def get_key_list():
filter_keys = [
'original_function',
'lineno',
]
key_list = [col for col in ProcessGithubFiles.data_columns
if col not in filter_keys]
return key_list
def format_for_write(self, row):
"""This method filters keys that we don't need in the
final CSV. It must ensure that there are no multi-line
column fields. For instance, 'original_function' is a
multi-line string and makes CSV parsing hard for any
derived Dataflow steps. This uses the CSV Writer
to handle all edge cases like quote escaping."""
target_keys = self.get_key_list()
target_values = [row[key].encode('utf-8') for key in target_keys]
with io.BytesIO() as fs:
cw = csv.writer(fs)
cw.writerow(target_values)
result_str = fs.getvalue().strip('\r\n')
return result_str
def create_output_schema(self):
table_schema = clients.bigquery.TableSchema()
for column, data_type in zip(self.data_columns, self.data_types):
field_schema = clients.bigquery.TableFieldSchema()
field_schema.name = column
field_schema.type = data_type
field_schema.mode = 'nullable'
table_schema.fields.append(field_schema)
return table_schema
def create_failed_output_schema(self):
table_schema = clients.bigquery.TableSchema()
for column, data_type in zip(self.data_columns[:2] + ['content'],
self.data_types[:2] + ['STRING']):
field_schema = clients.bigquery.TableFieldSchema()
field_schema.name = column
field_schema.type = data_type
field_schema.mode = 'nullable'
table_schema.fields.append(field_schema)
return table_schema

View File

@ -1,38 +0,0 @@
import ast
import astor
import nltk.tokenize as tokenize
import spacy
def tokenize_docstring(text):
"""Apply tokenization using spacy to docstrings."""
en = spacy.load('en')
tokens = en.tokenizer(text.decode('utf8', 'ignore'))
return [token.text.lower() for token in tokens if not token.is_space]
def tokenize_code(text):
"""A very basic procedure for tokenizing code strings."""
return tokenize.RegexpTokenizer(r'\w+').tokenize(text)
def get_function_docstring_pairs(blob):
"""Extract (function/method, docstring) pairs from a given code blob."""
pairs = []
try:
module = ast.parse(blob)
classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
for _class in classes:
functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)])
for f in functions:
source = astor.to_source(f)
docstring = ast.get_docstring(f) if ast.get_docstring(f) else ''
func = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source
pairs.append((f.name, f.lineno, source, ' '.join(tokenize_code(func)),
' '.join(tokenize_docstring(docstring.split('\n\n')[0]))))
except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):
pass
return pairs

View File

@ -1,39 +0,0 @@
SELECT
MAX(CONCAT(f.repo_name, ' ', f.path)) AS repo_path,
c.content
FROM
`bigquery-public-data.github_repos.files` AS f
JOIN
`bigquery-public-data.github_repos.contents` AS c
ON
f.id = c.id
JOIN (
--this part of the query makes sure repo is watched at least twice since 2017
SELECT
repo
FROM (
SELECT
repo.name AS repo
FROM
`githubarchive.year.2017`
WHERE
type="WatchEvent"
UNION ALL
SELECT
repo.name AS repo
FROM
`githubarchive.month.2018*`
WHERE
type="WatchEvent" )
GROUP BY
1
HAVING
COUNT(*) >= 2 ) AS r
ON
f.repo_name = r.repo
WHERE
f.path LIKE '%.py' AND --with python extension
c.size < 15000 AND --get rid of ridiculously long files
REGEXP_CONTAINS(c.content, r'def ') --contains function definition
GROUP BY
c.content

View File

@ -60,8 +60,6 @@ setup(name='code-search',
},
entry_points={
'console_scripts': [
'code-search-preprocess=code_search.cli:create_github_pipeline',
'code-search-predict=code_search.cli:create_batch_predict_pipeline',
'nmslib-serve=code_search.nmslib.cli:server',
'nmslib-create=code_search.nmslib.cli:creator',
]