Semantic Code Search Example Data Ingestion (#120)

* Code Search Preprocessing Pipeline

* Add missing pipeline execution to git tree

* Move the preprocessing step into its own package

* Add docstrings

* Fix pylint errors
Authored by Sanyam Kapoor on 2018-05-31 15:28:56 -07:00; committed by k8s-ci-robot
parent 174d6602ac
commit 26ff66d747
10 changed files with 458 additions and 0 deletions

code_search/preprocess/.gitignore

@@ -0,0 +1,108 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# Virtual Environments
venv/

code_search/preprocess/MANIFEST.in

@@ -0,0 +1,2 @@
include README.md requirements.txt
include files/*

code_search/preprocess/README.md

@@ -0,0 +1,51 @@
# Semantic Code Search
Pre-processing Pipeline package for End-to-End Semantic Code Search on Kubeflow
## Prerequisites
* Python 2.7 (with `pip`)
* Python `virtualenv`
**NOTE**: This package uses Google Cloud Dataflow, which only supports Python 2.7.
## Setup
* Set up a Python virtual environment
```
$ virtualenv venv
$ source venv/bin/activate
```
* Install the [`gcloud`](https://cloud.google.com/sdk/gcloud/) CLI
* Set up Application Default Credentials
```
$ gcloud auth application-default login
```
* Enable the Dataflow API via the command line (or the Google Cloud Console)
```
$ gcloud services enable dataflow.googleapis.com
```
* Build and install the package
```
$ python setup.py build install
```
## Execution
Submit a `Dataflow` job using the following command:
```
$ python scripts/process_github_archive.py -i files/select_github_archive.sql -o code_search:function_docstrings \
    -p kubeflow-dev -j process-github-archive --storage-bucket gs://kubeflow-dev
```
**NOTE**: Make sure the project and the Google Cloud Storage bucket have been created before submitting the job.
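Once the job finishes, the output table can be spot-checked, for example with the `bq` CLI (dataset and table names as in the command above):
```
$ bq query --use_legacy_sql=false \
    'SELECT nwo, path, function_name FROM `code_search.function_docstrings` LIMIT 5'
```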
## Acknowledgements
This project derives from [hamelsmu/code_search](https://github.com/hamelsmu/code_search).

code_search/preprocess/files/select_github_archive.sql

@@ -0,0 +1,42 @@
SELECT
MAX(CONCAT(f.repo_name, ' ', f.path)) AS repo_path,
c.content
FROM
`bigquery-public-data.github_repos.files` AS f
JOIN
`bigquery-public-data.github_repos.contents` AS c
ON
f.id = c.id
JOIN (
--this part of the query makes sure repo is watched at least twice since 2017
SELECT
repo
FROM (
SELECT
repo.name AS repo
FROM
`githubarchive.year.2017`
WHERE
type="WatchEvent"
UNION ALL
SELECT
repo.name AS repo
FROM
`githubarchive.month.2018*`
WHERE
type="WatchEvent" )
GROUP BY
1
HAVING
COUNT(*) >= 2 ) AS r
ON
f.repo_name = r.repo
WHERE
f.path LIKE '%.py' AND --with python extension
c.size < 15000 AND --get rid of ridiculously long files
REGEXP_CONTAINS(c.content, r'def ') --contains function definition
GROUP BY
c.content
-- for development purposes only
LIMIT
1000000
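
Before launching the Dataflow job, the query above can be validated and its scan cost estimated with a BigQuery dry run. A minimal sketch using the `google-cloud-bigquery` client (not listed in `requirements.txt`; the project ID and SQL path follow the README example):

```python
# Sketch: dry-run the extraction query to validate it and estimate bytes scanned.
from google.cloud import bigquery

client = bigquery.Client(project='kubeflow-dev')  # project ID from the README example

with open('files/select_github_archive.sql', 'r') as f:
    query = f.read()

job_config = bigquery.QueryJobConfig()
job_config.dry_run = True
job_config.use_legacy_sql = False

job = client.query(query, job_config=job_config)
print('Query would process {} bytes'.format(job.total_bytes_processed))
```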

code_search/preprocess/preprocess/pipeline.py

@@ -0,0 +1,117 @@
import os
import apache_beam as beam
import apache_beam.io as io
from apache_beam.options.pipeline_options import StandardOptions, PipelineOptions, \
GoogleCloudOptions, SetupOptions
from apache_beam.io.gcp.internal.clients import bigquery
def create_pipeline_opts(args):
"""Create standard Pipeline Options for Google Cloud Dataflow"""
options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DataflowRunner'
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = args.project
google_cloud_options.job_name = args.job_name
google_cloud_options.temp_location = '{}/{}/temp'.format(args.storage_bucket, args.job_name)
google_cloud_options.staging_location = '{}/{}/staging'.format(args.storage_bucket, args.job_name)
# Point to `setup.py` to allow Dataflow runner to install the package
options.view_as(SetupOptions).setup_file = os.path.join(
os.path.dirname(os.path.dirname(__file__)), 'setup.py')
return options
class SplitRepoPath(beam.DoFn):
# pylint: disable=abstract-method
"""Split the space-delimited file `repo_path` into owner repository (`nwo`)
and file path (`path`)"""
def process(self, element, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
nwo, path = element.pop('repo_path').split(' ', 1)
element['nwo'] = nwo
element['path'] = path
yield element
class TokenizeCodeDocstring(beam.DoFn):
# pylint: disable=abstract-method
"""Compute code/docstring pairs from incoming BigQuery row dict"""
def process(self, element, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
from preprocess.tokenizer import get_function_docstring_pairs
element['pairs'] = get_function_docstring_pairs(element.pop('content'))
yield element
class ExtractFuncInfo(beam.DoFn):
# pylint: disable=abstract-method
"""Convert pair tuples from `TokenizeCodeDocstring` into dict containing query-friendly keys"""
def __init__(self, info_keys):
super(ExtractFuncInfo, self).__init__()
self.info_keys = info_keys
def process(self, element, *args, **kwargs): # pylint: disable=unused-argument
info_rows = [dict(zip(self.info_keys, pair)) for pair in element.pop('pairs')]
info_rows = [self.merge_two_dicts(info_dict, element) for info_dict in info_rows]
info_rows = map(self.dict_to_unicode, info_rows)
yield info_rows
@staticmethod
def merge_two_dicts(dict_a, dict_b):
result = dict_a.copy()
result.update(dict_b)
return result
@staticmethod
def dict_to_unicode(data_dict):
for k, v in data_dict.items():
if isinstance(v, str):
data_dict[k] = v.encode('utf-8', 'ignore')
return data_dict
class BigQueryGithubFiles(beam.PTransform):
"""A collection of `DoFn`s for Pipeline Transform. Reads the Github dataset from BigQuery
and writes back the processed code-docstring pairs in a query-friendly format back to BigQuery
table.
"""
def __init__(self, project, query_string, output_string):
super(BigQueryGithubFiles, self).__init__()
self.project = project
self.query_string = query_string
self.output_dataset, self.output_table = output_string.split(':')
self.data_columns = ['nwo', 'path', 'function_name', 'lineno', 'original_function',
'function_tokens', 'docstring_tokens']
self.data_types = ['STRING', 'STRING', 'STRING', 'INTEGER', 'STRING', 'STRING', 'STRING']
def expand(self, input_or_inputs):
return (input_or_inputs
| "Read BigQuery Rows" >> io.Read(io.BigQuerySource(query=self.query_string,
use_standard_sql=True))
| "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
| "Tokenize Code/Docstring Pairs" >> beam.ParDo(TokenizeCodeDocstring())
| "Extract Function Info" >> beam.ParDo(ExtractFuncInfo(self.data_columns[2:]))
| "Flatten Rows" >> beam.FlatMap(lambda x: x)
| "Write to BigQuery" >> io.WriteToBigQuery(project=self.project,
dataset=self.output_dataset,
table=self.output_table,
schema=self.create_output_schema())
)
def create_output_schema(self):
table_schema = bigquery.TableSchema()
for column, data_type in zip(self.data_columns, self.data_types):
field_schema = bigquery.TableFieldSchema()
field_schema.name = column
field_schema.type = data_type
field_schema.mode = 'nullable'
table_schema.fields.append(field_schema)
return table_schema
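
For a quick local sanity check, the `DoFn`s above can be exercised directly on a single hand-built row outside of Dataflow. A minimal sketch (the sample `repo_path` and `content` values are made up; it assumes the package is installed and the spaCy `en` model has been downloaded, e.g. via `python setup.py build install`):

```python
from __future__ import print_function
from preprocess.pipeline import SplitRepoPath, TokenizeCodeDocstring, ExtractFuncInfo

# A made-up row shaped like the output of the BigQuery extraction query.
row = {
    'repo_path': 'kubeflow/examples demo/example.py',
    'content': 'def add(a, b):\n    """Add two numbers."""\n    return a + b\n',
}

row = next(SplitRepoPath().process(row))          # pops 'repo_path', adds 'nwo' and 'path'
row = next(TokenizeCodeDocstring().process(row))  # pops 'content', adds 'pairs'

# Same keys as BigQueryGithubFiles.data_columns[2:]
info_keys = ['function_name', 'lineno', 'original_function',
             'function_tokens', 'docstring_tokens']
rows = next(ExtractFuncInfo(info_keys).process(row))  # one dict per extracted function

print(rows[0]['function_name'])     # add
print(rows[0]['docstring_tokens'])  # add two numbers .
```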

code_search/preprocess/preprocess/tokenizer.py

@@ -0,0 +1,38 @@
import ast
import astor
import spacy
from nltk.tokenize import RegexpTokenizer
def tokenize_docstring(text):
"""Apply tokenization using spacy to docstrings."""
en = spacy.load('en')
tokens = en.tokenizer(text.decode('utf8'))
return [token.text.lower() for token in tokens if not token.is_space]
def tokenize_code(text):
"""A very basic procedure for tokenizing code strings."""
return RegexpTokenizer(r'\w+').tokenize(text)
def get_function_docstring_pairs(blob):
"""Extract (function/method, docstring) pairs from a given code blob."""
pairs = []
try:
module = ast.parse(blob)
classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
for _class in classes:
functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)])
for f in functions:
source = astor.to_source(f)
docstring = ast.get_docstring(f) if ast.get_docstring(f) else ''
func = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source
pairs.append((f.name, f.lineno, source, ' '.join(tokenize_code(func)),
' '.join(tokenize_docstring(docstring.split('\n\n')[0]))))
except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):
pass
return pairs
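
A small, made-up blob illustrates the shape of the pairs returned by `get_function_docstring_pairs` (assumes the spaCy `en` model is available):

```python
from __future__ import print_function
from preprocess.tokenizer import get_function_docstring_pairs

# A made-up code blob with one method on a class.
blob = (
    'class Greeter(object):\n'
    '    def greet(self, name):\n'
    '        """Say hello.\n'
    '\n'
    '        Only this first paragraph ends up in docstring_tokens.\n'
    '        """\n'
    '        return "Hello, " + name\n'
)

# Each pair is (function_name, lineno, original_function, function_tokens, docstring_tokens).
for name, lineno, _source, _func_tokens, doc_tokens in get_function_docstring_pairs(blob):
    print(name, lineno, doc_tokens)  # greet 2 say hello .
```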

code_search/preprocess/requirements.txt

@@ -0,0 +1,4 @@
astor~=0.6.0
apache-beam[gcp]~=2.4.0
nltk~=3.3.0
spacy~=2.0.0

code_search/preprocess/scripts/process_github_archive.py

@@ -0,0 +1,36 @@
from __future__ import print_function
import argparse
import apache_beam as beam
from preprocess.pipeline import create_pipeline_opts, BigQueryGithubFiles
def parse_arguments(args):
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--input', metavar='', help='Path to BigQuery SQL script')
parser.add_argument('-o', '--output', metavar='',
help='Output string of the format <dataset>:<table>')
parser.add_argument('-p', '--project', metavar='', default='Project', help='Project ID')
parser.add_argument('-j', '--job-name', metavar='', default='Beam Job', help='Job name')
parser.add_argument('--storage-bucket', metavar='', default='gs://bucket',
help='Path to Google Storage Bucket')
parsed_args = parser.parse_args(args)
return parsed_args
def main(args):
args = parse_arguments(args)
pipeline_opts = create_pipeline_opts(args)
with open(args.input, 'r') as f:
query_string = f.read()
pipeline = beam.Pipeline(options=pipeline_opts)
(pipeline | BigQueryGithubFiles(args.project, query_string, args.output)) #pylint: disable=expression-not-assigned
pipeline.run()
if __name__ == '__main__':
import sys
main(sys.argv[1:])

code_search/preprocess/setup.py

@@ -0,0 +1,60 @@
from __future__ import print_function
import subprocess
from distutils.command.build import build as distutils_build #pylint: disable=no-name-in-module
from setuptools import setup, find_packages, Command as SetupToolsCommand
with open('requirements.txt', 'r') as f:
install_requires = f.readlines()
with open('README.md') as f:
README = f.read()
VERSION = '0.1.0'
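# Extra commands run as part of `build` so that workers installing this
# package (e.g. on Dataflow) also fetch the spaCy English model used by the tokenizer.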
CUSTOM_COMMANDS = [
['python', '-m', 'spacy', 'download', 'en']
]
class Build(distutils_build):
sub_commands = distutils_build.sub_commands + [('CustomCommands', None)]
class CustomCommands(SetupToolsCommand):
def initialize_options(self):
pass
def finalize_options(self):
pass
@staticmethod
def run_custom_command(command_list):
print('Running command: %s' % command_list)
p = subprocess.Popen(command_list, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout_data, _ = p.communicate()
print('Command output: %s' % stdout_data)
if p.returncode != 0:
raise RuntimeError('Command %s failed: exit code: %s' % (command_list, p.returncode))
def run(self):
for command in CUSTOM_COMMANDS:
self.run_custom_command(command)
setup(name='kubeflow-code-search',
description='Kubeflow Code Search Demo',
long_description=README,
long_description_content_type='text/markdown',
url='https://www.github.com/kubeflow/examples',
author='Sanyam Kapoor',
author_email='sanyamkapoor@google.com',
version=VERSION,
license='MIT',
packages=find_packages(),
install_requires=install_requires,
extras_require={},
cmdclass={
'build': Build,
'CustomCommands': CustomCommands,
})