mirror of https://github.com/kubeflow/examples.git
Semantic Code Search Example Data Ingestion (#120)
* Code Search Preprocessing Pipeline
* Add missing pipeline execution to git tree
* Move the preprocessing step into its own package
* Add docstrings
* Fix pylint errors
This commit is contained in:
parent 174d6602ac
commit 26ff66d747
@@ -0,0 +1,108 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Virtual Environments
venv/
@@ -0,0 +1,2 @@
include README.md requirements.txt
include files/*
@@ -0,0 +1,51 @@
# Semantic Code Search

Pre-processing pipeline package for End-to-End Semantic Code Search on Kubeflow.

## Prerequisites

* Python 2.7 (with `pip`)
* Python `virtualenv`

**NOTE**: This package uses Google Cloud Dataflow, which only supports Python 2.7.

## Setup

* Set up a Python virtual environment
```
$ virtualenv venv
$ source venv/bin/activate
```

* Install the [`gcloud`](https://cloud.google.com/sdk/gcloud/) CLI

* Set up Application Default Credentials
```
$ gcloud auth application-default login
```

* Enable Dataflow via the command line (or use the Google Cloud Console)
```
$ gcloud services enable dataflow.googleapis.com
```

* Build and install the package
```
$ python setup.py build install
```

## Execution

Submit a `Dataflow` job using the following command:

```
$ python scripts/process_github_archive.py -i files/select_github_archive.sql -o code_search:function_docstrings -p kubeflow-dev \
      -j process-github-archive --storage-bucket gs://kubeflow-dev
```

**NOTE**: Make sure the project and the Google Cloud Storage bucket have been created first; see the example below.
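
The exact commands depend on your environment, but as a minimal sketch using the Cloud SDK (the project, bucket, and dataset names simply mirror the example command above):

```
$ gsutil mb -p kubeflow-dev gs://kubeflow-dev
$ bq --project_id=kubeflow-dev mk code_search
```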

## Acknowledgements

This project derives from [hamelsmu/code_search](https://github.com/hamelsmu/code_search).
@@ -0,0 +1,42 @@
SELECT
  MAX(CONCAT(f.repo_name, ' ', f.path)) AS repo_path,
  c.content
FROM
  `bigquery-public-data.github_repos.files` AS f
JOIN
  `bigquery-public-data.github_repos.contents` AS c
ON
  f.id = c.id
JOIN (
  -- this part of the query makes sure repo is watched at least twice since 2017
  SELECT
    repo
  FROM (
    SELECT
      repo.name AS repo
    FROM
      `githubarchive.year.2017`
    WHERE
      type="WatchEvent"
    UNION ALL
    SELECT
      repo.name AS repo
    FROM
      `githubarchive.month.2018*`
    WHERE
      type="WatchEvent" )
  GROUP BY
    1
  HAVING
    COUNT(*) >= 2 ) AS r
ON
  f.repo_name = r.repo
WHERE
  f.path LIKE '%.py' AND                -- with python extension
  c.size < 15000 AND                    -- get rid of ridiculously long files
  REGEXP_CONTAINS(c.content, r'def ')   -- contains function definition
GROUP BY
  c.content
-- for development purposes only
LIMIT
  1000000
@@ -0,0 +1,117 @@
import os

import apache_beam as beam
import apache_beam.io as io
from apache_beam.options.pipeline_options import StandardOptions, PipelineOptions, \
  GoogleCloudOptions, SetupOptions
from apache_beam.io.gcp.internal.clients import bigquery


def create_pipeline_opts(args):
  """Create standard Pipeline Options for Google Cloud Dataflow"""
  options = PipelineOptions()
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = args.project
  google_cloud_options.job_name = args.job_name
  google_cloud_options.temp_location = '{}/{}/temp'.format(args.storage_bucket, args.job_name)
  google_cloud_options.staging_location = '{}/{}/staging'.format(args.storage_bucket, args.job_name)

  # Point to `setup.py` to allow Dataflow runner to install the package
  options.view_as(SetupOptions).setup_file = os.path.join(
    os.path.dirname(os.path.dirname(__file__)), 'setup.py')

  return options


class SplitRepoPath(beam.DoFn):
  # pylint: disable=abstract-method
  """Split the space-delimited file `repo_path` into owner repository (`nwo`)
  and file path (`path`)"""

  def process(self, element, *args, **kwargs):  # pylint: disable=unused-argument,no-self-use
    nwo, path = element.pop('repo_path').split(' ', 1)
    element['nwo'] = nwo
    element['path'] = path
    yield element


class TokenizeCodeDocstring(beam.DoFn):
  # pylint: disable=abstract-method
  """Compute code/docstring pairs from incoming BigQuery row dict"""

  def process(self, element, *args, **kwargs):  # pylint: disable=unused-argument,no-self-use
    from preprocess.tokenizer import get_function_docstring_pairs
    element['pairs'] = get_function_docstring_pairs(element.pop('content'))
    yield element


class ExtractFuncInfo(beam.DoFn):
  # pylint: disable=abstract-method
  """Convert pair tuples from `TokenizeCodeDocstring` into dict containing query-friendly keys"""
  def __init__(self, info_keys):
    super(ExtractFuncInfo, self).__init__()

    self.info_keys = info_keys

  def process(self, element, *args, **kwargs):  # pylint: disable=unused-argument
    info_rows = [dict(zip(self.info_keys, pair)) for pair in element.pop('pairs')]
    info_rows = [self.merge_two_dicts(info_dict, element) for info_dict in info_rows]
    info_rows = map(self.dict_to_unicode, info_rows)
    yield info_rows

  @staticmethod
  def merge_two_dicts(dict_a, dict_b):
    result = dict_a.copy()
    result.update(dict_b)
    return result

  @staticmethod
  def dict_to_unicode(data_dict):
    for k, v in data_dict.items():
      if isinstance(v, str):
        data_dict[k] = v.encode('utf-8', 'ignore')
    return data_dict


class BigQueryGithubFiles(beam.PTransform):
  """A collection of `DoFn`s for Pipeline Transform. Reads the Github dataset from BigQuery
  and writes the processed code-docstring pairs in a query-friendly format back to a BigQuery
  table.
  """
  def __init__(self, project, query_string, output_string):
    super(BigQueryGithubFiles, self).__init__()

    self.project = project
    self.query_string = query_string
    self.output_dataset, self.output_table = output_string.split(':')

    self.data_columns = ['nwo', 'path', 'function_name', 'lineno', 'original_function',
                         'function_tokens', 'docstring_tokens']
    self.data_types = ['STRING', 'STRING', 'STRING', 'INTEGER', 'STRING', 'STRING', 'STRING']

  def expand(self, input_or_inputs):
    return (input_or_inputs
            | "Read BigQuery Rows" >> io.Read(io.BigQuerySource(query=self.query_string,
                                                                use_standard_sql=True))
            | "Split 'repo_path'" >> beam.ParDo(SplitRepoPath())
            | "Tokenize Code/Docstring Pairs" >> beam.ParDo(TokenizeCodeDocstring())
            | "Extract Function Info" >> beam.ParDo(ExtractFuncInfo(self.data_columns[2:]))
            | "Flatten Rows" >> beam.FlatMap(lambda x: x)
            | "Write to BigQuery" >> io.WriteToBigQuery(project=self.project,
                                                        dataset=self.output_dataset,
                                                        table=self.output_table,
                                                        schema=self.create_output_schema())
    )

  def create_output_schema(self):
    table_schema = bigquery.TableSchema()

    for column, data_type in zip(self.data_columns, self.data_types):
      field_schema = bigquery.TableFieldSchema()
      field_schema.name = column
      field_schema.type = data_type
      field_schema.mode = 'nullable'
      table_schema.fields.append(field_schema)

    return table_schema
@@ -0,0 +1,38 @@
import ast
import astor
import spacy
from nltk.tokenize import RegexpTokenizer


def tokenize_docstring(text):
  """Apply tokenization using spacy to docstrings."""
  en = spacy.load('en')
  tokens = en.tokenizer(text.decode('utf8'))
  return [token.text.lower() for token in tokens if not token.is_space]


def tokenize_code(text):
  """A very basic procedure for tokenizing code strings."""
  return RegexpTokenizer(r'\w+').tokenize(text)


def get_function_docstring_pairs(blob):
  """Extract (function/method, docstring) pairs from a given code blob."""
  pairs = []
  try:
    module = ast.parse(blob)
    classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
    functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
    for _class in classes:
      functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)])

    for f in functions:
      source = astor.to_source(f)
      docstring = ast.get_docstring(f) if ast.get_docstring(f) else ''
      func = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source

      pairs.append((f.name, f.lineno, source, ' '.join(tokenize_code(func)),
                    ' '.join(tokenize_docstring(docstring.split('\n\n')[0]))))
  except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):
    pass
  return pairs
@@ -0,0 +1,4 @@
astor~=0.6.0
apache-beam[gcp]~=2.4.0
nltk~=3.3.0
spacy~=2.0.0
@@ -0,0 +1,36 @@
from __future__ import print_function
import argparse
import apache_beam as beam

from preprocess.pipeline import create_pipeline_opts, BigQueryGithubFiles


def parse_arguments(args):
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('-i', '--input', metavar='', help='Path to BigQuery SQL script')
  parser.add_argument('-o', '--output', metavar='',
                      help='Output string of the format <dataset>:<table>')
  parser.add_argument('-p', '--project', metavar='', default='Project', help='Project ID')
  parser.add_argument('-j', '--job-name', metavar='', default='Beam Job', help='Job name')
  parser.add_argument('--storage-bucket', metavar='', default='gs://bucket',
                      help='Path to Google Storage Bucket')

  parsed_args = parser.parse_args(args)
  return parsed_args


def main(args):
  args = parse_arguments(args)
  pipeline_opts = create_pipeline_opts(args)

  with open(args.input, 'r') as f:
    query_string = f.read()

  pipeline = beam.Pipeline(options=pipeline_opts)
  (pipeline | BigQueryGithubFiles(args.project, query_string, args.output))  # pylint: disable=expression-not-assigned
  pipeline.run()


if __name__ == '__main__':
  import sys
  main(sys.argv[1:])
@@ -0,0 +1,60 @@
from __future__ import print_function
import subprocess
from distutils.command.build import build as distutils_build  # pylint: disable=no-name-in-module
from setuptools import setup, find_packages, Command as SetupToolsCommand


with open('requirements.txt', 'r') as f:
  install_requires = f.readlines()

with open('README.md') as f:
  README = f.read()

VERSION = '0.1.0'
CUSTOM_COMMANDS = [
  ['python', '-m', 'spacy', 'download', 'en']
]


class Build(distutils_build):
  sub_commands = distutils_build.sub_commands + [('CustomCommands', None)]


class CustomCommands(SetupToolsCommand):
  def initialize_options(self):
    pass

  def finalize_options(self):
    pass

  @staticmethod
  def run_custom_command(command_list):
    print('Running command: %s' % command_list)
    p = subprocess.Popen(command_list, stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout_data, _ = p.communicate()
    print('Command output: %s' % stdout_data)
    if p.returncode != 0:
      raise RuntimeError('Command %s failed: exit code: %s' % (command_list, p.returncode))

  def run(self):
    for command in CUSTOM_COMMANDS:
      self.run_custom_command(command)


setup(name='kubeflow-code-search',
      description='Kubeflow Code Search Demo',
      long_description=README,
      long_description_content_type='text/markdown',
      url='https://www.github.com/kubeflow/examples',
      author='Sanyam Kapoor',
      author_email='sanyamkapoor@google.com',
      version=VERSION,
      license='MIT',
      packages=find_packages(),
      install_requires=install_requires,
      extras_require={},
      cmdclass={
        'build': Build,
        'CustomCommands': CustomCommands,
      })