Create a script to count lines of code. (#379)

* Create a script to count lines of code.

* This is used in the presentation to get an estimate of where the human effort goes.

* Fix lint issues.
Jeremy Lewi 2018-12-19 09:42:25 -08:00 committed by Kubernetes Prow Robot
parent 345e69ab4c
commit ba9af34805
1 changed file with 145 additions and 0 deletions


@@ -0,0 +1,145 @@
"""Count lines of code in different types of file.
This has nothing to do with actually running code search.
The sole purpose of this script is to collect data for the presentation to
illustrate the point that most effort isn't spent on ML.
"""
import argparse
import csv
import logging
import os
import re
import sys
import tempfile

# Mapping from categories to regexes to include.
# These are applied to the full (absolute) path, so each pattern must allow
# an arbitrary prefix.
MATCH_RES = {
  "dataflow": [re.compile(r".*dataflow.*\.py")],
  "packaging (e.g dockerfile)": [
    re.compile(".*Dockerfile.*"),
    re.compile(r".*code_search/src/.*requirements.*\.txt")],
  "cloud config": [re.compile(".*gcp_config.*")],
  "k8s & kubeflow config": [
    re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
    re.compile(r".*/cs-demo-1103/k8s_specs/.*")],
  "model": [
    re.compile(r".*t2t/.*\.py")
  ],
  "serving k8s config": [
    re.compile(r".*/ks-web-app/components/.*"),
  ],
  "batch k8s config": [
    re.compile(r".*/kubeflow/components/.*"),
  ],
  "serving code": [
    re.compile(r".*/code_search/nmslib/.*\.py"),
    re.compile(r".*/ui.*\.js$"),
  ],
}
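
# Illustrative examples of how paths map to categories (hypothetical paths,
# not taken from the repo):
#   /repo/code_search/src/code_search/dataflow/preprocess.py -> "dataflow"
#   /repo/code_search/docker/t2t/Dockerfile -> "packaging (e.g dockerfile)"
#   /repo/code_search/ks-web-app/components/server.jsonnet -> "serving k8s config"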

# Regexes matching files to exclude.
NAME_EXCLUDES = [
  re.compile(r".*\.pyc"),
  re.compile(r"__init__\.py"),
]
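
# Note: re.match anchors at the start of the string, so these patterns
# exclude names like "utils.pyc" or "__init__.py", but would not exclude a
# name that merely contains "__init__.py" somewhere in the middle.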


class Results(object):
  """Tracks the matching files and total lines of code for one category."""

  def __init__(self):
    self.files = []
    self.loc = 0

  def add_file(self, full_path):
    """Record a file and add its line count to the running total."""
    self.files.append(full_path)
    with open(full_path) as hf:
      lines = hf.readlines()
      self.loc += len(lines)

  @property
  def num_files(self):
    return len(self.files)
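
# Illustrative use (hypothetical path):
#   results = Results()
#   results.add_file("/tmp/example.py")
#   results.num_files  # -> 1
#   results.loc        # -> number of lines in /tmp/example.py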


def classify_files(root_dir):
  """Classify files under root_dir into the categories in MATCH_RES.

  Args:
    root_dir: Root directory to begin searching in.

  Returns:
    categories: Dictionary mapping each category to a Results object.
  """
  categories = {}
  for k in MATCH_RES:
    categories[k] = Results()

  for root, _, files in os.walk(root_dir):
    for name in files:
      full_path = os.path.join(root, name)

      exclude = False
      for m in NAME_EXCLUDES:
        if m.match(name):
          exclude = True
          break

      if exclude:
        continue

      for k, patterns in MATCH_RES.items():
        for p in patterns:
          if p.match(full_path):
            categories[k].add_file(full_path)
            break

  return categories
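
# Illustrative call (hypothetical directory):
#   stats = classify_files("/path/to/examples/code_search")
#   stats["model"].num_files, stats["model"].loc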


def main():
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser(
    description="Count lines of code by category and write the counts to a "
                "CSV file.")

  parser.add_argument(
    "--output",
    default="",
    type=str,
    help="The file to write.")

  args = parser.parse_args()
  if not args.output:
    # The temporary file is only used to generate a unique name; it is
    # deleted on close and recreated when the results are written below.
    with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None,
                                     suffix=".csv", delete=True) as hf:
      args.output = hf.name
    logging.info("--output not specified; defaulting to %s", args.output)
  root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
  logging.info("root_dir=%s", root_dir)

  categories = classify_files(root_dir)
  for k, v in categories.items():
    for path in v.files:
      print(k, path)
logging.info("Writing output to %s", args.output)
with open(args.output, "w") as hf:
writer = csv.writer(hf)
std_writer = csv.writer(sys.stdout)
row = ["category", "number of files", "lines of code"]
writer.writerow(row)
std_writer.writerow(row)
for k, v in categories.iteritems():
row = [k, v.num_files, v.loc]
writer.writerow(row)
std_writer.writerow(row)


if __name__ == "__main__":
  main()
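
# Hypothetical invocation (the script name is assumed; the diff header above
# does not show the file's path):
#   python count_loc.py --output=/tmp/code_stats.csv
# This prints each (category, path) pair, then writes rows of
#   category,number of files,lines of code
# to both stdout and the CSV file.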