mirror of https://github.com/kubeflow/examples.git
Create a script to count lines of code. (#379)
* Create a script to count lines of code.
* This is used in the presentation to get an estimate of where the human effort is involved.
* Fix lint issues.
parent 345e69ab4c
commit ba9af34805
"""Count lines of code in different types of file.
|
||||
|
||||
This has nothing to do with actually running code search.
|
||||
|
||||
The sole purpose of this script is to collect data for the presentation to
|
||||
illustrate the point that most effort isn't spent on ML.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
# Mapping from categories to regexes to include
|
||||
# These are applied to the full path.
|
||||
MATCH_RES = {
|
||||
"dataflow": [re.compile(r".*dataflow.*\.py")],
|
||||
"packaging (e.g dockerfile)": [
|
||||
re.compile(".*Dockerfile.*"),
|
||||
re.compile(r"code_search/src/.*requirements.*\.txt")],
|
||||
"cloud config": [re.compile(".*gcp_config.*")],
|
||||
"k8s & kubeflow config": [
|
||||
re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
|
||||
re.compile(r".*/cs-demo-1103/k8s_specs/.*")],
|
||||
"model": [
|
||||
re.compile(r".*t2t/.*\.py")
|
||||
],
|
||||
"serving k8s config": [
|
||||
re.compile(r".*/ks-web-app/components/.*"),
|
||||
],
|
||||
"batch k8s config": [
|
||||
re.compile(r".*/kubeflow/components/.*"),
|
||||
],
|
||||
"serving code": [
|
||||
re.compile(r".*/code_search/nmslib/.*\.py"),
|
||||
re.compile(r".*/ui.*\.js$"),
|
||||
],
|
||||
}
|
||||
|
||||
# Regexes matching files to exclude
|
||||
NAME_EXCLUDES = [
|
||||
re.compile(r".*\.pyc"),
|
||||
re.compile(r"__init__\.py"),
|
||||
]
|
||||
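# Illustration only (not part of the original script): the include regexes
# above are matched against the full path, while the exclude regexes are
# matched against the bare file name. A hypothetical path such as
#   /repo/code_search/src/dataflow/cli.py
# would land in the "dataflow" category, whereas any __init__.py or *.pyc
# file is skipped no matter where it lives.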

class Results(object):
  def __init__(self):
    self.files = []
    self.loc = 0

  def add_file(self, full_path):
    """Record a file and add its line count to the running total."""
    self.files.append(full_path)
    with open(full_path) as hf:
      lines = hf.readlines()
    self.loc += len(lines)

  @property
  def num_files(self):
    return len(self.files)


def classify_files(root_dir):
  """Return lists of files in different categories.

  Args:
    root_dir: Root directory to begin searching in.

  Returns:
    categories: Dictionary mapping each category name to a Results object.
  """
  categories = {}
  for k in MATCH_RES.keys():
    categories[k] = Results()

  for root, _, files in os.walk(root_dir):
    for name in files:
      full_path = os.path.join(root, name)

      # Skip files whose name matches one of the exclude patterns.
      exclude = False
      for m in NAME_EXCLUDES:
        if m.match(name):
          exclude = True
          break
      if exclude:
        continue

      # A file is counted in every category whose patterns match its path.
      for k, patterns in MATCH_RES.items():
        for p in patterns:
          if p.match(full_path):
            categories[k].add_file(full_path)
            break

  return categories
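# Illustration only (not part of the original script): each entry in the
# returned dictionary is a Results object, so the totals can be read off
# directly, e.g. (with a hypothetical checkout path)
#
#   categories = classify_files("/path/to/kubeflow/examples/code_search")
#   categories["model"].num_files  # files matching the t2t patterns
#   categories["model"].loc        # total lines across those files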

def main():
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser(
    description="Count lines of code by category and write a CSV file.")

  parser.add_argument(
    "--output",
    default="",
    type=str,
    help="The file to write.")

  args = parser.parse_args()

  if not args.output:
    # The temporary file is only used to generate a unique name; it is
    # deleted on close and recreated below when the CSV is written.
    with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None,
                                     suffix=".csv",
                                     delete=True) as hf:
      args.output = hf.name
    logging.info("--output not specified; defaulting to %s", args.output)

  root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
  logging.info("root_dir: %s", root_dir)

  categories = classify_files(root_dir)

  for k, v in categories.items():
    for path in v.files:
      print(k, path)

  logging.info("Writing output to %s", args.output)
  with open(args.output, "w") as hf:
    writer = csv.writer(hf)
    std_writer = csv.writer(sys.stdout)

    row = ["category", "number of files", "lines of code"]
    writer.writerow(row)
    std_writer.writerow(row)

    for k, v in categories.items():
      row = [k, v.num_files, v.loc]
      writer.writerow(row)
      std_writer.writerow(row)


if __name__ == "__main__":
  main()
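As a rough usage sketch (the script's file name and the output path below are assumptions, not given in this commit): running

  python count_loc.py --output=/tmp/code_stats.csv

from the directory that holds the script walks its parent directory, prints every (category, path) pair it classifies, and writes the per-category totals, with the columns category, number of files, and lines of code, both to the --output file and to stdout. If --output is omitted, the script picks a temporary .csv path and logs it.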