mirror of https://github.com/kubeflow/examples.git
Create a script to count lines of code. (#379)
* Create a script to count lines of code. * This is used in the presentation to get an estimate of where the human effort is involved. * Fix lint issues.
This commit is contained in:
parent
345e69ab4c
commit
ba9af34805
|
@ -0,0 +1,145 @@
|
||||||
|
"""Count lines of code in different types of file.
|
||||||
|
|
||||||
|
This has nothing to do with actually running code search.
|
||||||
|
|
||||||
|
The sole purpose of this script is to collect data for the presentation to
|
||||||
|
illustrate the point that most effort isn't spent on ML.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
# Mapping from a human-readable category name to the regexes that select
# files belonging to it. Patterns are applied (via re.match) to the full
# path of each file found under the root directory.
MATCH_RES = {
    "dataflow": [re.compile(r".*dataflow.*\.py")],
    "packaging (e.g dockerfile)": [
        re.compile(".*Dockerfile.*"),
        re.compile(r"code_search/src/.*requirements.*\.txt"),
    ],
    "cloud config": [re.compile(".*gcp_config.*")],
    "k8s & kubeflow config": [
        re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
        re.compile(r".*/cs-demo-1103/k8s_specs/.*"),
    ],
    "model": [
        re.compile(r".*t2t/.*\.py"),
    ],
    "serving k8s config": [
        re.compile(r".*/ks-web-app/components/.*"),
    ],
    "batch k8s config": [
        re.compile(r".*/kubeflow/components/.*"),
    ],
    "serving code": [
        re.compile(r".*/code_search/nmslib/.*\.py"),
        re.compile(r".*/ui.*\.js$"),
    ],
}

# Regexes matching file names (not full paths) that should never be
# counted in any category, e.g. compiled byte code and empty package files.
NAME_EXCLUDES = [
    re.compile(r".*\.pyc"),
    re.compile(r"__init__\.py"),
]
|
||||||
|
|
||||||
|
class Results(object):
  """Accumulates matched files and their total line count for one category."""

  def __init__(self):
    # Full paths of every file recorded so far.
    self.files = []
    # Running total of lines across all recorded files.
    self.loc = 0

  def add_file(self, full_path):
    """Record full_path and add the number of lines it contains to loc."""
    self.files.append(full_path)
    with open(full_path) as hf:
      self.loc += sum(1 for _ in hf)

  @property
  def num_files(self):
    """Number of files recorded so far."""
    return len(self.files)
|
||||||
|
|
||||||
|
def classify_files(root_dir):
  """Return lists of files in different categories.

  Args:
    root_dir: Root directory to begin searching in.

  Returns:
    categories: Dictionary mapping a category name to a Results object
      holding the matching files and their total line count.
  """
  # Fix: the original used dict.iterkeys()/iteritems(), which were removed
  # in Python 3 and made the script crash there; plain keys()/items() work
  # on both Python 2 and Python 3.
  categories = {}
  for k in MATCH_RES.keys():
    categories[k] = Results()

  for root, _, files in os.walk(root_dir):
    for name in files:
      full_path = os.path.join(root, name)
      # Skip files (e.g. *.pyc, __init__.py) that should never be counted.
      if any(m.match(name) for m in NAME_EXCLUDES):
        continue
      # NOTE: a file matching patterns from several categories is counted
      # in each of them (the original inner break only skipped the
      # remaining patterns of the current category, not other categories).
      for k, patterns in MATCH_RES.items():
        if any(p.match(full_path) for p in patterns):
          categories[k].add_file(full_path)

  return categories
|
||||||
|
|
||||||
|
def main():
  """Walk the repo, classify its files, and write per-category LOC stats.

  Writes a CSV (category, number of files, lines of code) to --output and
  echoes the same rows to stdout.
  """
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  # Fix: the description previously said "# of PRs by company", a
  # copy/paste error from another script; this one counts lines of code.
  parser = argparse.ArgumentParser(
      description="Create a CSV file with lines of code per category.")

  parser.add_argument(
      "--output",
      default="",
      type=str,
      help="The file to write.")

  args = parser.parse_args()

  if not args.output:
    # NamedTemporaryFile is used only to pick a unique name; the file is
    # deleted on close and recreated below when the CSV is written.
    with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None,
                                     suffix=".csv",
                                     delete=True) as hf:
      args.output = hf.name
    logging.info("--output not specified; defaulting to %s", args.output)

  root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
  # Fix: previously logged the literal string "root_dir" instead of the
  # actual resolved path.
  logging.info("root_dir=%s", root_dir)

  categories = classify_files(root_dir)

  # Fix: dict.iteritems() is Python 2 only; items() works on 2 and 3.
  for k, v in categories.items():
    for path in v.files:
      print(k, path)

  logging.info("Writing output to %s", args.output)
  with open(args.output, "w") as hf:
    writer = csv.writer(hf)
    std_writer = csv.writer(sys.stdout)

    row = ["category", "number of files", "lines of code"]
    writer.writerow(row)
    std_writer.writerow(row)

    for k, v in categories.items():
      row = [k, v.num_files, v.loc]
      writer.writerow(row)
      std_writer.writerow(row)


if __name__ == "__main__":
  main()
|
Loading…
Reference in New Issue