diff --git a/code_search/demo/count_lines_of_code.py b/code_search/demo/count_lines_of_code.py
new file mode 100644
index 00000000..8b83060c
--- /dev/null
+++ b/code_search/demo/count_lines_of_code.py
@@ -0,0 +1,145 @@
+"""Count lines of code in different types of files.
+
+This has nothing to do with actually running code search.
+
+The sole purpose of this script is to collect data for the presentation to
+illustrate the point that most effort isn't spent on ML.
+"""
+
+import argparse
+import csv
+import logging
+import os
+import re
+import sys
+import tempfile
+
+# Mapping from categories to regexes of files to include.
+# These are applied to the full path.
+MATCH_RES = {
+    "dataflow": [re.compile(r".*dataflow.*\.py")],
+    "packaging (e.g. dockerfile)": [
+        re.compile(".*Dockerfile.*"),
+        re.compile(r"code_search/src/.*requirements.*\.txt")],
+    "cloud config": [re.compile(".*gcp_config.*")],
+    "k8s & kubeflow config": [
+        re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
+        re.compile(r".*/cs-demo-1103/k8s_specs/.*")],
+    "model": [
+        re.compile(r".*t2t/.*\.py")
+    ],
+    "serving k8s config": [
+        re.compile(r".*/ks-web-app/components/.*"),
+    ],
+    "batch k8s config": [
+        re.compile(r".*/kubeflow/components/.*"),
+    ],
+    "serving code": [
+        re.compile(r".*/code_search/nmslib/.*\.py"),
+        re.compile(r".*/ui.*\.js$"),
+    ],
+}
+
+# Regexes matching file names to exclude (applied to the base name only).
+NAME_EXCLUDES = [
+    re.compile(r".*\.pyc"),
+    re.compile(r"__init__\.py"),
+]
+
+
+class Results(object):
+  """Accumulates the files and total line count for one category."""
+
+  def __init__(self):
+    self.files = []
+    self.loc = 0
+
+  def add_file(self, full_path):
+    self.files.append(full_path)
+    with open(full_path) as hf:
+      lines = hf.readlines()
+      self.loc += len(lines)
+
+  @property
+  def num_files(self):
+    return len(self.files)
+
+
+def classify_files(root_dir):
+  """Return lists of files in different categories.
+
+  Args:
+    root_dir: Root directory to begin searching in.
+
+  Returns:
+    categories: Dictionary mapping each category to a Results object.
+ """ + categories = {} + for k in MATCH_RES.iterkeys(): + categories[k] = Results() + + for root, _, files in os.walk(root_dir): + for name in files: + full_path = os.path.join(root, name) + exclude = False + for m in NAME_EXCLUDES: + if m.match(name): + exclude = True + break + if exclude: + continue + for k, patterns in MATCH_RES.iteritems(): + for p in patterns: + if p.match(full_path): + categories[k].add_file(full_path) + break + + return categories + +def main(): + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + + parser = argparse.ArgumentParser( + description="Create a CSV file containing # of PRs by company.") + + parser.add_argument( + "--output", + default="", + type=str, + help="The file to write.") + + args = parser.parse_args() + + if not args.output: + with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None, + suffix=".csv", + delete=True) as hf: + args.output = hf.name + logging.info("--output not specified; defaulting to %s", args.output) + + root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + logging.info("root_dir") + + categories = classify_files(root_dir) + + for k, v in categories.iteritems(): + for path in v.files: + print(k, path) + + logging.info("Writing output to %s", args.output) + with open(args.output, "w") as hf: + writer = csv.writer(hf) + std_writer = csv.writer(sys.stdout) + + row = ["category", "number of files", "lines of code"] + writer.writerow(row) + std_writer.writerow(row) + + for k, v in categories.iteritems(): + row = [k, v.num_files, v.loc] + writer.writerow(row) + std_writer.writerow(row) + +if __name__ == "__main__": + main()