Create a script to count lines of code. (#379)

* Create a script to count lines of code.

* This is used in the presentation to get an estimate of where the human effort goes.

* Fix lint issues.
Jeremy Lewi 2018-12-19 09:42:25 -08:00 committed by Kubernetes Prow Robot
parent 345e69ab4c
commit ba9af34805
1 changed file with 145 additions and 0 deletions


@@ -0,0 +1,145 @@
"""Count lines of code in different types of file.
This has nothing to do with actually running code search.
The sole purpose of this script is to collect data for the presentation to
illustrate the point that most effort isn't spent on ML.
"""
import argparse
import csv
import logging
import os
import re
import sys
import tempfile

# Mapping from categories to regexes to include.
# These are applied to the full (absolute) path, so each pattern must allow
# an arbitrary prefix.
MATCH_RES = {
  "dataflow": [re.compile(r".*dataflow.*\.py")],
  "packaging (e.g dockerfile)": [
    re.compile(".*Dockerfile.*"),
    re.compile(r".*code_search/src/.*requirements.*\.txt")],
  "cloud config": [re.compile(".*gcp_config.*")],
  "k8s & kubeflow config": [
    re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
    re.compile(r".*/cs-demo-1103/k8s_specs/.*")],
  "model": [
    re.compile(r".*t2t/.*\.py")
  ],
  "serving k8s config": [
    re.compile(r".*/ks-web-app/components/.*"),
  ],
  "batch k8s config": [
    re.compile(r".*/kubeflow/components/.*"),
  ],
  "serving code": [
    re.compile(r".*/code_search/nmslib/.*\.py"),
    re.compile(r".*/ui.*\.js$"),
  ],
}
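
# Illustrative examples of how paths map to categories (hypothetical paths,
# not taken from the repo):
#   /repo/code_search/src/code_search/dataflow/preprocess.py -> "dataflow"
#   /repo/code_search/docker/t2t/Dockerfile -> "packaging (e.g dockerfile)"
#   /repo/code_search/ks-web-app/components/server.jsonnet -> "serving k8s config"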

# Regexes matching files to exclude.
NAME_EXCLUDES = [
  re.compile(r".*\.pyc"),
  re.compile(r"__init__\.py"),
]
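
# Note: re.match anchors at the start of the string, so these patterns
# exclude names like "utils.pyc" or "__init__.py", but would not exclude a
# name that merely contains "__init__.py" somewhere in the middle.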


class Results(object):
  """Tracks the matching files and total lines of code for one category."""

  def __init__(self):
    self.files = []
    self.loc = 0

  def add_file(self, full_path):
    """Record a file and add its line count to the running total."""
    self.files.append(full_path)
    with open(full_path) as hf:
      lines = hf.readlines()
      self.loc += len(lines)

  @property
  def num_files(self):
    return len(self.files)
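
# Illustrative use (hypothetical path):
#   results = Results()
#   results.add_file("/tmp/example.py")
#   results.num_files  # -> 1
#   results.loc        # -> number of lines in /tmp/example.py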


def classify_files(root_dir):
  """Classify files under root_dir into the categories in MATCH_RES.

  Args:
    root_dir: Root directory to begin searching in.

  Returns:
    categories: Dictionary mapping each category to a Results object.
  """
  categories = {}
  for k in MATCH_RES:
    categories[k] = Results()

  for root, _, files in os.walk(root_dir):
    for name in files:
      full_path = os.path.join(root, name)

      exclude = False
      for m in NAME_EXCLUDES:
        if m.match(name):
          exclude = True
          break

      if exclude:
        continue

      for k, patterns in MATCH_RES.items():
        for p in patterns:
          if p.match(full_path):
            categories[k].add_file(full_path)
            break

  return categories
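
# Illustrative call (hypothetical directory):
#   stats = classify_files("/path/to/examples/code_search")
#   stats["model"].num_files, stats["model"].loc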


def main():
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser(
    description="Count lines of code by category and write the counts to a "
                "CSV file.")

  parser.add_argument(
    "--output",
    default="",
    type=str,
    help="The file to write.")

  args = parser.parse_args()
  if not args.output:
    # The temporary file is only used to generate a unique name; it is
    # deleted on close and recreated when the results are written below.
    with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None,
                                     suffix=".csv", delete=True) as hf:
      args.output = hf.name
    logging.info("--output not specified; defaulting to %s", args.output)
  root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
  logging.info("root_dir=%s", root_dir)

  categories = classify_files(root_dir)
  for k, v in categories.items():
    for path in v.files:
      print(k, path)
logging.info("Writing output to %s", args.output)
with open(args.output, "w") as hf:
writer = csv.writer(hf)
std_writer = csv.writer(sys.stdout)
row = ["category", "number of files", "lines of code"]
writer.writerow(row)
std_writer.writerow(row)
for k, v in categories.iteritems():
row = [k, v.num_files, v.loc]
writer.writerow(row)
std_writer.writerow(row)


if __name__ == "__main__":
  main()
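
# Hypothetical invocation (the script name is assumed; the diff header above
# does not show the file's path):
#   python count_loc.py --output=/tmp/code_stats.csv
# This prints each (category, path) pair, then writes rows of
#   category,number of files,lines of code
# to both stdout and the CSV file.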