mirror of https://github.com/kubeflow/examples.git
Create a script to count lines of code. (#379)
* Create a script to count lines of code. * This is used in the presentation to get an estimate of where the human effort is involved. * Fix lint issues.
This commit is contained in:
parent
345e69ab4c
commit
ba9af34805
|
@ -0,0 +1,145 @@
|
||||||
|
"""Count lines of code in different types of file.
|
||||||
|
|
||||||
|
This has nothing to do with actually running code search.
|
||||||
|
|
||||||
|
The sole purpose of this script is to collect data for the presentation to
|
||||||
|
illustrate the point that most effort isn't spent on ML.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
# Mapping from a human-readable category name to the regexes that select
# files belonging to it. Patterns are applied (via re.match) to the full
# path of each file found under the root directory.
MATCH_RES = {
    "dataflow": [re.compile(r".*dataflow.*\.py")],
    "packaging (e.g dockerfile)": [
        re.compile(".*Dockerfile.*"),
        re.compile(r"code_search/src/.*requirements.*\.txt"),
    ],
    "cloud config": [re.compile(".*gcp_config.*")],
    "k8s & kubeflow config": [
        re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
        re.compile(r".*/cs-demo-1103/k8s_specs/.*"),
    ],
    "model": [
        re.compile(r".*t2t/.*\.py"),
    ],
    "serving k8s config": [
        re.compile(r".*/ks-web-app/components/.*"),
    ],
    "batch k8s config": [
        re.compile(r".*/kubeflow/components/.*"),
    ],
    "serving code": [
        re.compile(r".*/code_search/nmslib/.*\.py"),
        re.compile(r".*/ui.*\.js$"),
    ],
}

# Regexes matching file names (not full paths) that should never be
# counted in any category, e.g. compiled byte code and empty package files.
NAME_EXCLUDES = [
    re.compile(r".*\.pyc"),
    re.compile(r"__init__\.py"),
]
|
||||||
|
|
||||||
|
class Results(object):
  """Accumulates matched files and their total line count for one category."""

  def __init__(self):
    # Full paths of every file recorded so far.
    self.files = []
    # Running total of lines across all recorded files.
    self.loc = 0

  def add_file(self, full_path):
    """Record full_path and add the number of lines it contains to loc."""
    self.files.append(full_path)
    with open(full_path) as hf:
      self.loc += sum(1 for _ in hf)

  @property
  def num_files(self):
    """Number of files recorded so far."""
    return len(self.files)
|
||||||
|
|
||||||
|
def classify_files(root_dir):
  """Return lists of files in different categories.

  Args:
    root_dir: Root directory to begin searching in.

  Returns:
    categories: Dictionary mapping a category name to a Results object
      holding the matching files and their total line count.
  """
  # Fix: the original used dict.iterkeys()/iteritems(), which were removed
  # in Python 3 and made the script crash there; plain keys()/items() work
  # on both Python 2 and Python 3.
  categories = {}
  for k in MATCH_RES.keys():
    categories[k] = Results()

  for root, _, files in os.walk(root_dir):
    for name in files:
      full_path = os.path.join(root, name)
      # Skip files (e.g. *.pyc, __init__.py) that should never be counted.
      if any(m.match(name) for m in NAME_EXCLUDES):
        continue
      # NOTE: a file matching patterns from several categories is counted
      # in each of them (the original inner break only skipped the
      # remaining patterns of the current category, not other categories).
      for k, patterns in MATCH_RES.items():
        if any(p.match(full_path) for p in patterns):
          categories[k].add_file(full_path)

  return categories
|
||||||
|
|
||||||
|
def main():
  """Walk the repo, classify its files, and write per-category LOC stats.

  Writes a CSV (category, number of files, lines of code) to --output and
  echoes the same rows to stdout.
  """
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  # Fix: the description previously said "# of PRs by company", a
  # copy/paste error from another script; this one counts lines of code.
  parser = argparse.ArgumentParser(
      description="Create a CSV file with lines of code per category.")

  parser.add_argument(
      "--output",
      default="",
      type=str,
      help="The file to write.")

  args = parser.parse_args()

  if not args.output:
    # NamedTemporaryFile is used only to pick a unique name; the file is
    # deleted on close and recreated below when the CSV is written.
    with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None,
                                     suffix=".csv",
                                     delete=True) as hf:
      args.output = hf.name
    logging.info("--output not specified; defaulting to %s", args.output)

  root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
  # Fix: previously logged the literal string "root_dir" instead of the
  # actual resolved path.
  logging.info("root_dir=%s", root_dir)

  categories = classify_files(root_dir)

  # Fix: dict.iteritems() is Python 2 only; items() works on 2 and 3.
  for k, v in categories.items():
    for path in v.files:
      print(k, path)

  logging.info("Writing output to %s", args.output)
  with open(args.output, "w") as hf:
    writer = csv.writer(hf)
    std_writer = csv.writer(sys.stdout)

    row = ["category", "number of files", "lines of code"]
    writer.writerow(row)
    std_writer.writerow(row)

    for k, v in categories.items():
      row = [k, v.num_files, v.loc]
      writer.writerow(row)
      std_writer.writerow(row)


if __name__ == "__main__":
  main()
|
Loading…
Reference in New Issue