mirror of https://github.com/kubeflow/examples.git
				
				
				
			Create a script to count lines of code. (#379)
* Create a script to count lines of code. * This is used in the presentation to get an estimate of where the human effort is involved. * Fix lint issues.
This commit is contained in:
		
							parent
							
								
									345e69ab4c
								
							
						
					
					
						commit
						ba9af34805
					
				| 
						 | 
				
			
			@ -0,0 +1,145 @@
 | 
			
		|||
"""Count lines of code in different types of file.
 | 
			
		||||
 | 
			
		||||
This has nothing to do with actually running code search.
 | 
			
		||||
 | 
			
		||||
The sole purpose of this script is to collect data for the presentation to
 | 
			
		||||
illustrate the point that most effort isn't spent on ML.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import argparse
 | 
			
		||||
import csv
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import sys
 | 
			
		||||
import tempfile
 | 
			
		||||
 | 
			
		||||
# Mapping from categories to regexes to include.
# These are applied with re.match to the full (absolute) path of each file, so
# every pattern must start with ".*" in order to match below the root directory.
MATCH_RES = {
  "dataflow": [re.compile(r".*dataflow.*\.py")],
  "packaging (e.g dockerfile)": [
    re.compile(r".*Dockerfile.*"),
    # The leading ".*" is required: without it the pattern is anchored at the
    # start of the absolute path and can never match.
    re.compile(r".*code_search/src/.*requirements.*\.txt")],
  "cloud config": [re.compile(r".*gcp_config.*")],
  "k8s & kubeflow config": [
    re.compile(r".*/cs-demo-1103/ks_app/components/.*"),
    re.compile(r".*/cs-demo-1103/k8s_specs/.*")],
  "model": [
    re.compile(r".*t2t/.*\.py")
  ],
  "serving k8s config": [
    re.compile(r".*/ks-web-app/components/.*"),
  ],
  "batch k8s config": [
    re.compile(r".*/kubeflow/components/.*"),
  ],
  "serving code": [
    re.compile(r".*/code_search/nmslib/.*\.py"),
    re.compile(r".*/ui.*\.js$"),
  ],
}
 | 
			
		||||
 | 
			
		||||
# Compiled patterns for files that must never be counted (compiled bytecode
# and package marker files).
NAME_EXCLUDES = list(map(re.compile, (
  r".*\.pyc",
  r"__init__\.py",
)))
 | 
			
		||||
 | 
			
		||||
class Results(object):
  """Accumulates the files assigned to one category and their line count."""

  def __init__(self):
    # Paths of the files added so far, in the order they were added.
    self.files = []
    # Running total of lines over all files in self.files.
    self.loc = 0

  def add_file(self, full_path):
    """Record a file and add its number of lines to the running total.

    Args:
      full_path: Path of the file to open and count.

    Raises:
      IOError/OSError: If the file cannot be opened. In that case neither
        the file list nor the line count is modified.
    """
    with open(full_path) as hf:
      # Stream the file instead of materializing every line in memory.
      num_lines = sum(1 for _ in hf)

    # Only record the file once it was read successfully so that files and
    # loc always describe the same set of files.
    self.files.append(full_path)
    self.loc += num_lines

  @property
  def num_files(self):
    """Number of files added so far."""
    return len(self.files)
 | 
			
		||||
 | 
			
		||||
def classify_files(root_dir):
  """Walk a directory tree and tally the files belonging to each category.

  Every file below root_dir whose base name matches none of the patterns in
  NAME_EXCLUDES is tested against the patterns in MATCH_RES. The file is added
  to each category that has at least one pattern matching its full path, so a
  single file may be counted in several categories.

  Args:
    root_dir: Root directory to begin searching in.

  Returns:
    categories: Dictionary mapping each category name in MATCH_RES to a
      Results object holding the matched files and their total line count.
  """
  # Plain iteration and items() work on both Python 2 and Python 3, whereas
  # iterkeys()/iteritems() only exist on Python 2.
  categories = {k: Results() for k in MATCH_RES}

  for root, _, files in os.walk(root_dir):
    for name in files:
      # Exclusions are checked against the base name only.
      if any(m.match(name) for m in NAME_EXCLUDES):
        continue

      full_path = os.path.join(root, name)
      for k, patterns in MATCH_RES.items():
        # any() stops at the first hit, so a file is counted at most once
        # per category even if several of its patterns match.
        if any(p.match(full_path) for p in patterns):
          categories[k].add_file(full_path)

  return categories
 | 
			
		||||
 | 
			
		||||
def main():
  """Classify the files of the parent directory and report line counts.

  The directory one level above this script is scanned with classify_files.
  Each matched file is printed together with its category, then a CSV with
  one row per category (category, number of files, lines of code) is written
  to the path given by --output and echoed to stdout. When --output is not
  given, a temporary .csv file is used and its path is logged.
  """
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser(
    description="Create a CSV file containing lines of code by category.")

  parser.add_argument(
    "--output",
    default="",
    type=str,
    help="The file to write.")

  args = parser.parse_args()

  if not args.output:
    # delete=False keeps the reserved file on disk, so the name cannot be
    # claimed by something else before the results are written to it below.
    with tempfile.NamedTemporaryFile(prefix="tmpCS_demo_code_stats", dir=None,
                                     suffix=".csv",
                                     delete=False) as hf:
      args.output = hf.name
    logging.info("--output not specified; defaulting to %s", args.output)

  root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
  # Log the resolved directory itself, not just the variable name.
  logging.info("root_dir: %s", root_dir)

  categories = classify_files(root_dir)

  # items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
  for k, v in categories.items():
    for path in v.files:
      print(k, path)

  logging.info("Writing output to %s", args.output)
  with open(args.output, "w") as hf:
    writer = csv.writer(hf)
    # Mirror every row to stdout so the summary is visible without opening
    # the output file.
    std_writer = csv.writer(sys.stdout)

    row = ["category", "number of files", "lines of code"]
    writer.writerow(row)
    std_writer.writerow(row)

    for k, v in categories.items():
      row = [k, v.num_files, v.loc]
      writer.writerow(row)
      std_writer.writerow(row)


if __name__ == "__main__":
  main()
 | 
			
		||||
		Loading…
	
		Reference in New Issue