fixed some lint errors

Rebecca McFadden 2019-08-16 01:04:15 -04:00
parent 153d9d87aa
commit d81d83512a
6 changed files with 301 additions and 275 deletions

View File

@@ -1,33 +1,33 @@
import json
import time
from io import BytesIO
import datetime
import requests
import numpy as np
import datetime
from PIL import Image
from io import BytesIO
import tensorflow as tf
from azureml.core.model import Model
def init():
global model
try:
def init():
if Model.get_model_path('tacosandburritos'):
model_path = Model.get_model_path('tacosandburritos')
except:
else:
model_path = '/model/latest.h5'
print('Attempting to load model')
print 'Attempting to load model'
model = tf.keras.models.load_model(model_path)
model.summary()
print('Done!')
print 'Done!'
print('Initialized model "{}" at {}'.format(model_path, datetime.datetime.now()))
print 'Initialized model "{}" at {}'.format(model_path, datetime.datetime.now())
return model
def run(raw_data):
global model
def run(raw_data, model):
prev_time = time.time()
post = json.loads(raw_data)
img_path = post['image']
@@ -35,8 +35,8 @@ def run(raw_data):
tensor = process_image(img_path, 160)
t = tf.reshape(tensor, [-1, 160, 160, 3])
o = model.predict(t, steps=1)#[0][0]
print(o)
o = model.predict(t, steps=1) # [0][0]
print o
o = o[0][0]
inference_time = datetime.timedelta(seconds=current_time - prev_time)
payload = {
@@ -45,28 +45,31 @@ def run(raw_data):
'scores': str(o)
}
print('Input ({}), Prediction ({})'.format(post['image'], payload))
print 'Input ({}), Prediction ({})'.format(post['image'], payload)
return payload
def process_image(path, image_size):
# Extract image (from web or path)
if(path.startswith('http')):
if path.startswith('http'):
response = requests.get(path)
img = np.array(Image.open(BytesIO(response.content)))
else:
img = np.array(Image.open(path))
img_tensor = tf.convert_to_tensor(img, dtype=tf.float32)
#tf.image.decode_jpeg(img_raw, channels=3)
# tf.image.decode_jpeg(img_raw, channels=3)
img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
return img_final
def info(msg, char = "#", width = 75):
print("")
print(char * width)
print(char + " %0*s" % ((-1*width)+5, msg) + char)
print(char * width)
def info(msg, char="#", width=75):
print ""
print char * width
print char + " %0*s" % ((-1 * width) + 5, msg) + char
print char * width
if __name__ == "__main__":
images = {
@@ -74,17 +77,17 @@ if __name__ == "__main__":
'burrito': 'https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg'
}
init()
my_model = init()
for k, v in images.items():
print('{} => {}'.format(k, v))
print '{} => {}'.format(k, v)
info('Taco Test')
taco = json.dumps({ 'image': images['tacos'] })
print(taco)
run(taco)
taco = json.dumps({'image': images['tacos']})
print taco
run(taco, my_model)
info('Burrito Test')
burrito = json.dumps({ 'image': images['burrito'] })
print(burrito)
run(burrito)
burrito = json.dumps({'image': images['burrito']})
print burrito
run(burrito, my_model)

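As an aside, a minimal sketch (not from the commit) of one way to keep the scoring script's logging working under both Python 2 and Python 3: importing print_function makes print a real function in Python 2, so the parenthesized calls stay valid either way. The log() helper below is hypothetical.
# Hedged sketch, not part of the repo. With the future import, print() is a
# function under Python 2 as well, so the same call runs on either interpreter.
from __future__ import print_function
import datetime

def log(msg):
    # prefix each message with a timestamp, as the scoring script does manually
    print('{} {}'.format(datetime.datetime.now(), msg))

log('Attempting to load model')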
View File

@@ -1,27 +1,29 @@
"""Main pipeline file"""
from kubernetes import client as k8s_client
import kfp.dsl as dsl
import kfp.compiler as compiler
from kubernetes import client as k8s_client
@dsl.pipeline(
name='Tacos vs. Burritos',
description='Simple TF CNN for binary classifier between burritos and tacos'
description='Simple TF CNN'
)
def tacosandburritos_train(
tenant_id,
service_principal_id,
service_principal_password,
subscription_id,
resource_group,
workspace,
persistent_volume_path='/mnt/azure',
data_download='https://aiadvocate.blob.core.windows.net/public/tacodata.zip',
epochs=5,
batch=32,
learning_rate=0.0001,
model_name='tacosandburritos',
profile_name='tacoprofile'
tenant_id,
service_principal_id,
service_principal_password,
subscription_id,
resource_group,
workspace
):
"""Pipeline steps"""
persistent_volume_path = '/mnt/azure'
data_download = 'https://aiadvocate.blob.core.windows.net/public/tacodata.zip'
epochs = 5
batch = 32
learning_rate = 0.0001
model_name = 'tacosandburritos'
profile_name = 'tacoprofile'
operations = {}
image_size = 160
training_folder = 'train'
@@ -30,112 +32,109 @@ def tacosandburritos_train(
# preprocess data
operations['preprocess'] = dsl.ContainerOp(
name='preprocess',
image='insert your image here',
command=['python'],
arguments=[
'/scripts/data.py',
'--base_path', persistent_volume_path,
'--data', training_folder,
'--target', training_dataset,
'--img_size', image_size,
'--zipfile', data_download
]
)
# train
operations['training'] = dsl.ContainerOp(
name='training',
image='insert your image here',
command=['python'],
arguments=[
'/scripts/train.py',
'--base_path', persistent_volume_path,
'--data', training_folder,
'--epochs', epochs,
'--batch', batch,
'--image_size', image_size,
'--lr', learning_rate,
'--outputs', model_folder,
'--dataset', training_dataset
]
)
operations['training'].after(operations['preprocess'])
# register model
operations['register'] = dsl.ContainerOp(
name='register',
image='insert your image here',
command=['python'],
arguments=[
'/scripts/register.py',
'--base_path', persistent_volume_path,
'--model', 'latest.h5',
'--model_name', model_name,
'--tenant_id', tenant_id,
'--service_principal_id', service_principal_id,
'--service_principal_password', service_principal_password,
'--subscription_id', subscription_id,
'--resource_group', resource_group,
'--workspace', workspace
]
)
operations['register'].after(operations['training'])
operations['profile'] = dsl.ContainerOp(
name='profile',
image='insert your image here',
command=['sh'],
arguments=[
'/scripts/profile.sh',
'-n', profile_name,
'-m', model_name,
'-i', '/scripts/inferenceconfig.json',
'-d', '{"image":"https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg"}',
'-t', tenant_id,
'-r', resource_group,
'-w', workspace,
'-s', service_principal_id,
'-p', service_principal_password,
'-u', subscription_id,
'-b', persistent_volume_path
]
)
operations['profile'].after(operations['register'])
operations['deploy'] = dsl.ContainerOp(
name='deploy',
image='insert your image here',
command=['sh'],
arguments=[
'/scripts/deploy.sh',
'-n', model_name,
'-m', model_name,
'-i', '/scripts/inferenceconfig.json',
'-d', '/scripts/deploymentconfig.json',
'-t', tenant_id,
'-r', resource_group,
'-w', workspace,
'-s', service_principal_id,
'-p', service_principal_password,
'-u', subscription_id,
'-b', persistent_volume_path
]
)
operations['deploy'].after(operations['profile'])
for _, op in operations.items():
op.container.set_image_pull_policy("Always")
op.add_volume(
k8s_client.V1Volume(
name='azure',
persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
claim_name='azure-managed-disk')
)
for _, op_1 in operations.items():
op_1.container.set_image_pull_policy("Always")
op_1.add_volume(
k8s_client.V1Volume(
name='azure',
persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
claim_name='azure-managed-disk')
)
).add_volume_mount(k8s_client.V1VolumeMount(
mount_path='/mnt/azure',
name='azure')
)
mount_path='/mnt/azure', name='azure'))
if __name__ == '__main__':
compiler.Compiler().compile(tacosandburritos_train, __file__ + '.tar.gz')

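For context, a hedged sketch of submitting the compiled package to a Kubeflow Pipelines endpoint, with tacosandburritos_train in scope. The host URL and every argument value are placeholders, and create_run_from_pipeline_func assumes a reasonably recent kfp SDK.
# Hedged sketch: submit the pipeline defined above to a KFP endpoint.
# Host and argument values are placeholders, not taken from the repo.
import kfp

client = kfp.Client(host='http://localhost:8080')  # assumed port-forwarded endpoint
client.create_run_from_pipeline_func(
    tacosandburritos_train,
    arguments={
        'tenant_id': '<tenant-id>',
        'service_principal_id': '<service-principal-id>',
        'service_principal_password': '<service-principal-password>',
        'subscription_id': '<subscription-id>',
        'resource_group': '<resource-group>',
        'workspace': '<aml-workspace>',
    })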
View File

@@ -1,69 +1,69 @@
import os
import shutil
import wget
import zipfile
import argparse
import numpy as np
import wget
import tensorflow as tf
from pathlib2 import Path
def check_dir(path, check=False):
if check:
assert os.path.exists(path), '{} does not exist!'.format(path)
else:
if not os.path.exists(path):
os.makedirs(path)
return Path(path).resolve(strict=False)
def check_dir(path):
if not os.path.exists(path):
os.makedirs(path)
return Path(path).resolve(strict=False)
def download(source, target, force_clear=False):
if force_clear and os.path.exists(target):
print('Removing {}...'.format(target))
print 'Removing {}...'.format(target)
shutil.rmtree(target)
check_dir(target)
targt_file = str(Path(target).joinpath('data.zip'))
if os.path.exists(targt_file) and not force_clear:
print('data already exists, skipping download')
print 'data already exists, skipping download'
return
if source.startswith('http'):
print("Downloading from {} to {}".format(source, target))
wget.download(source, targt_file)
print("Done!")
print "Downloading from {} to {}".format(source, target)
wget.download(source, targt_file)
print "Done!"
else:
print("Copying from {} to {}".format(source, target))
print "Copying from {} to {}".format(source, target)
shutil.copyfile(source, targt_file)
print('Unzipping {}'.format(targt_file))
print 'Unzipping {}'.format(targt_file)
zipr = zipfile.ZipFile(targt_file)
zipr.extractall(target)
zipr.close()
def process_image(path, image_size=160):
img_raw = tf.io.read_file(path)
img_tensor = tf.image.decode_jpeg(img_raw, channels=3)
img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
return img_final
def walk_images(path, image_size=160):
images = []
print('Scanning {}'.format(path))
imgs = []
print 'Scanning {}'.format(path)
# find subdirectories in base path
# (they should be the labels)
labels = []
for (_, dirs, _) in os.walk(path):
print('Found {}'.format(dirs))
print 'Found {}'.format(dirs)
labels = dirs
break
for d in labels:
path = os.path.join(path, d)
print('Processing {}'.format(path))
print 'Processing {}'.format(path)
# only care about files in directory
for item in os.listdir(path):
if not item.lower().endswith('.jpg'):
print('skipping {}'.format(item))
print 'skipping {}'.format(item)
continue
image = os.path.join(path, item)
@@ -71,11 +71,12 @@ def walk_images(path, image_size=160):
img = process_image(image, image_size)
assert img.shape[2] == 3, "Invalid channel count"
# write out good images
images.append(image)
imgs.append(image)
except Exception as e:
print('{}\n{}\n'.format(e, image))
print '{}\n{}\n'.format(e, image)
return imgs
return images
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='data cleaning for binary image task')
@@ -84,31 +85,33 @@ if __name__ == "__main__":
parser.add_argument('-t', '--target', help='target file to hold good data', default='train.txt')
parser.add_argument('-i', '--img_size', help='target image size to verify', default=160, type=int)
parser.add_argument('-z', '--zipfile', help='source data zip file', default='../../tacodata.zip')
parser.add_argument('-f', '--force', help='force clear all data', default=False, action='store_true')
parser.add_argument('-f', '--force',
help='force clear all data', default=False, action='store_true')
args = parser.parse_args()
print(args)
print args
print('Using TensorFlow v.{}'.format(tf.__version__))
print 'Using TensorFlow v.{}'.format(tf.__version__)
base_path = Path(args.base_path).resolve(strict=False)
print('Base Path: {}'.format(base_path))
print 'Base Path: {}'.format(base_path)
data_path = base_path.joinpath(args.data).resolve(strict=False)
print('Train Path: {}'.format(data_path))
print 'Train Path: {}'.format(data_path)
target_path = Path(base_path).resolve(strict=False).joinpath(args.target)
print('Train File: {}'.format(target_path))
print 'Train File: {}'.format(target_path)
zip_path = args.zipfile
print('Acquiring data...')
download('https://aiadvocate.blob.core.windows.net/public/tacodata.zip', str(base_path), args.force)
print 'Acquiring data...'
download('https://aiadvocate.blob.core.windows.net/public/tacodata.zip',
str(base_path), args.force)
if os.path.exists(str(target_path)):
print('dataset text file already exists, skipping check')
print 'dataset text file already exists, skipping check'
else:
print('Testing images...')
print 'Testing images...'
images = walk_images(str(data_path), args.img_size)
# save file
print('writing dataset to {}'.format(target_path))
print 'writing dataset to {}'.format(target_path)
with open(str(target_path), 'w+') as f:
f.write('\n'.join(images))

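A short usage sketch for the helpers above, with download() and walk_images() in scope. The local paths are placeholders, and it assumes the extracted archive contains the train folder the pipeline expects.
# Hedged sketch: exercise the data-prep helpers locally. Paths are placeholders.
from pathlib2 import Path

base = Path('/tmp/tacodata').resolve(strict=False)
download('https://aiadvocate.blob.core.windows.net/public/tacodata.zip', str(base))
good_images = walk_images(str(base.joinpath('train')), image_size=160)
with open(str(base.joinpath('train.txt')), 'w+') as dataset_file:
    # persist the list of verified images, as the script's __main__ block does
    dataset_file.write('\n'.join(good_images))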
View File

@@ -1,32 +1,35 @@
import json
import time
import datetime
from io import BytesIO
import requests
import numpy as np
import datetime
from PIL import Image
from io import BytesIO
import tensorflow as tf
from azureml.core.model import Model
def init():
global model
try:
def init():
if Model.get_model_path('tacosandburritos'):
model_path = Model.get_model_path('tacosandburritos')
except:
else:
model_path = '/model/latest.h5'
print('Attempting to load model')
print 'Attempting to load model'
model = tf.keras.models.load_model(model_path)
model.summary()
print('Done!')
print 'Done!'
print('Initialized model "{}" at {}'.format(model_path, datetime.datetime.now()))
print 'Initialized model "{}" at {}'.format(model_path, datetime.datetime.now())
return model
def run(raw_data):
def run(raw_data, model):
prev_time = time.time()
post = json.loads(raw_data)
img_path = post['image']
@@ -34,8 +37,8 @@ def run(raw_data):
tensor = process_image(img_path, 160)
t = tf.reshape(tensor, [-1, 160, 160, 3])
o = model.predict(t, steps=1)#[0][0]
print(o)
o = model.predict(t, steps=1) # [0][0]
print o
o = o[0][0]
inference_time = datetime.timedelta(seconds=current_time - prev_time)
payload = {
@@ -44,28 +47,31 @@ def run(raw_data):
'scores': str(o)
}
print('Input ({}), Prediction ({})'.format(post['image'], payload))
print 'Input ({}), Prediction ({})'.format(post['image'], payload)
return payload
def process_image(path, image_size):
# Extract image (from web or path)
if(path.startswith('http')):
if path.startswith('http'):
response = requests.get(path)
img = np.array(Image.open(BytesIO(response.content)))
else:
img = np.array(Image.open(path))
img_tensor = tf.convert_to_tensor(img, dtype=tf.float32)
#tf.image.decode_jpeg(img_raw, channels=3)
# tf.image.decode_jpeg(img_raw, channels=3)
img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
return img_final
def info(msg, char = "#", width = 75):
print("")
print(char * width)
print(char + " %0*s" % ((-1*width)+5, msg) + char)
print(char * width)
def info(msg, char="#", width=75):
print ""
print char * width
print char + " %0*s" % ((-1 * width) + 5, msg) + char
print char * width
if __name__ == "__main__":
images = {
@@ -73,17 +79,17 @@ if __name__ == "__main__":
'burrito': 'https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg'
}
init()
my_model = init()
for k, v in images.items():
print('{} => {}'.format(k, v))
print '{} => {}'.format(k, v)
info('Taco Test')
taco = json.dumps({ 'image': images['tacos'] })
print(taco)
run(taco)
taco = json.dumps({'image': images['tacos']})
print taco
run(taco, my_model)
info('Burrito Test')
burrito = json.dumps({ 'image': images['burrito'] })
print(burrito)
run(burrito)
burrito = json.dumps({'image': images['burrito']})
print burrito
run(burrito, my_model)

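Since run() reads a JSON body with an 'image' key and returns a payload containing 'scores', a hedged sketch of calling a deployed scoring endpoint with the same contract looks like this; the scoring URI is a placeholder, not something defined in the repo.
# Hedged sketch: POST the same JSON contract that run() expects to a deployed endpoint.
import json
import requests

scoring_uri = 'http://<your-service>.azurecontainer.io/score'  # placeholder URI
body = json.dumps({'image': 'https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg'})
response = requests.post(scoring_uri, data=body,
                         headers={'Content-Type': 'application/json'})
# the response body is expected to carry the 'scores' payload built in run()
print(response.text)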
View File

@@ -1,22 +1,22 @@
import json
from os.path import relpath
import azureml
import argparse
from pathlib2 import Path
import azureml
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core.image import ContainerImage, Image
from azureml.core.webservice import Webservice, AciWebservice
from azureml.core.authentication import ServicePrincipalAuthentication
def info(msg, char = "#", width = 75):
print("")
print(char * width)
print(char + " %0*s" % ((-1*width)+5, msg) + char)
print(char * width)
def run(model_path, model_name, tenant_id, service_principal_id,
service_principal_password, subscription_id, resource_group, workspace, tags):
def info(msg, char="#", width=75):
print ""
print char * width
print char + " %0*s" % ((-1 * width) + 5, msg) + char
print char * width
def get_ws(tenant_id, service_principal_id,
service_principal_password, subscription_id, resource_group, workspace):
auth_args = {
'tenant_id': tenant_id,
'service_principal_id': service_principal_id,
@@ -28,18 +28,21 @@ def run(model_path, model_name, tenant_id, service_principal_id,
'subscription_id': subscription_id,
'resource_group': resource_group
}
ws = Workspace.get(workspace, **ws_args)
return ws
print(ws.get_details())
def run(mdl_path, model_name, ws, tgs):
print('\nSaving model {} to {}'.format(model_path, model_name))
print ws.get_details()
print '\nSaving model {} to {}'.format(mdl_path, model_name)
# Model Path needs to be relative
model_path = relpath(model_path, '.')
mdl_path = relpath(mdl_path, '.')
Model.register(ws, model_name=model_name, model_path=mdl_path, tags=tgs)
print 'Done!'
model = Model.register(ws, model_name=model_name, model_path=model_path, tags=tags)
print('Done!')
if __name__ == "__main__":
# argparse stuff for model path and model name
@@ -54,14 +57,14 @@ if __name__ == "__main__":
parser.add_argument('-r', '--resource_group', help='resource_group')
parser.add_argument('-w', '--workspace', help='workspace')
args = parser.parse_args()
print('Azure ML SDK Version: {}'.format(azureml.core.VERSION))
print 'Azure ML SDK Version: {}'.format(azureml.core.VERSION)
args.model = 'model/' + args.model
model_path = str(Path(args.base_path).resolve(strict=False).joinpath(args.model).resolve(strict=False))
params_path = str(Path(args.base_path).resolve(strict=False).joinpath('params.json').resolve(strict=False))
rgs = {
'model_path': model_path,
'model_name': args.model_name,
model_path = str(Path(args.base_path).resolve(
strict=False).joinpath(args.model).resolve(strict=False))
params_path = str(Path(args.base_path).resolve(
strict=False).joinpath('params.json').resolve(strict=False))
wsrgs = {
'tenant_id': args.tenant_id,
'service_principal_id': args.service_principal_id,
'service_principal_password': args.service_principal_password,
@@ -69,23 +72,29 @@ if __name__ == "__main__":
'resource_group': args.resource_group,
'workspace': args.workspace
}
rgs = {
'mdl_path': model_path,
'model_name': args.model_name
}
# printing out args for posterity
for i in rgs:
for i in wsrgs:
if i == 'service_principal_password':
print('{} => **********'.format(i))
print '{} => **********'.format(i)
else:
print('{} => {}'.format(i, rgs[i]))
print '{} => {}'.format(i, rgs[i])
with(open(str(params_path), 'r')) as f:
tags = json.load(f)
print('\n\nUsing the following tags:')
print '\n\nUsing the following tags:'
for tag in tags:
print('{} => {}'.format(tag, tags[tag]))
print '{} => {}'.format(tag, tags[tag])
rgs['tags'] = tags
workspc = get_ws(**wsrgs)
rgs['ws'] = workspc
run(**rgs)
# python register.py --model_path v --model_name c --tenant_id c

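Condensed, a hedged sketch of the same registration flow outside the pipeline, using the Azure ML SDK calls that get_ws() and run() wrap; every credential and resource value below is a placeholder.
# Hedged sketch: register a model directly with the Azure ML SDK.
# All identifiers are placeholders, not taken from the repo.
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core.authentication import ServicePrincipalAuthentication

auth = ServicePrincipalAuthentication(
    tenant_id='<tenant-id>',
    service_principal_id='<service-principal-id>',
    service_principal_password='<service-principal-password>')
ws = Workspace.get('<workspace>', auth=auth,
                   subscription_id='<subscription-id>',
                   resource_group='<resource-group>')
Model.register(ws, model_name='tacosandburritos',
               model_path='model/latest.h5',
               tags={'source': 'local-sketch'})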
View File

@@ -5,51 +5,52 @@ import hmac
import json
import hashlib
import argparse
from random import shuffle
import numpy as np
import tensorflow as tf
from pathlib2 import Path
from random import shuffle
from datetime import datetime
from tensorflow.data import Dataset
from pathlib2 import Path
global image_size
def info(msg, char = "#", width = 75):
def info(msg, char="#", width=75):
print("")
print(char * width)
print(char + " %0*s" % ((-1*width)+5, msg) + char)
print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
print(char * width)
def check_dir(path, check=False):
if check:
assert os.path.exists(path), '{} does not exist!'.format(path)
else:
if not os.path.exists(path):
os.makedirs(path)
return Path(path).resolve(strict=False)
def process_image(path, label):
def check_dir(path):
if not os.path.exists(path):
os.makedirs(path)
return Path(path).resolve(strict=False)
def process_image(path, label, img_size):
img_raw = tf.io.read_file(path)
img_tensor = tf.image.decode_jpeg(img_raw, channels=3)
img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
img_final = tf.image.resize(img_tensor, [img_size, img_size]) / 255
return img_final, label
def load_dataset(base_path, dataset, split=[8, 1, 1]):
def load_dataset(base_path, dset, split=None):
# normalize splits
if split is None:
split = [8, 1, 1]
splits = np.array(split) / np.sum(np.array(split))
# find labels - parent folder names
labels = {}
for (_, dirs, _) in os.walk(base_path):
print('found {}'.format(dirs))
labels = { k: v for (v, k) in enumerate(dirs) }
labels = {k: v for (v, k) in enumerate(dirs)}
print('using {}'.format(labels))
break
# load all files along with idx label
print('loading dataset from {}'.format(dataset))
with open(dataset, 'r') as d:
data = [(str(Path(f.strip()).absolute()), labels[Path(f.strip()).parent.name]) for f in d.readlines()]
print('loading dataset from {}'.format(dset))
with open(dset, 'r') as d:
data = [(str(Path(line.strip()).absolute()),
labels[Path(line.strip()).parent.name]) for line in d.readlines()]
print('dataset size: {}\nshuffling data...'.format(len(data)))
@@ -59,25 +60,29 @@ def load_dataset(base_path, dataset, split=[8, 1, 1]):
print('splitting data...')
# split data
train_idx = int(len(data) * splits[0])
eval_idx = int(len(data) * splits[1])
return data[:train_idx], \
data[train_idx:train_idx + eval_idx], \
data[train_idx + eval_idx:], \
labels
return data[:train_idx]
#@print_info
def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.0001, output='model', dataset=None):
img_shape = (image_size, image_size, 3)
# @print_info
def run(
dpath,
img_size=160,
epochs=10,
batch_size=32,
learning_rate=0.0001,
output='model',
dset=None):
img_shape = (img_size, img_size, 3)
info('Loading Data Set')
# load dataset
train, test, val, labels = load_dataset(data_path, dataset)
train = load_dataset(dpath, dset)
# training data
train_data, train_labels = zip(*train)
train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
Dataset.from_tensor_slices(list(train_labels))))
Dataset.from_tensor_slices(list(train_labels)), img_size))
train_ds = train_ds.map(map_func=process_image,
num_parallel_calls=5)
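One hedged way (not from the commit) to thread img_size into the mapping step while keeping Dataset.zip limited to datasets is a closure over process_image, reusing train_data, train_labels and img_size from run() above and assuming the same tf.data API:
# Hedged sketch, within run(): bind img_size via a lambda so Dataset.zip
# only receives dataset objects.
train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                        Dataset.from_tensor_slices(list(train_labels))))
train_ds = train_ds.map(
    map_func=lambda path, label: process_image(path, label, img_size),
    num_parallel_calls=5)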
@@ -91,8 +96,8 @@ def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.000
# model
info('Creating Model')
base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
include_top=False,
weights='imagenet')
base_model.trainable = True
model = tf.keras.Sequential([
@@ -102,15 +107,15 @@ def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.000
])
model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
loss='binary_crossentropy',
metrics=['accuracy'])
model.summary()
# training
info('Training')
steps_per_epoch = math.ceil(len(train)/batch_size)
history = model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)
steps_per_epoch = math.ceil(len(train) / batch_size)
model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)
# save model
info('Saving Model')
@@ -130,19 +135,20 @@ def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.000
return generate_hash(file_output, 'kf_pipeline')
def generate_hash(file, key):
print('Generating hash for {}'.format(file))
def generate_hash(dfile, key):
print('Generating hash for {}'.format(dfile))
m = hmac.new(str.encode(key), digestmod=hashlib.sha256)
BUF_SIZE = 65536
with open(str(file), 'rb') as f:
with open(str(dfile), 'rb') as myfile:
while True:
data = f.read(BUF_SIZE)
data = myfile.read(BUF_SIZE)
if not data:
break
m.update(data)
return m.hexdigest()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='transfer learning for binary image task')
parser.add_argument('-s', '--base_path', help='directory to base data', default='../../data')
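For reference, a short usage sketch of the hashing helper above; the file path is a placeholder.
# Hedged sketch: sign an artifact the same way the pipeline steps do.
signature = generate_hash('model/latest.h5', 'kf_pipeline')  # placeholder path
print('artifact signature: {}'.format(signature))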
@@ -165,13 +171,13 @@ if __name__ == "__main__":
params = Path(args.base_path).joinpath('params.json')
args = {
"data_path": str(data_path),
"image_size": image_size,
"epochs": args.epochs,
"batch_size": args.batch,
"learning_rate": args.lr,
"output": str(target_path),
"dataset": str(dataset)
"dpath": str(data_path),
"img_size": image_size,
"epochs": args.epochs,
"batch_size": args.batch,
"learning_rate": args.lr,
"output": str(target_path),
"dset": str(dataset)
}
dataset_signature = generate_hash(dataset, 'kf_pipeline')