185 lines
6.5 KiB
Python
185 lines
6.5 KiB
Python
"""
|
|
Usage:
|
|
python arena_launcher.py
|
|
--name=tf-test
|
|
--tensorboard=true
|
|
mpijob
|
|
--gpus=1
|
|
--workers=2
|
|
--image=registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5
|
|
--
|
|
mpirun python /benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model resnet101 --batch_size 64 --variable_update horovod --train_dir=/training_logs --summary_verbosity=3 --save_summaries_steps=10
|
|
"""
|
|
# TODO: Add unit/integration tests
|
|
|
|
import argparse
|
|
import datetime
|
|
import json
|
|
import os
|
|
import sys
|
|
import logging
|
|
import requests
|
|
import subprocess
|
|
import six
|
|
import time
|
|
import yaml
|
|
from subprocess import Popen,PIPE
|
|
from shlex import split
|
|
from utils import *
|
|
from job_generator import *
|
|
|
|
def _build_arg_parser():
    """Create the argument parser for the launcher options and sub-commands.

    All launcher options are declared on the top-level parser. The ``mpijob``
    and ``job`` sub-parsers declare no options of their own; they only store
    the matching command generator as ``func`` on the parsed namespace.
    """
    parser = argparse.ArgumentParser(description='Arena launcher')
    parser.add_argument('--name', type=str, default=None,
                        help='The job name to specify.')
    parser.add_argument('--tensorboard', type=str, default="False")
    parser.add_argument('--rdma', type=str, default="False")
    parser.add_argument('--tensorboard-image', type=str,
                        default='tensorflow/tensorflow:1.12.0')
    # The value is passed to timedelta(hours=...) in main, so the unit is hours.
    parser.add_argument('--timeout-hours', type=int, default=200,
                        help='Time in hours to wait for the Job submitted '
                             'by arena to complete')
    # The value is passed to timedelta(minutes=...) in main, so the unit is
    # minutes.
    parser.add_argument('--pending-timeout-minutes', type=int, default=360,
                        help='Time in minutes to wait for the Job submitted '
                             'by arena from pending to running')
    parser.add_argument('--output-dir', type=str, default='')
    parser.add_argument('--output-data', type=str, default='None')
    parser.add_argument('--log-dir', type=str, default='')

    parser.add_argument('--image', type=str)
    parser.add_argument('--gpus', type=int, default=0)
    parser.add_argument('--cpu', type=str, default='0')
    parser.add_argument('--memory', type=str, default='0')
    parser.add_argument('--workers', type=int, default=2)

    parser.add_argument('--env', action='append', type=str, default=[])
    parser.add_argument('--data', action='append', type=str, default=[])
    parser.add_argument('--metric', action='append', type=str, default=[])
    parser.add_argument('--sync-source', type=str, default='')

    parser.add_argument('--workflow-name', type=str, default='')
    parser.add_argument('--step-name', type=str, default='')

    subparsers = parser.add_subparsers(help='arena sub-command help')

    # Parser for the 'mpijob' command.
    parser_mpi = subparsers.add_parser('mpijob', help='mpijob help')
    parser_mpi.set_defaults(func=generate_mpjob_command)

    # Parser for the 'job' command.
    parser_job = subparsers.add_parser('job', help='job help')
    parser_job.set_defaults(func=generate_job_command)

    return parser


def _split_at_separator(all_args):
    """Split the command line at the first ``--`` token.

    Returns a ``(launcher_args, remaining_args)`` pair: the tokens before the
    separator and the tokens after it. When no separator is present every
    token is treated as a launcher argument and ``remaining_args`` is empty,
    so the caller can report the missing command instead of crashing.
    """
    try:
        separator_idx = all_args.index('--')
    except ValueError:
        return list(all_args), []
    return all_args[:separator_idx], all_args[separator_idx + 1:]


def _collect_metric_entries(fullname, job_type, metric_specs):
    """Build the list of metric dicts written to the metrics JSON file.

    Each spec has the form ``name`` or ``name:unit``; the unit defaults to
    "RAW" for every spec that does not give one. Specs whose collected value
    is not greater than zero, or whose lower-cased name does not match the
    name pattern, are skipped.
    """
    import re

    # Compiled once for all specs; only the matched prefix is used as the name.
    name_pattern = re.compile('^[a-z]([-a-z0-9]{0,62}[a-z0-9])?')

    entries = []
    for spec in metric_specs:
        parts = spec.split(":")
        metric_name = parts[0]
        # Reset per spec so a unit given for one metric does not leak into
        # the following ones.
        metric_unit = parts[1] if len(parts) > 1 else "RAW"
        logging.info("determine metric name {0} with metric unit {1}".format(metric_name, metric_unit))

        value = collect_metrics(fullname, job_type, metric_name)
        # Non-positive values are not reported (presumably collect_metrics
        # signals "not found" that way -- confirm against utils).
        if not (value > 0):
            continue

        matched = name_pattern.search(metric_name.lower())
        if matched is None:
            logging.info("Failed to parse metric_name {0},skip".format(metric_name))
            continue

        metric_data = {
            # The name of the metric. Visualized as the column name in the
            # runs table.
            'name': matched.group(0),
            # The value of the metric. Must be a numeric value.
            'numberValue': value,
            # The optional format of the metric. Supported values are "RAW"
            # (displayed in raw format) and "PERCENTAGE" (displayed in
            # percentage format).
            'format': metric_unit,
        }
        logging.info("metric data: {0}".format(metric_data))
        entries.append(metric_data)
    return entries


def _write_text(path, text):
    """Write ``text`` to ``path``, replacing any existing content."""
    with open(path, 'w') as f:
        f.write(text)


def main(argv=None):
    """Submit a training job through arena and wait for its completion.

    The command line (always read from ``sys.argv``) has two parts separated
    by ``--``: launcher options plus a sub-command (``mpijob`` or ``job``)
    before the separator, and the command to run inside the job after it.

    Steps: build the arena command with the sub-command's generator, submit
    it under a timestamped job name, wait for the job to start running,
    stream its logs, wait for it to finish, and on success write the
    requested metrics to ``/mlpipeline-metrics.json`` followed by the
    ``/output.txt``, ``/workflow-name.txt``, ``/step-name.txt`` and
    ``/name.txt`` files.

    Args:
        argv: Unused; kept for interface compatibility. Arguments are taken
            from ``sys.argv``.

    Exits with status -1 (via ``sys.exit``) when the job name, the command or
    the sub-command is missing, or when the job does not end in the
    "SUCCEEDED" state.
    """
    setup_custom_logging()
    all_args = sys.argv[1:]
    logging.info("args: {0}".format(' '.join(sys.argv)))

    launcher_args, remaining_args = _split_at_separator(all_args)
    args = _build_arg_parser().parse_args(launcher_args)

    # Validate before generating the command so the generator never sees an
    # incomplete namespace.
    if args.name is None:
        logging.error("Please specify the name")
        sys.exit(-1)
    if not remaining_args:
        logging.error("Please specify the command.")
        sys.exit(-1)
    if not hasattr(args, 'func'):
        # On Python 3 sub-commands are optional for argparse, so 'func' may
        # be absent.
        logging.error("Please specify the sub-command: mpijob or job.")
        sys.exit(-1)

    commandArray, job_type = args.func(args)

    internalCommand = ' '.join(remaining_args)

    name = args.name
    # Timestamp suffix (year, month, day, hour, minute, second) to make the
    # submitted job name unique per run.
    fullname = name + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    timeout_hours = args.timeout_hours
    logging.info("timeout_hours: {0}".format(timeout_hours))

    # The result is not used here; the call is kept so that --tensorboard is
    # still passed through str2bool as before (presumably it rejects invalid
    # values -- confirm against utils).
    str2bool(args.tensorboard)

    # Rewrite the job name only in the arena-generated tokens, so that text
    # inside the user's own command is never altered.
    original_flag = "--name={0}".format(name)
    unique_flag = "--name={0}".format(fullname)
    commandArray = [part.replace(original_flag, unique_flag)
                    for part in commandArray]
    # NOTE(review): the user command is wrapped in double quotes without
    # escaping; embedded double quotes may break the final command line.
    commandArray.append('"{0}"'.format(internalCommand))
    command = ' '.join(commandArray)

    logging.info('Start training {0}.'.format(command))

    submit_job(command)

    # Wait for the job to leave the pending state, then follow its logs.
    pending_timeout_minutes = args.pending_timeout_minutes
    wait_job_running(fullname, job_type,
                     datetime.timedelta(minutes=pending_timeout_minutes))

    rc = job_logging(fullname, job_type)
    logging.info("rc: {0}".format(rc))

    wait_job_done(fullname, job_type, datetime.timedelta(hours=timeout_hours))

    status = get_job_status(fullname, job_type)

    if status != "SUCCEEDED":
        if status == "FAILED":
            logging.error("Training Job {0} fail.".format(fullname))
        else:
            logging.error("Training Job {0}'s status {1}".format(fullname, status))
        sys.exit(-1)

    logging.info("Training Job {0} success.".format(fullname))
    if args.metric:
        metrics_data = {
            'metrics': _collect_metric_entries(fullname, job_type, args.metric),
        }
        with open('/mlpipeline-metrics.json', 'w') as f:
            logging.info("metrics: {0}".format(metrics_data))
            json.dump(metrics_data, f)
        logging.info("write down /mlpipeline-metrics.json")

    # TODO(cheyang): copy the output.txt from training job
    output = ""
    _write_text('/output.txt', output)
    _write_text('/workflow-name.txt', args.workflow_name)
    _write_text('/step-name.txt', args.step_name)
    _write_text('/name.txt', args.name)
|
|
|
|
# Script entry point: start the launcher only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|