#!/usr/bin/env python3
# Copyright 2021 kubeflow.org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent.futures
import datetime
import functools
import os
import tempfile
import time
import threading
import json
import yaml

from collections import defaultdict
from datetime import datetime as dt
from datetime import timedelta
from os import environ as env
from pathlib import Path
from typing import Dict, List, Mapping, Optional, Tuple

from kfp_server_api import ApiException, ApiRun, ApiRunDetail
from kfp_tekton.compiler.main import compile_pyfile
from kfp_tekton._client import TektonClient
from kfp_tekton.compiler.pipeline_utils import TektonPipelineConf


# =============================================================================
# load test settings from environment variables
# =============================================================================

# TODO: turn env vars into script parameters, use argparse
PUBLIC_IP = env.get("PUBLIC_IP")
NAMESPACE = env.get("NAMESPACE", None)
USER_INFO = env.get("USER_INFO")
CONNECT_SID = env.get("CONNECT_SID")
NUM_WORKERS = int(env.get("NUM_WORKERS", 1))
TEST_CONFIG = env.get("TEST_CONFIG") or Path(__file__).parents[0].joinpath("perf_test_config.yaml")
EXPERIMENT = env.get("EXPERIMENT_NAME", "PERF_TEST")
OUTPUT_FILE = env.get("OUTPUT_FILE", f"perf_test_{dt.now().strftime('%Y%m%d_%H%M%S')}_N{NUM_WORKERS}_{PUBLIC_IP}.csv")
OUTPUT_SEP = env.get("OUTPUT_SEP", ",")


print(f"Environment variables:\n\n"
|
|
f" PUBLIC_IP: {PUBLIC_IP}\n"
|
|
f" NAMESPACE: {NAMESPACE}\n"
|
|
f" USER_INFO: {USER_INFO}\n"
|
|
f" CONNECT_SID: {CONNECT_SID}\n"
|
|
f" NUM_WORKERS: {NUM_WORKERS}\n"
|
|
f" TEST_CONFIG: {TEST_CONFIG}\n"
|
|
f" EXPERIMENT: {EXPERIMENT}\n"
|
|
f" OUTPUT_FILE: {OUTPUT_FILE}\n"
|
|
f" OUTPUT_SEP: {OUTPUT_SEP}\n")
|
|
|
|
|
|
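# A minimal sketch of how this script might be invoked; the IP address, worker
# count, and file names below are hypothetical:
#
#   PUBLIC_IP=127.0.0.1 NUM_WORKERS=3 TEST_CONFIG=perf_test_config.yaml \
#       python3 performance_tests.py

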
# =============================================================================
# local variables
# =============================================================================

execution_times: Dict[str, Dict[str, timedelta]] = defaultdict(dict)


def record_execution_time(pipeline_name: str,
                          function_name: str,
                          execution_time: timedelta):
    execution_times[pipeline_name][function_name] = execution_time


# decorator to record execution times
def time_it(function):

    @functools.wraps(function)
    def _timed_function(*args, **kwargs):

        start_time = dt.now()
        functions_result = function(*args, **kwargs)
        execution_time = dt.now() - start_time

        assert "pipeline_name" in kwargs, \
            f"The function '{function.__name__}' has to be invoked with the" \
            f" keyword argument 'pipeline_name'."

        record_execution_time(pipeline_name=kwargs["pipeline_name"],
                              function_name=function.__name__,
                              execution_time=execution_time)

        return functions_result

    return _timed_function


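# Illustrative use of @time_it (the function `do_work` here is hypothetical):
# the decorated function must be called with the keyword argument
# `pipeline_name`, and the measured duration is recorded in `execution_times`:
#
#   @time_it
#   def do_work(*, pipeline_name: str): ...
#
#   do_work(pipeline_name="my-pipeline")
#   execution_times["my-pipeline"]["do_work"]  # -> timedelta

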
# decorator to ensure the wrapped function is executed synchronously
def synchronized(function):
    lock = threading.Lock()

    @functools.wraps(function)
    def _synchronized_function(*args, **kwargs):
        with lock:
            result = function(*args, **kwargs)
        return result

    return _synchronized_function


# TODO: cannot compile multiple pipelines in parallel due to the use of static
# variables, causing the exception "Nested pipelines are not allowed." in
# kfp/dsl/_pipeline.py:
#     def __enter__(self):
#         if Pipeline._default_pipeline:
#             raise Exception('Nested pipelines are not allowed.')
# NOTE: synchronizing the method compile_pipeline could skew the recorded
# compilation times since the start times for all pipelines are equal, but some
# pipelines will wait for others to be compiled, with that wait time included in
# the total compilation time.
# We need to change the test design to run and time pipeline compilation
# sequentially and only execute the pipelines in parallel.
@synchronized  # keep decorator precedence: synchronize outside of (before) time_it
@time_it       # time_it inside the synchronized block so idle wait is not recorded
def compile_pipeline(*,  # force kwargs for the time_it decorator to get pipeline_name
                     pipeline_name: str,
                     pipeline_script: Path) -> str:

    file_name = pipeline_name + '.yaml'
    tmpdir = tempfile.gettempdir()  # TODO: keep compiled pipelines?
    pipeline_package_path = os.path.join(tmpdir, file_name)
    pipeline_conf = TektonPipelineConf()

    try:
        compile_pyfile(pyfile=pipeline_script,
                       function_name=None,
                       output_path=pipeline_package_path,
                       type_check=True,
                       tekton_pipeline_conf=pipeline_conf)
    except ValueError as e:
        print(f"{e.__class__.__name__} trying to compile {pipeline_script}: {str(e)}")

    # TODO: delete these files after running the test or keep them for inspection?
    return pipeline_package_path


@time_it
def submit_pipeline_run(*,  # force kwargs for the time_it decorator to get pipeline_name
                        pipeline_name: str,
                        pipeline_file: str,
                        arguments: Optional[Mapping[str, str]] = None):

    client = get_client()
    experiment = client.create_experiment(EXPERIMENT)  # get or create

    try:
        run_result: ApiRun = client.run_pipeline(
            experiment_id=experiment.id,
            job_name=pipeline_name,
            pipeline_package_path=pipeline_file,
            params=arguments)
        return run_result.id

    except ApiException as e:
        print(f"KFP Server Exception trying to submit pipeline {pipeline_file}:"
              f" '{e.reason}' {e.status} '{e.body}'"
              f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}")

    except Exception as e:
        print(f"Exception trying to submit pipeline {pipeline_file}:"
              f" '{str(e)}'"
              f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}")

    return None


@time_it
def wait_for_run_to_complete(*,  # force kwargs so the time_it decorator can get pipeline_name
                             pipeline_name: str,
                             run_id: str) -> Optional[ApiRunDetail]:
    if not run_id:
        return None

    client = get_client()
    status = None

    while status not in ["Succeeded", "Failed", "Error", "Skipped", "Terminated",
                         "Completed", "CouldntGetTask"]:
        try:
            run_detail: ApiRunDetail = client.get_run(run_id)
            run: ApiRun = run_detail.run
            status = run.status
        except ApiException as e:  # TODO: add timeout or max retries on ApiError
            print(f"KFP Server Exception waiting for {pipeline_name} run {run_id}: {e.reason}")
            time.sleep(10)

        time.sleep(0.1)

    print(f"{pipeline_name.ljust(20)[:20]}"
          f" {run.status.lower().ljust(10)[:10]}"
          f" after {(run.finished_at - run.created_at)}"
          f" ({run.created_at.strftime('%H:%M:%S')}->{run.finished_at.strftime('%H:%M:%S')})")

    return run_detail


def get_client() -> TektonClient:

    host = f"http://{PUBLIC_IP}/pipeline"
    cookies = f"connect.sid={CONNECT_SID}; userinfo={USER_INFO}" if CONNECT_SID and USER_INFO else None
    client = TektonClient(host=host, cookies=cookies)
    client.set_user_namespace(NAMESPACE)  # overwrite system default with None if necessary

    return client


def get_project_root_dir() -> Path:

    script_path_presumed = "sdk/python/tests/performance_tests.py"
    script_path_actually = Path(__file__)
    project_root_folder = script_path_actually.parents[3]

    assert script_path_actually == project_root_folder.joinpath(script_path_presumed), \
        "Cannot determine project root folder. Was this script file moved or renamed?"

    return project_root_folder


def load_test_config() -> dict:

    # TEST_CONFIG defaults to perf_test_config.yaml next to this script (see above)
    with open(TEST_CONFIG, "r") as f:
        test_config = yaml.safe_load(f)

    return test_config


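# A minimal sketch of the expected TEST_CONFIG format, inferred from the keys
# read in load_pipeline_scripts() below; the paths and names are hypothetical:
#
#   pipeline_scripts:
#     - path: samples/flip-coin/condition.py    # relative to the project root
#       name: flip_coin                         # optional, defaults to the file stem
#       copies: 3                               # optional, defaults to 1
#     - path: /tmp/my_pipeline.py               # absolute paths are used as-is

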
def load_pipeline_scripts() -> List[Tuple[Path, str]]:

    pipeline_files_with_name = []
    test_config = load_test_config()
    project_dir = get_project_root_dir()

    for path_name_dict in test_config["pipeline_scripts"]:

        path = path_name_dict["path"]
        name = path_name_dict.get("name") or Path(path).stem
        copies = path_name_dict.get("copies", 1)
        if not Path(path).is_absolute():
            # path assumed to be relative to project root
            fp: Path = project_dir.joinpath(path)
        else:
            # path is absolute
            fp = Path(path)

        assert fp.exists(), f"Cannot find file: {fp.resolve()}"
        for i in range(int(copies)):
            pipeline_files_with_name.append((fp, f'{name}{i}'))

    print(f"Loaded {len(pipeline_files_with_name)} pipelines from {TEST_CONFIG}\n")

    return pipeline_files_with_name


def run_concurrently(pipelinescript_name_tuples: List[Tuple[Path, str]]):

    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        performance_tests = (
            executor.submit(run_single_pipeline_performance_test, pipeline_script, name)
            for (pipeline_script, name) in pipelinescript_name_tuples
        )
        for performance_test in concurrent.futures.as_completed(performance_tests):
            try:
                run_details = performance_test.result()  # noqa F841
            except Exception as e:
                error = f"{e.__class__.__name__}: {str(e)}"
                print(error)


def run_single_pipeline_performance_test(pipeline_script: Path,
                                         pipeline_name: str):
    try:
        pipeline_file = compile_pipeline(pipeline_name=pipeline_name, pipeline_script=pipeline_script)
        run_id = submit_pipeline_run(pipeline_name=pipeline_name, pipeline_file=pipeline_file)
        run_details = wait_for_run_to_complete(pipeline_name=pipeline_name, run_id=run_id)

        status = run_details.run.status if run_details else "Error"
        task_details = parse_run_details(run_details)

        append_exec_times_to_output_file(pipeline_name, status, task_details)

    except Exception as e:
        error = f"{e.__class__.__name__} while testing '{pipeline_name}': {str(e)}"
        print(error)


def parse_run_details(run_details: ApiRunDetail) -> dict:
    task_details = {}

    if not run_details:
        return {}

    pipelinerun = json.loads(run_details.to_dict()["pipeline_runtime"]["workflow_manifest"])
    status = pipelinerun["status"]

    def get_details(data):

        info = {}
        total = timedelta(0)
        count = 0
        dt_fmt = "%Y-%m-%dT%H:%M:%SZ"

        for run in data.values():
            run_status = run["status"]
            conditions = run_status["conditions"]
            state = conditions[-1]['type']
            elapsed = dt.strptime(run_status['completionTime'], dt_fmt) - dt.strptime(run_status['startTime'], dt_fmt)
            info[run['pipelineTaskName']] = {
                "elapsed": elapsed,
                "status": state
            }
            count += 1
            total += elapsed

        info["count"] = count
        info["total_elapsed"] = total
        return info

    if "taskRuns" in status:
        task_details["taskRuns"] = get_details(status["taskRuns"])

    if "runs" in status:
        task_details["runs"] = get_details(status["runs"])

    return task_details


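# For orientation, a minimal (hypothetical) outline of the PipelineRun manifest
# that parse_run_details() reads; only the fields accessed above are shown:
#
#   {"status": {"taskRuns": {"<taskrun-id>": {
#       "pipelineTaskName": "my-task",
#       "status": {"startTime": "2021-01-01T00:00:00Z",
#                  "completionTime": "2021-01-01T00:01:00Z",
#                  "conditions": [{"type": "Succeeded"}]}}}}}

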
def append_exec_times_to_output_file(pipeline_name: str,
                                     status: str = "",
                                     tasks: Optional[dict] = None):

    tasks = tasks or {}
    compile_time = execution_times[pipeline_name][compile_pipeline.__name__]
    submit_time = execution_times[pipeline_name][submit_pipeline_run.__name__]
    run_time = execution_times[pipeline_name][wait_for_run_to_complete.__name__]
    taskruns = 0
    taskrun_elapsed = timedelta(0)
    runs = 0
    run_elapsed = timedelta(0)

    if "taskRuns" in tasks:
        taskruns = tasks["taskRuns"]["count"]
        taskrun_elapsed = tasks["taskRuns"]["total_elapsed"]

    if "runs" in tasks:
        runs = tasks["runs"]["count"]
        run_elapsed = tasks["runs"]["total_elapsed"]

    taskruns_average = taskrun_elapsed / taskruns if taskruns > 0 else taskrun_elapsed
    runs_average = run_elapsed / runs if runs > 0 else run_elapsed
    non_task_average = (taskrun_elapsed + run_elapsed) / (taskruns + runs) if (taskruns + runs) > 0 \
        else (taskrun_elapsed + run_elapsed)

    with open(OUTPUT_FILE, "a") as f:
        f.write(OUTPUT_SEP.join([
            pipeline_name, status, str(compile_time), str(submit_time), str(run_time),
            str(taskruns), str(runs), str(taskrun_elapsed), str(run_elapsed),
            str(taskrun_elapsed + run_elapsed), str(run_time - (taskrun_elapsed + run_elapsed)),
            str(taskruns_average), str(runs_average),
            str(taskruns + runs), str(non_task_average)
        ]))
        f.write("\n")


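# Illustrative CSV row appended for one pipeline (all values hypothetical;
# timedelta columns are rendered via str(), e.g. "0:00:05.123456"):
#
#   sequential0,Completed,0:00:02.1,0:00:00.4,0:01:40,4,0,0:01:20,0:00:00,0:01:20,0:00:20,0:00:20,0:00:00,4,0:00:20

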
def create_output_file():

    with open(OUTPUT_FILE, "w") as f:
        f.write(OUTPUT_SEP.join([
            "Pipeline", "Status", "Compile_Time", "Submit_Time", "Run_Time",
            "Num_TaskRuns", "Num_Runs", "Total_TaskRun_Time", "Total_Run_Time",
            "Total_Time_Spent_On_Tasks", "Time_Spent_Outside_Of_Tasks", "Average_Taskrun_Time",
            "Average_RunCR_Time", "Total_Number_Of_Tasks", "Average_Time_Spent_Outside_Of_Tasks"
        ]))
        f.write("\n")


def run_performance_tests():

    create_output_file()
    pipeline_scripts = load_pipeline_scripts()

    if NUM_WORKERS == 1:  # TODO: use `run_concurrently()` even with 1 worker
        for script, name in pipeline_scripts:
            run_single_pipeline_performance_test(script, name)
    else:
        run_concurrently(pipeline_scripts)


if __name__ == '__main__':

    run_performance_tests()

    # ad-hoc snippets for interactive debugging:
    # client = get_client()
    # from kfp_server_api import ApiListExperimentsResponse, ApiExperiment, ApiListRunsResponse
    # experiments: ApiListExperimentsResponse = client.list_experiments()
    # experiment: ApiExperiment = client.create_experiment(name='PERF_TESTS')
    # experiment: ApiExperiment = client.get_experiment(experiment_name='PERF_TEST')
    # runs: ApiListRunsResponse = client.list_runs(experiment_id=experiment.id, page_size=100)
    # print("Experiments: " + ", ".join([e.name for e in experiments.experiments]))
    # print("Runs: " + ", ".join([r.name for r in runs.runs]))