#!/usr/bin/env python3
# Copyright 2021 kubeflow.org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent.futures
import datetime
import functools
import json
import os
import tempfile
import threading
import time
import yaml

from collections import defaultdict
from datetime import datetime as dt
from datetime import timedelta
from os import environ as env
from pathlib import Path
from typing import Dict, Mapping

from kfp_server_api import ApiException, ApiRun, ApiRunDetail
from kfp_tekton.compiler.main import compile_pyfile
from kfp_tekton._client import TektonClient
from kfp_tekton.compiler.pipeline_utils import TektonPipelineConf


# =============================================================================
#  load test settings from environment variables
# =============================================================================

# TODO: turn env vars into script parameters, use argparse
PUBLIC_IP = env.get("PUBLIC_IP")
NAMESPACE = env.get("NAMESPACE", None)
USER_INFO = env.get("USER_INFO")
CONNECT_SID = env.get("CONNECT_SID")
NUM_WORKERS = int(env.get("NUM_WORKERS", 1))
TEST_CONFIG = env.get("TEST_CONFIG") or Path(__file__).parents[0].joinpath("perf_test_config.yaml")
EXPERIMENT = env.get("EXPERIMENT_NAME", "PERF_TEST")
OUTPUT_FILE = env.get("OUTPUT_FILE",
                      f"perf_test_{dt.now().strftime('%Y%m%d_%H%M%S')}_N{NUM_WORKERS}_{PUBLIC_IP}.csv")
OUTPUT_SEP = env.get("OUTPUT_SEP", ",")

print(f"Environment variables:\n\n"
      f"  PUBLIC_IP:   {PUBLIC_IP}\n"
      f"  NAMESPACE:   {NAMESPACE}\n"
      f"  USER_INFO:   {USER_INFO}\n"
      f"  CONNECT_SID: {CONNECT_SID}\n"
      f"  NUM_WORKERS: {NUM_WORKERS}\n"
      f"  TEST_CONFIG: {TEST_CONFIG}\n"
      f"  EXPERIMENT:  {EXPERIMENT}\n"
      f"  OUTPUT_FILE: {OUTPUT_FILE}\n"
      f"  OUTPUT_SEP:  {OUTPUT_SEP}\n")


# =============================================================================
#  local variables
# =============================================================================

execution_times: Dict[str, Dict[str, timedelta]] = defaultdict(dict)


def record_execution_time(pipeline_name: str,
                          function_name: str,
                          execution_time: timedelta):
    execution_times[pipeline_name][function_name] = execution_time


# decorator to record execution times
def time_it(function):
    @functools.wraps(function)
    def _timed_function(*args, **kwargs):
        start_time = dt.now()
        functions_result = function(*args, **kwargs)
        execution_time = dt.now() - start_time
        assert "pipeline_name" in kwargs, \
            f"The function '{function.__name__}' has to be invoked with the keyword" \
            f" argument 'pipeline_name'."
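        # store the measured wall-clock duration keyed by pipeline name and wrapped function name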
record_execution_time(pipeline_name=kwargs["pipeline_name"], function_name=function.__name__, execution_time=execution_time) return functions_result return _timed_function # method annotation to ensure the wrapped function is executed synchronously def synchronized(function): lock = threading.Lock() @functools.wraps(function) def _synchronized_function(*args, **kwargs): with lock: result = function(*args, **kwargs) return result return _synchronized_function # TODO: cannot compile multiple pipelines in parallel due to use of static variables # causing Exception "Nested pipelines are not allowed." in kfp/dsl/_pipeline.py # def __enter__(self): # if Pipeline._default_pipeline: # raise Exception('Nested pipelines are not allowed.') # NOTE: synchronizing the method compile_pipeline could skews the recorded compilation # times since the start time for all pipelines are equal, but some pipelines will # wait for others to be compiled with the wait time included in the total compilation # time. # We need to change test design to run and time pipeline compilation sequentially # and only execute pipelines in parallel @synchronized # keep decorator precedence: synchronize outside of (before) time_it @time_it # time_it inside the synchronized block so idle wait is not recorded def compile_pipeline(*, # force kwargs for time_it decorator to get pipeline_name pipeline_name: str, pipeline_script: Path) -> str: file_name = pipeline_name + '.yaml' tmpdir = tempfile.gettempdir() # TODO: keep compiled pipelines? pipeline_package_path = os.path.join(tmpdir, file_name) pipeline_conf = TektonPipelineConf() try: compile_pyfile(pyfile=pipeline_script, function_name=None, output_path=pipeline_package_path, type_check=True, tekton_pipeline_conf=pipeline_conf) except ValueError as e: print(f"{e.__class__.__name__} trying to compile {pipeline_script}: {str(e)}") # TODO: delete those files after running test or keep for inspection? 
    return pipeline_package_path


@time_it
def submit_pipeline_run(*,  # force kwargs for time_it decorator to get pipeline_name
                        pipeline_name: str,
                        pipeline_file: str,
                        arguments: Mapping[str, str] = None):

    client = get_client()
    experiment = client.create_experiment(EXPERIMENT)  # get or create

    try:
        run_result: ApiRun = client.run_pipeline(
            experiment_id=experiment.id,
            job_name=pipeline_name,
            pipeline_package_path=pipeline_file,
            params=arguments)
        return run_result.id

    except ApiException as e:
        print(f"KFP Server Exception trying to submit pipeline {pipeline_file}:"
              f" '{e.reason}' {e.status} '{e.body}'"
              f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}")

    except Exception as e:
        print(f"Exception trying to submit pipeline {pipeline_file}:"
              f" '{str(e)}'"
              f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}")

    return None


@time_it
def wait_for_run_to_complete(*,  # force kwargs so the time_it decorator can get pipeline_name
                             pipeline_name: str,
                             run_id: str) -> ApiRunDetail:

    if not run_id:
        return None

    client = get_client()
    status = None

    while status not in ["Succeeded", "Failed", "Error", "Skipped", "Terminated",
                         "Completed", "CouldntGetTask"]:
        try:
            run_detail: ApiRunDetail = client.get_run(run_id)
            run: ApiRun = run_detail.run
            status = run.status
        except ApiException as e:
            # TODO: add timeout or max retries on ApiError
            print(f"KFP Server Exception waiting for {pipeline_name} run {run_id}: {e.reason}")
            time.sleep(10)

        time.sleep(0.1)

    print(f"{pipeline_name.ljust(20)[:20]}"
          f" {run.status.lower().ljust(10)[:10]}"
          f" after {(run.finished_at - run.created_at)}"
          f" ({run.created_at.strftime('%H:%M:%S')}->{run.finished_at.strftime('%H:%M:%S')})")

    return run_detail


def get_client() -> TektonClient:
    host = f"http://{PUBLIC_IP}/pipeline"
    cookies = f"connect.sid={CONNECT_SID}; userinfo={USER_INFO}" if CONNECT_SID and USER_INFO else None
    client = TektonClient(host=host, cookies=cookies)
    client.set_user_namespace(NAMESPACE)  # overwrite system default with None if necessary
    return client


def get_project_root_dir() -> Path:
    script_path_presumed = "sdk/python/tests/performance_tests.py"
    script_path_actually = Path(__file__)
    project_root_folder = script_path_actually.parents[3]
    assert script_path_actually == project_root_folder.joinpath(script_path_presumed), \
        "Can not determine project root folder. Was this script file moved or renamed?"
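    # parents[3] of "sdk/python/tests/performance_tests.py" is the repository root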
    return project_root_folder


def load_test_config() -> dict:
    # script_path = Path(__file__)
    # script_dir = script_path.parents[0]
    # config_file = script_dir.joinpath("perf_test_config.yaml")
    with open(TEST_CONFIG, "r") as f:
        test_config = yaml.safe_load(f)
    return test_config


def load_pipeline_scripts() -> [(Path, str)]:
    pipeline_files_with_name = []
    test_config = load_test_config()
    project_dir = get_project_root_dir()

    for path_name_dict in test_config["pipeline_scripts"]:
        path = path_name_dict["path"]
        name = path_name_dict.get("name") or Path(path).stem
        copies = path_name_dict.get("copies", 1)

        fp: Path = Path(path)
        if not fp.is_absolute():
            # path assumed to be relative to project root
            fp = project_dir.joinpath(path)

        assert fp.exists(), f"Cannot find file: {fp.resolve()}"

        for i in range(int(copies)):
            pipeline_files_with_name.append((fp, f'{name}{i}'))

    print(f"Loaded {len(pipeline_files_with_name)} pipelines from {TEST_CONFIG}\n")

    return pipeline_files_with_name


def run_concurrently(pipelinescript_name_tuples: [(Path, str)]):
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        performance_tests = (
            executor.submit(run_single_pipeline_performance_test, pipeline_script, name)
            for (pipeline_script, name) in pipelinescript_name_tuples
        )
        for performance_test in concurrent.futures.as_completed(performance_tests):
            try:
                run_details = performance_test.result()  # noqa F841
            except Exception as e:
                error = f"{e.__class__.__name__}: {str(e)}"
                print(error)


def run_single_pipeline_performance_test(pipeline_script: Path,
                                         pipeline_name: str):
    try:
        pipeline_file = compile_pipeline(pipeline_name=pipeline_name, pipeline_script=pipeline_script)
        run_id = submit_pipeline_run(pipeline_name=pipeline_name, pipeline_file=pipeline_file)
        run_details = wait_for_run_to_complete(pipeline_name=pipeline_name, run_id=run_id)
        status = run_details.run.status if run_details else "Error"
        task_details = parse_run_details(run_details)
        append_exec_times_to_output_file(pipeline_name, status, task_details)
    except Exception as e:
        error = f"{e.__class__.__name__} while testing '{pipeline_name}': {str(e)}"
        print(error)


def parse_run_details(run_details: ApiRunDetail) -> dict:
    task_details = {}

    if not run_details:
        return {}

    pipelinerun = json.loads(run_details.to_dict()["pipeline_runtime"]["workflow_manifest"])
    status = pipelinerun["status"]

    def get_details(data):
        info = {}
        total = timedelta(0)
        count = 0
        dt_fmt = "%Y-%m-%dT%H:%M:%SZ"
        for key in data.keys():
            run = data[key]
            status = run["status"]
            conditions = status["conditions"]
            state = conditions[len(conditions) - 1]['type']
            elapsed = dt.strptime(status['completionTime'], dt_fmt) - dt.strptime(status['startTime'], dt_fmt)
            info[run['pipelineTaskName']] = {
                "elapsed": elapsed,
                "status": state
            }
            count += 1
            total += elapsed
        info["count"] = count
        info["total_elapsed"] = total
        return info

    if "taskRuns" in status:
        task_details["taskRuns"] = get_details(status["taskRuns"])

    if "runs" in status:
        task_details["runs"] = get_details(status["runs"])

    return task_details


def append_exec_times_to_output_file(pipeline_name: str,
                                     status: str = "",
                                     tasks: dict = {}):
    compile_time = execution_times[pipeline_name][compile_pipeline.__name__]
    submit_time = execution_times[pipeline_name][submit_pipeline_run.__name__]
    run_time = execution_times[pipeline_name][wait_for_run_to_complete.__name__]
    taskruns = 0
    taskrun_elapsed = timedelta(0)
    runs = 0
    run_elapsed = timedelta(0)

    if "taskRuns" in tasks:
        taskruns = tasks["taskRuns"]["count"]
        taskrun_elapsed = tasks["taskRuns"]["total_elapsed"]
tasks["taskRuns"]["total_elapsed"] if "runs" in tasks: runs = tasks["runs"]["count"] run_elapsed = tasks["runs"]["total_elapsed"] taskruns_average = taskrun_elapsed / taskruns if taskruns > 0 else taskrun_elapsed runs_average = run_elapsed / runs if runs > 0 else run_elapsed non_task_average = (taskrun_elapsed + run_elapsed) / (taskruns + runs) if (taskruns + runs) > 0 \ else (taskrun_elapsed + run_elapsed) with open(OUTPUT_FILE, "a") as f: f.write(OUTPUT_SEP.join([ pipeline_name, status, str(compile_time), str(submit_time), str(run_time), str(taskruns), str(runs), str(taskrun_elapsed), str(run_elapsed), str(taskrun_elapsed + run_elapsed), str(run_time - (taskrun_elapsed + run_elapsed)), str(taskruns_average), str(runs_average), str(taskruns + runs), str(non_task_average) ])) f.write("\n") def create_output_file(): with open(OUTPUT_FILE, "w") as f: f.write(OUTPUT_SEP.join([ "Pipeline", "Status", "Compile_Time", "Submit_Time", "Run_Time", "Num_TaskRuns", "Num_Runs", "Total_TaskRun_Time", "Total_Run_Time", "Total_Time_Spent_On_Tasks", "Time_Spent_Outside_Of_Tasks", "Average_Taskrun_Time", "Average_RunCR_Time", "Total_Number_Of_Tasks", "Average_Time_Spent_Outside_Of_Tasks" ])) f.write("\n") def run_performance_tests(): create_output_file() pipeline_scripts = load_pipeline_scripts() if NUM_WORKERS == 1: # TODO: use `run_concurrently()` even with 1 worker for script, name in pipeline_scripts: run_single_pipeline_performance_test(script, name) else: run_concurrently(pipeline_scripts) if __name__ == '__main__': run_performance_tests() # client = get_client() # from kfp_server_api import ApiListExperimentsResponse, ApiExperiment, ApiListRunsResponse # experiments: ApiListExperimentsResponse = client.list_experiments() # experiment: ApiExperiment = client.create_experiment(name='PERF_TESTS') # experiment: ApiExperiment = client.get_experiment(experiment_name='PERF_TEST') # runs: ApiListRunsResponse = client.list_runs(experiment_id=experiment.id, page_size=100) # print("Experiments: " + ", ".join([e.name for e in experiments.experiments])) # print("Runs: " + ", ".join([r.name for r in runs.runs]))