fix(sdk): add default value for inputs (#7405)

* fix(sdk): add default value for inputs

* resolve merge conflict

* update release notes

* fix sample
Author: Yaqi Ji, 2022-03-23 11:59:20 -07:00 (committed by GitHub)
Parent: 09ca1c2ec2
Commit: 04123280a4
8 changed files with 524 additions and 448 deletions

View File

@@ -40,6 +40,7 @@ def verify(run: kfp_server_api.ApiRun, mlmd_connection_config, **kwargs):
{
'inputs': {
'parameters': {
'empty_message': '',
'message': 'message',
}
},
@@ -61,8 +62,8 @@ def verify(run: kfp_server_api.ApiRun, mlmd_connection_config, **kwargs):
'parameters': {
'output_bool_parameter_path': True,
'output_dict_parameter_path': {
"A": 1,
"B": 2
"A": 1.0,
"B": 2.0
},
'output_list_parameter_path': ["a", "b", "c"],
'output_parameter_path': 'message'
@@ -96,7 +97,8 @@ def verify(run: kfp_server_api.ApiRun, mlmd_connection_config, **kwargs):
"B": 2.0,
},
'input_list': ["a", "b", "c"],
'message': 'message'
'message': 'message',
'num_steps': 100.0
}
},
'name': 'train',
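A note on the hunks above: the expected dict values change from 1/2 to 1.0/2.0 (and the train task now records num_steps: 100.0 without the pipeline passing it) because parameter values round-trip through protobuf's struct Value, whose number_value field is a double. A minimal sketch of that behavior, assuming only the google.protobuf package:

from google.protobuf import json_format, struct_pb2

# Integers parsed into a protobuf Value come back as floats.
value = json_format.ParseDict({'A': 1, 'B': 2}, struct_pb2.Value())
print(json_format.MessageToDict(value))  # {'A': 1.0, 'B': 2.0}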

View File

@@ -83,6 +83,8 @@
* Depends on `typing-extensions>=3.7.4,<5; python_version<"3.9"` [\#7288](https://github.com/kubeflow/pipelines/pull/7288)
* Depends on `google-api-core>=1.31.5, >=2.3.2` [\#7377](https://github.com/kubeflow/pipelines/pull/7377)
* Fix bug that required KFP API server for `kfp components build` command to work [\#7430](https://github.com/kubeflow/pipelines/pull/7430)
* Pass default value for inputs and remove deprecated items in v1 [\#7405](https://github.com/kubeflow/pipelines/pull/7405)
## Documentation Updates
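To illustrate the "Pass default value for inputs" entry above: a default declared in a v1 component YAML now survives compilation into the v2 IR. A hedged sketch, assuming the components.load_component_from_text and compiler.Compiler APIs exercised by the tests later in this commit (the component and file names here are hypothetical):

from kfp import compiler, components, dsl

greet_op = components.load_component_from_text("""
name: greet
inputs:
- {name: location, type: String, default: 'us-central1'}
implementation:
  container:
    image: alpine
    command: [echo, {inputValue: location}]
""")

@dsl.pipeline(name='greet-pipeline')
def greet_pipeline():
    # No argument passed: the compiled spec keeps defaultValue 'us-central1'.
    greet_op()

compiler.Compiler().compile(
    pipeline_func=greet_pipeline, package_path='greet.json')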

View File

@@ -13,8 +13,7 @@
# limitations under the License.
"""KFP DSL compiler.
This is an experimental implementation of KFP compiler that compiles KFP
pipeline into Pipeline IR:
Implementation of KFP compiler that compiles KFP pipeline into Pipeline IR:
https://docs.google.com/document/d/1PUDuSQ8vmeKSBloli53mp7GIvzekaY7sggg6ywy35Dk/
"""
import collections

View File

@@ -727,6 +727,37 @@ class TestWriteToFileTypes(parameterized.TestCase):
finally:
shutil.rmtree(tmpdir)
def test_compile_pipeline_with_default_value(self):
tmpdir = tempfile.mkdtemp()
try:
producer_op = components.load_component_from_text("""
name: producer
inputs:
- {name: location, type: String, default: 'us-central1'}
- {name: name, type: Integer, default: 1}
- {name: noDefault, type: String}
implementation:
container:
image: gcr.io/my-project/my-image:tag
args:
- {inputValue: location}
""")
@dsl.pipeline(name='test-pipeline')
def simple_pipeline():
producer = producer_op(location="1")
target_json_file = os.path.join(tmpdir, 'result.json')
compiler.Compiler().compile(
pipeline_func=simple_pipeline, package_path=target_json_file)
self.assertTrue(os.path.exists(target_json_file))
with open(target_json_file, 'r') as f:
print(f.read())
pass
finally:
shutil.rmtree(tmpdir)
if __name__ == '__main__':
unittest.main()
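The new test prints the compiled JSON and only asserts that the output file exists. A sketch of a stronger check one could add inside the try block above (the comp-producer key and JSON paths are assumptions inferred from the compiled specs elsewhere in this commit):

import json

with open(target_json_file) as f:
    spec = json.load(f)
# Assumed layout: components / comp-<name> / inputDefinitions / parameters.
params = spec['components']['comp-producer']['inputDefinitions']['parameters']
assert params['location']['defaultValue'] == 'us-central1'
assert params['name']['defaultValue'] == 1.0  # integer defaults serialize as doubles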

View File

@@ -339,13 +339,17 @@ def build_component_spec_for_task(
continue
# skip inputs not present, as a workaround to support optional inputs.
if input_name not in task.inputs:
if input_name not in task.inputs and input_spec.default is None:
continue
if type_utils.is_parameter_type(input_spec.type):
component_spec.input_definitions.parameters[
input_name].parameter_type = type_utils.get_parameter_type(
input_spec.type)
if input_spec.default is not None:
component_spec.input_definitions.parameters[
input_name].default_value.CopyFrom(_to_protobuf_value(input_spec.default))
else:
component_spec.input_definitions.artifacts[
input_name].artifact_type.CopyFrom(
@@ -503,7 +507,6 @@ def build_component_spec_for_group(
input_name].parameter_type = type_utils.get_parameter_type(
channel.channel_type)
# TODO: should we fill in default value for all groups and tasks?
if is_root_group:
_fill_in_component_input_default_value(
component_spec=component_spec,
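For context, the new branch relies on a _to_protobuf_value helper to turn a plain Python default into a google.protobuf.struct_pb2.Value. A sketch of what such a helper must do (an illustration, not the committed implementation):

from google.protobuf import struct_pb2

def _to_protobuf_value(value) -> struct_pb2.Value:
    if isinstance(value, str):
        return struct_pb2.Value(string_value=value)
    if isinstance(value, bool):
        # bool before int/float: bool is a subclass of int in Python.
        return struct_pb2.Value(bool_value=value)
    if isinstance(value, (int, float)):
        # number_value is a double, which is why integer defaults
        # appear as 100.0 in the compiled specs below.
        return struct_pb2.Value(number_value=value)
    if isinstance(value, dict):
        return struct_pb2.Value(struct_value=struct_pb2.Struct(
            fields={k: _to_protobuf_value(v) for k, v in value.items()}))
    if isinstance(value, list):
        return struct_pb2.Value(list_value=struct_pb2.ListValue(
            values=[_to_protobuf_value(v) for v in value]))
    raise ValueError(f'Unsupported default value: {value!r}')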

View File

@@ -7,6 +7,11 @@ deploymentSpec:
exec-preprocess:
container:
image: python:3.7
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- preprocess
command:
- sh
- -c
@@ -51,14 +56,13 @@ deploymentSpec:
\ 'w') as f:\n f.write(json.dumps(input_dict_parameter))\n\n with\
\ open(output_list_parameter_path, 'w') as f:\n f.write(json.dumps(input_list_parameter))\n\
\n"
exec-train:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- preprocess
exec-train:
container:
image: python:3.7
- train
command:
- sh
- -c
@@ -101,40 +105,36 @@ deploymentSpec:
\ Model artifact, which has a .metadata dictionary\n # to store arbitrary\
\ metadata for the output artifact.\n model.metadata['accuracy'] = 0.9\n\
\n"
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- train
image: python:3.7
components:
comp-preprocess:
inputDefinitions:
parameters:
message:
parameterType: STRING
input_dict_parameter:
parameterType: STRUCT
message:
parameterType: STRING
input_list_parameter:
parameterType: LIST
outputDefinitions:
artifacts:
output_dataset_one:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
output_dataset_two_path:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
output_dataset_one:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
parameters:
output_parameter_path:
parameterType: STRING
output_bool_parameter_path:
parameterType: BOOLEAN
output_dict_parameter_path:
parameterType: STRUCT
output_list_parameter_path:
parameterType: LIST
output_parameter_path:
parameterType: STRING
output_dict_parameter_path:
parameterType: STRUCT
executorLabel: exec-preprocess
comp-train:
inputDefinitions:
@@ -148,12 +148,15 @@ components:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
parameters:
message:
parameterType: STRING
input_bool:
parameterType: BOOLEAN
message:
parameterType: STRING
input_dict:
parameterType: STRUCT
num_steps:
parameterType: NUMBER_INTEGER
defaultValue: 100.0
input_list:
parameterType: LIST
outputDefinitions:
@@ -175,30 +178,15 @@ root:
B: 2.0
dag:
tasks:
preprocess:
taskInfo:
name: preprocess
inputs:
parameters:
message:
componentInputParameter: message
input_dict_parameter:
componentInputParameter: input_dict
input_list_parameter:
runtimeValue:
constant:
- a
- b
- c
cachingOptions:
enableCache: true
componentRef:
name: comp-preprocess
train:
taskInfo:
name: train
inputs:
parameters:
input_list:
taskOutputParameter:
producerTask: preprocess
outputParameterKey: output_list_parameter_path
message:
taskOutputParameter:
producerTask: preprocess
@@ -211,23 +199,38 @@ root:
taskOutputParameter:
producerTask: preprocess
outputParameterKey: output_dict_parameter_path
input_list:
taskOutputParameter:
producerTask: preprocess
outputParameterKey: output_list_parameter_path
artifacts:
dataset_one_path:
taskOutputArtifact:
producerTask: preprocess
outputArtifactKey: output_dataset_one
dataset_two:
taskOutputArtifact:
producerTask: preprocess
outputArtifactKey: output_dataset_two_path
dataset_one_path:
taskOutputArtifact:
producerTask: preprocess
outputArtifactKey: output_dataset_one
dependentTasks:
- preprocess
cachingOptions:
enableCache: true
componentRef:
name: comp-train
preprocess:
taskInfo:
name: preprocess
inputs:
parameters:
input_list_parameter:
runtimeValue:
constant:
- a
- b
- c
message:
componentInputParameter: message
input_dict_parameter:
componentInputParameter: input_dict
cachingOptions:
enableCache: true
componentRef:
name: comp-preprocess
defaultPipelineRoot: dummy_root
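In the spec above, num_steps now carries defaultValue: 100.0, and the sample-test expectations at the top of this commit show the executed train task recording num_steps: 100.0 even though the pipeline never passes it. A hypothetical component signature that would compile this way, assuming the v2 @dsl.component decorator:

from kfp import dsl

@dsl.component
def train(message: str, num_steps: int = 100):
    # Hypothetical body; the Python default of 100 becomes
    # defaultValue: 100.0 (NUMBER_INTEGER defaults serialize as doubles).
    for _ in range(num_steps):
        print(message)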

View File

@@ -6,7 +6,6 @@ deploymentSpec:
executors:
exec-component-op:
container:
image: python:3.7
command:
- sh
- -c
@@ -29,6 +28,7 @@ deploymentSpec:
\ {input1}, type: {type(input1)}')\n print(f'input2: {input2}, type:\
\ {type(input2)}')\n print(f'input3: {input3}, type: {type(input3)}')\n\
\n"
image: python:3.7
args:
- --executor_input
- '{{$}}'
@@ -38,10 +38,11 @@ components:
comp-component-op:
inputDefinitions:
parameters:
input1:
parameterType: STRING
input2:
parameterType: STRING
input1:
parameterType: STRING
defaultValue: default value
executorLabel: exec-component-op
root:
dag:
@@ -51,12 +52,12 @@ root:
name: component-op
inputs:
parameters:
input1:
runtimeValue:
constant: Hello
input2:
runtimeValue:
constant: World
input1:
runtimeValue:
constant: Hello
cachingOptions:
enableCache: true
componentRef:
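This spec also shows that a default never overrides an explicit argument: input1 declares defaultValue: default value, yet the task still pins constant: Hello because the pipeline passes it. A hypothetical Python equivalent, under the same @dsl.component assumption:

from kfp import dsl

@dsl.component
def component_op(input2: str, input1: str = 'default value'):
    print(f'input1: {input1}, input2: {input2}')

@dsl.pipeline(name='v2-component')  # pipeline name is hypothetical
def my_pipeline():
    # Explicit arguments compile to runtimeValue constants; the
    # defaultValue applies only when the caller omits input1.
    component_op(input1='Hello', input2='World')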

View File

@@ -4,26 +4,62 @@ sdkVersion: kfp-2.0.0-alpha.1
schemaVersion: 2.1.0
deploymentSpec:
executors:
exec-chicago-taxi-trips-dataset:
exec-xgboost-predict-2:
container:
image: byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342
image: python:3.7
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
- --model
- '{{$.inputs.artifacts[''model''].path}}'
- --label-column-name
- '{{$.inputs.parameters[''label_column_name'']}}'
- --predictions
- '{{$.outputs.artifacts[''predictions''].path}}'
command:
- sh
- -c
- "set -e -x -o pipefail\noutput_path=\"$0\"\nselect=\"$1\"\nwhere=\"$2\"\n\
limit=\"$3\"\nformat=\"$4\"\nmkdir -p \"$(dirname \"$output_path\")\"\n\
curl --get 'https://data.cityofchicago.org/resource/wrvz-psew.'\"${format}\"\
\ \\\n --data-urlencode '$limit='\"${limit}\" \\\n --data-urlencode\
\ '$where='\"${where}\" \\\n --data-urlencode '$select='\"${select}\"\
\ \\\n | tr -d '\"' > \"$output_path\" # Removing unneeded quotes around\
\ all numbers\n"
- '{{$.outputs.artifacts[''table''].path}}'
- '{{$.inputs.parameters[''select'']}}'
- '{{$.inputs.parameters[''where'']}}'
- '{{$.inputs.parameters[''limit'']}}'
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1'
'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\
\ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\
\ file_path\n\ndef xgboost_predict(\n data_path,\n model_path,\n \
\ predictions_path,\n label_column_name = None,\n):\n '''Make predictions\
\ using a trained XGBoost model.\n\n Args:\n data_path: Path for\
\ the feature data in Apache Parquet format.\n model_path: Path for\
\ the trained model in binary XGBoost format.\n predictions_path:\
\ Output path for the predictions.\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>\n\
\ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\
\ import xgboost\n\n # Loading data\n df = pandas.read_parquet(data_path)\n\
\ if label_column_name:\n df = df.drop(columns=[label_column_name])\n\
\n evaluation_data = xgboost.DMatrix(\n data=df,\n )\n\n \
\ # Training\n model = xgboost.Booster(model_file=model_path)\n\n \
\ predictions = model.predict(evaluation_data)\n\n Path(predictions_path).parent.mkdir(parents=True,\
\ exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport\
\ argparse\n_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make\
\ predictions using a trained XGBoost model.\\n\\n Args:\\n data_path:\
\ Path for the feature data in Apache Parquet format.\\n model_path:\
\ Path for the trained model in binary XGBoost format.\\n predictions_path:\
\ Output path for the predictions.\\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\\n\\n Annotations:\\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')\n\
_parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\
model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\
--label-column-name\", dest=\"label_column_name\", type=str, required=False,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", dest=\"\
predictions_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_predict(**_parsed_args)\n"
exec-xgboost-train:
container:
image: python:3.7
command:
- sh
- -c
@@ -129,9 +165,116 @@ deploymentSpec:
- '{{$.outputs.artifacts[''model''].path}}'
- --model-config
- '{{$.outputs.artifacts[''model_config''].path}}'
exec-xgboost-predict:
image: python:3.7
exec-xgboost-predict-3:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1'
'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\
\ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\
\ file_path\n\ndef xgboost_predict(\n data_path,\n model_path,\n \
\ predictions_path,\n label_column_name = None,\n):\n '''Make predictions\
\ using a trained XGBoost model.\n\n Args:\n data_path: Path for\
\ the feature data in Apache Parquet format.\n model_path: Path for\
\ the trained model in binary XGBoost format.\n predictions_path:\
\ Output path for the predictions.\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>\n\
\ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\
\ import xgboost\n\n # Loading data\n df = pandas.read_parquet(data_path)\n\
\ if label_column_name:\n df = df.drop(columns=[label_column_name])\n\
\n evaluation_data = xgboost.DMatrix(\n data=df,\n )\n\n \
\ # Training\n model = xgboost.Booster(model_file=model_path)\n\n \
\ predictions = model.predict(evaluation_data)\n\n Path(predictions_path).parent.mkdir(parents=True,\
\ exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport\
\ argparse\n_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make\
\ predictions using a trained XGBoost model.\\n\\n Args:\\n data_path:\
\ Path for the feature data in Apache Parquet format.\\n model_path:\
\ Path for the trained model in binary XGBoost format.\\n predictions_path:\
\ Output path for the predictions.\\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\\n\\n Annotations:\\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')\n\
_parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\
model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\
--label-column-name\", dest=\"label_column_name\", type=str, required=False,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", dest=\"\
predictions_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_predict(**_parsed_args)\n"
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
- --model
- '{{$.inputs.artifacts[''model''].path}}'
- --label-column-name
- '{{$.inputs.parameters[''label_column_name'']}}'
- --predictions
- '{{$.outputs.artifacts[''predictions''].path}}'
exec-chicago-taxi-trips-dataset:
container:
command:
- sh
- -c
- "set -e -x -o pipefail\noutput_path=\"$0\"\nselect=\"$1\"\nwhere=\"$2\"\n\
limit=\"$3\"\nformat=\"$4\"\nmkdir -p \"$(dirname \"$output_path\")\"\n\
curl --get 'https://data.cityofchicago.org/resource/wrvz-psew.'\"${format}\"\
\ \\\n --data-urlencode '$limit='\"${limit}\" \\\n --data-urlencode\
\ '$where='\"${where}\" \\\n --data-urlencode '$select='\"${select}\"\
\ \\\n | tr -d '\"' > \"$output_path\" # Removing unneeded quotes around\
\ all numbers\n"
- '{{$.outputs.artifacts[''table''].path}}'
- '{{$.inputs.parameters[''select'']}}'
- '{{$.inputs.parameters[''where'']}}'
- '{{$.inputs.parameters[''limit'']}}'
image: byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342
exec-convert-csv-to-apache-parquet:
container:
image: python:3.7
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
- --output-data
- '{{$.outputs.artifacts[''output_data''].path}}'
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install
--quiet --no-warn-script-location 'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\
\ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\
\ file_path\n\ndef convert_csv_to_apache_parquet(\n data_path,\n output_data_path,\n\
):\n '''Converts CSV table to Apache Parquet.\n\n [Apache Parquet](https://parquet.apache.org/)\n\
\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>\n\
\ '''\n from pyarrow import csv, parquet\n\n table = csv.read_csv(data_path)\n\
\ parquet.write_table(table, output_data_path)\n\nimport argparse\n_parser\
\ = argparse.ArgumentParser(prog='Convert csv to apache parquet', description='Converts\
\ CSV table to Apache Parquet.\\n\\n [Apache Parquet](https://parquet.apache.org/)\\\
n\\n Annotations:\\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')\n\
_parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-data\", dest=\"\
output_data_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = convert_csv_to_apache_parquet(**_parsed_args)\n\
\n_output_serializers = [\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n\
\ try:\n os.makedirs(os.path.dirname(output_file))\n except\
\ OSError:\n pass\n with open(output_file, 'w') as f:\n \
\ f.write(_output_serializers[idx](_outputs[idx]))\n"
exec-xgboost-predict-4:
container:
command:
- sh
- -c
@@ -171,6 +314,7 @@ deploymentSpec:
_parser.add_argument(\"--predictions\", dest=\"predictions_path\", type=_make_parent_dirs_and_return_path,\
\ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_predict(**_parsed_args)\n"
image: python:3.7
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
@@ -180,45 +324,21 @@ deploymentSpec:
- '{{$.inputs.parameters[''label_column'']}}'
- --predictions
- '{{$.outputs.artifacts[''predictions''].path}}'
exec-convert-csv-to-apache-parquet:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install
--quiet --no-warn-script-location 'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\
\ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\
\ file_path\n\ndef convert_csv_to_apache_parquet(\n data_path,\n output_data_path,\n\
):\n '''Converts CSV table to Apache Parquet.\n\n [Apache Parquet](https://parquet.apache.org/)\n\
\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>\n\
\ '''\n from pyarrow import csv, parquet\n\n table = csv.read_csv(data_path)\n\
\ parquet.write_table(table, output_data_path)\n\nimport argparse\n_parser\
\ = argparse.ArgumentParser(prog='Convert csv to apache parquet', description='Converts\
\ CSV table to Apache Parquet.\\n\\n [Apache Parquet](https://parquet.apache.org/)\\\
n\\n Annotations:\\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')\n\
_parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-data\", dest=\"\
output_data_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = convert_csv_to_apache_parquet(**_parsed_args)\n\
\n_output_serializers = [\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n\
\ try:\n os.makedirs(os.path.dirname(output_file))\n except\
\ OSError:\n pass\n with open(output_file, 'w') as f:\n \
\ f.write(_output_serializers[idx](_outputs[idx]))\n"
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
- --output-data
- '{{$.outputs.artifacts[''output_data''].path}}'
exec-xgboost-train-2:
container:
image: python:3.7
args:
- --training-data
- '{{$.inputs.artifacts[''training_data''].path}}'
- --label-column-name
- '{{$.inputs.parameters[''label_column_name'']}}'
- --num-iterations
- '{{$.inputs.parameters[''num_iterations'']}}'
- --objective
- '{{$.inputs.parameters[''objective'']}}'
- --model
- '{{$.outputs.artifacts[''model''].path}}'
- --model-config
- '{{$.outputs.artifacts[''model_config''].path}}'
command:
- sh
- -c
@@ -311,130 +431,9 @@ deploymentSpec:
, dest=\"model_config_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_train(**_parsed_args)\n"
args:
- --training-data
- '{{$.inputs.artifacts[''training_data''].path}}'
- --label-column-name
- '{{$.inputs.parameters[''label_column_name'']}}'
- --num-iterations
- '{{$.inputs.parameters[''num_iterations'']}}'
- --objective
- '{{$.inputs.parameters[''objective'']}}'
- --model
- '{{$.outputs.artifacts[''model''].path}}'
- --model-config
- '{{$.outputs.artifacts[''model_config''].path}}'
exec-xgboost-predict-2:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1'
'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\
\ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\
\ file_path\n\ndef xgboost_predict(\n data_path,\n model_path,\n \
\ predictions_path,\n label_column_name = None,\n):\n '''Make predictions\
\ using a trained XGBoost model.\n\n Args:\n data_path: Path for\
\ the feature data in Apache Parquet format.\n model_path: Path for\
\ the trained model in binary XGBoost format.\n predictions_path:\
\ Output path for the predictions.\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>\n\
\ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\
\ import xgboost\n\n # Loading data\n df = pandas.read_parquet(data_path)\n\
\ if label_column_name:\n df = df.drop(columns=[label_column_name])\n\
\n evaluation_data = xgboost.DMatrix(\n data=df,\n )\n\n \
\ # Training\n model = xgboost.Booster(model_file=model_path)\n\n \
\ predictions = model.predict(evaluation_data)\n\n Path(predictions_path).parent.mkdir(parents=True,\
\ exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport\
\ argparse\n_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make\
\ predictions using a trained XGBoost model.\\n\\n Args:\\n data_path:\
\ Path for the feature data in Apache Parquet format.\\n model_path:\
\ Path for the trained model in binary XGBoost format.\\n predictions_path:\
\ Output path for the predictions.\\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\\n\\n Annotations:\\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')\n\
_parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\
model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\
--label-column-name\", dest=\"label_column_name\", type=str, required=False,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", dest=\"\
predictions_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_predict(**_parsed_args)\n"
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
- --model
- '{{$.inputs.artifacts[''model''].path}}'
- --label-column-name
- '{{$.inputs.parameters[''label_column_name'']}}'
- --predictions
- '{{$.outputs.artifacts[''predictions''].path}}'
exec-xgboost-predict-3:
exec-xgboost-predict:
container:
image: python:3.7
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1'
'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@"
- python3
- -u
- -c
- "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\
\ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\
\ file_path\n\ndef xgboost_predict(\n data_path,\n model_path,\n \
\ predictions_path,\n label_column_name = None,\n):\n '''Make predictions\
\ using a trained XGBoost model.\n\n Args:\n data_path: Path for\
\ the feature data in Apache Parquet format.\n model_path: Path for\
\ the trained model in binary XGBoost format.\n predictions_path:\
\ Output path for the predictions.\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\n\n Annotations:\n author: Alexey Volkov <alexey.volkov@ark-kun.com>\n\
\ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\
\ import xgboost\n\n # Loading data\n df = pandas.read_parquet(data_path)\n\
\ if label_column_name:\n df = df.drop(columns=[label_column_name])\n\
\n evaluation_data = xgboost.DMatrix(\n data=df,\n )\n\n \
\ # Training\n model = xgboost.Booster(model_file=model_path)\n\n \
\ predictions = model.predict(evaluation_data)\n\n Path(predictions_path).parent.mkdir(parents=True,\
\ exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport\
\ argparse\n_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make\
\ predictions using a trained XGBoost model.\\n\\n Args:\\n data_path:\
\ Path for the feature data in Apache Parquet format.\\n model_path:\
\ Path for the trained model in binary XGBoost format.\\n predictions_path:\
\ Output path for the predictions.\\n label_column_name: Optional.\
\ Name of the column containing the label data that is excluded during the\
\ prediction.\\n\\n Annotations:\\n author: Alexey Volkov <alexey.volkov@ark-kun.com>')\n\
_parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\
model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\
--label-column-name\", dest=\"label_column_name\", type=str, required=False,\
\ default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", dest=\"\
predictions_path\", type=_make_parent_dirs_and_return_path, required=True,\
\ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_predict(**_parsed_args)\n"
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
- --model
- '{{$.inputs.artifacts[''model''].path}}'
- --label-column-name
- '{{$.inputs.parameters[''label_column_name'']}}'
- --predictions
- '{{$.outputs.artifacts[''predictions''].path}}'
exec-xgboost-predict-4:
container:
image: python:3.7
command:
- sh
- -c
@@ -474,6 +473,7 @@ deploymentSpec:
_parser.add_argument(\"--predictions\", dest=\"predictions_path\", type=_make_parent_dirs_and_return_path,\
\ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\
\n_outputs = xgboost_predict(**_parsed_args)\n"
image: python:3.7
args:
- --data
- '{{$.inputs.artifacts[''data''].path}}'
@@ -487,12 +487,18 @@ components:
comp-chicago-taxi-trips-dataset:
inputDefinitions:
parameters:
where:
parameterType: STRING
limit:
parameterType: NUMBER_INTEGER
select:
parameterType: STRING
defaultValue: trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location
limit:
parameterType: NUMBER_INTEGER
defaultValue: '1000'
format:
parameterType: STRING
defaultValue: csv
where:
parameterType: STRING
defaultValue: trip_start_timestamp>="1900-01-01" AND trip_start_timestamp<"2100-01-01"
outputDefinitions:
artifacts:
table:
@@ -500,52 +506,6 @@ components:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-chicago-taxi-trips-dataset
comp-xgboost-train:
inputDefinitions:
artifacts:
training_data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column:
parameterType: NUMBER_INTEGER
num_iterations:
parameterType: NUMBER_INTEGER
objective:
parameterType: STRING
outputDefinitions:
artifacts:
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
model_config:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-train
comp-xgboost-predict:
inputDefinitions:
artifacts:
data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column:
parameterType: NUMBER_INTEGER
outputDefinitions:
artifacts:
predictions:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict
comp-convert-csv-to-apache-parquet:
inputDefinitions:
artifacts:
@@ -560,7 +520,7 @@ components:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-convert-csv-to-apache-parquet
comp-xgboost-train-2:
comp-xgboost-train:
inputDefinitions:
artifacts:
training_data:
@@ -568,65 +528,38 @@ components:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column_name:
booster:
parameterType: STRING
num_iterations:
defaultValue: gbtree
learning_rate:
parameterType: NUMBER_DOUBLE
defaultValue: '0.3'
max_depth:
parameterType: NUMBER_INTEGER
defaultValue: '6'
objective:
parameterType: STRING
defaultValue: reg:squarederror
num_iterations:
parameterType: NUMBER_INTEGER
defaultValue: '10'
label_column:
parameterType: NUMBER_INTEGER
defaultValue: '0'
min_split_loss:
parameterType: NUMBER_DOUBLE
defaultValue: '0'
outputDefinitions:
artifacts:
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
model_config:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-train-2
comp-xgboost-predict-2:
inputDefinitions:
artifacts:
data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column_name:
parameterType: STRING
outputDefinitions:
artifacts:
predictions:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict-2
comp-xgboost-predict-3:
inputDefinitions:
artifacts:
data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column_name:
parameterType: STRING
outputDefinitions:
artifacts:
predictions:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict-3
executorLabel: exec-xgboost-train
comp-xgboost-predict-4:
inputDefinitions:
artifacts:
@@ -648,53 +581,111 @@ components:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict-4
comp-xgboost-predict-2:
inputDefinitions:
artifacts:
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column_name:
parameterType: STRING
outputDefinitions:
artifacts:
predictions:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict-2
comp-xgboost-train-2:
inputDefinitions:
artifacts:
training_data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
learning_rate:
parameterType: NUMBER_DOUBLE
defaultValue: '0.3'
objective:
parameterType: STRING
defaultValue: reg:squarederror
min_split_loss:
parameterType: NUMBER_DOUBLE
defaultValue: '0'
max_depth:
parameterType: NUMBER_INTEGER
defaultValue: '6'
num_iterations:
parameterType: NUMBER_INTEGER
defaultValue: '10'
label_column_name:
parameterType: STRING
booster:
parameterType: STRING
defaultValue: gbtree
outputDefinitions:
artifacts:
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
model_config:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-train-2
comp-xgboost-predict-3:
inputDefinitions:
artifacts:
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column_name:
parameterType: STRING
outputDefinitions:
artifacts:
predictions:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict-3
comp-xgboost-predict:
inputDefinitions:
artifacts:
model:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
data:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
parameters:
label_column:
parameterType: NUMBER_INTEGER
outputDefinitions:
artifacts:
predictions:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
executorLabel: exec-xgboost-predict
root:
dag:
tasks:
chicago-taxi-trips-dataset:
taskInfo:
name: chicago-taxi-trips-dataset
inputs:
parameters:
where:
runtimeValue:
constant: trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp
< "2019-02-01"
select:
runtimeValue:
constant: tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total
limit:
runtimeValue:
constant: 10000.0
cachingOptions:
enableCache: true
componentRef:
name: comp-chicago-taxi-trips-dataset
xgboost-train:
taskInfo:
name: xgboost-train
inputs:
parameters:
label_column:
runtimeValue:
constant: 0.0
objective:
runtimeValue:
constant: reg:squarederror
num_iterations:
runtimeValue:
constant: 200.0
artifacts:
training_data:
taskOutputArtifact:
producerTask: chicago-taxi-trips-dataset
outputArtifactKey: table
dependentTasks:
- chicago-taxi-trips-dataset
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-train
xgboost-predict:
taskInfo:
name: xgboost-predict
@@ -719,6 +710,55 @@ root:
enableCache: true
componentRef:
name: comp-xgboost-predict
xgboost-predict-3:
taskInfo:
name: xgboost-predict-3
inputs:
parameters:
label_column_name:
runtimeValue:
constant: tips
artifacts:
model:
taskOutputArtifact:
producerTask: xgboost-train
outputArtifactKey: model
data:
taskOutputArtifact:
producerTask: convert-csv-to-apache-parquet
outputArtifactKey: output_data
dependentTasks:
- convert-csv-to-apache-parquet
- xgboost-train
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-predict-3
xgboost-train:
taskInfo:
name: xgboost-train
inputs:
parameters:
num_iterations:
runtimeValue:
constant: 200.0
label_column:
runtimeValue:
constant: 0.0
objective:
runtimeValue:
constant: reg:squarederror
artifacts:
training_data:
taskOutputArtifact:
producerTask: chicago-taxi-trips-dataset
outputArtifactKey: table
dependentTasks:
- chicago-taxi-trips-dataset
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-train
convert-csv-to-apache-parquet:
taskInfo:
name: convert-csv-to-apache-parquet
@@ -734,79 +774,6 @@ root:
enableCache: true
componentRef:
name: comp-convert-csv-to-apache-parquet
xgboost-train-2:
taskInfo:
name: xgboost-train-2
inputs:
parameters:
label_column_name:
runtimeValue:
constant: tips
objective:
runtimeValue:
constant: reg:squarederror
num_iterations:
runtimeValue:
constant: 200.0
artifacts:
training_data:
taskOutputArtifact:
producerTask: convert-csv-to-apache-parquet
outputArtifactKey: output_data
dependentTasks:
- convert-csv-to-apache-parquet
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-train-2
xgboost-predict-2:
taskInfo:
name: xgboost-predict-2
inputs:
parameters:
label_column_name:
runtimeValue:
constant: tips
artifacts:
data:
taskOutputArtifact:
producerTask: convert-csv-to-apache-parquet
outputArtifactKey: output_data
model:
taskOutputArtifact:
producerTask: xgboost-train-2
outputArtifactKey: model
dependentTasks:
- convert-csv-to-apache-parquet
- xgboost-train-2
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-predict-2
xgboost-predict-3:
taskInfo:
name: xgboost-predict-3
inputs:
parameters:
label_column_name:
runtimeValue:
constant: tips
artifacts:
data:
taskOutputArtifact:
producerTask: convert-csv-to-apache-parquet
outputArtifactKey: output_data
model:
taskOutputArtifact:
producerTask: xgboost-train
outputArtifactKey: model
dependentTasks:
- convert-csv-to-apache-parquet
- xgboost-train
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-predict-3
xgboost-predict-4:
taskInfo:
name: xgboost-predict-4
@@ -831,4 +798,72 @@ root:
enableCache: true
componentRef:
name: comp-xgboost-predict-4
chicago-taxi-trips-dataset:
taskInfo:
name: chicago-taxi-trips-dataset
inputs:
parameters:
limit:
runtimeValue:
constant: 10000.0
where:
runtimeValue:
constant: trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp
< "2019-02-01"
select:
runtimeValue:
constant: tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total
cachingOptions:
enableCache: true
componentRef:
name: comp-chicago-taxi-trips-dataset
xgboost-train-2:
taskInfo:
name: xgboost-train-2
inputs:
parameters:
objective:
runtimeValue:
constant: reg:squarederror
num_iterations:
runtimeValue:
constant: 200.0
label_column_name:
runtimeValue:
constant: tips
artifacts:
training_data:
taskOutputArtifact:
producerTask: convert-csv-to-apache-parquet
outputArtifactKey: output_data
dependentTasks:
- convert-csv-to-apache-parquet
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-train-2
xgboost-predict-2:
taskInfo:
name: xgboost-predict-2
inputs:
parameters:
label_column_name:
runtimeValue:
constant: tips
artifacts:
model:
taskOutputArtifact:
producerTask: xgboost-train-2
outputArtifactKey: model
data:
taskOutputArtifact:
producerTask: convert-csv-to-apache-parquet
outputArtifactKey: output_data
dependentTasks:
- convert-csv-to-apache-parquet
- xgboost-train-2
cachingOptions:
enableCache: true
componentRef:
name: comp-xgboost-predict-2
defaultPipelineRoot: dummy_root
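One quirk worth noting in this spec: defaults that came from v1 component YAML (defaultValue: '1000', defaultValue: '0') render as quoted strings, while Python-derived defaults elsewhere render as bare numbers (100.0). That is consistent with the _to_protobuf_value sketch earlier: v1 YAML defaults arrive as strings and pass through as string_value. A small demonstration, assuming only google.protobuf:

from google.protobuf import json_format, struct_pb2

# A string default stays a string and renders quoted in YAML/JSON...
print(repr(json_format.MessageToDict(struct_pb2.Value(string_value='1000'))))  # '1000'
# ...while a numeric default becomes number_value and renders as a bare double.
print(repr(json_format.MessageToDict(struct_pb2.Value(number_value=100))))  # 100.0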