pipelines/components/gcp/automl/split_dataset_table_column_.../component.yaml

92 lines
3.3 KiB
YAML

# Component interface: two required string inputs, one optional integer input,
# and two outputs produced by the inline Python program.
name: Automl split dataset table column names
inputs:
- name: dataset_path
  type: String
- name: target_column_name
  type: String
# Optional; the default is written as a string and converted by the program's
# `--table-index` argument (declared with type=int).
- name: table_index
  type: Integer
  default: '0'
  optional: true
outputs:
- name: target_column_path
  type: String
# Written by the program as a JSON-encoded array.
- name: feature_column_paths
  type: JsonArray
# Container implementation: runs the inline Python program that follows the
# `- |` entry inside a python:3.7 image.
# NOTE(review): nesting indentation appears to have been stripped from this
# file; `container` is presumably a child of `implementation`, and `image` /
# `command` children of `container` - confirm against the original file.
implementation:
container:
image: python:3.7
command:
# python3 -u -c <program>: -u requests unbuffered stdout/stderr, -c executes
# the program text passed as the next argument.
- python3
- -u
- -c
# Literal block scalar: the Python source that follows is the -c argument.
- |
from typing import NamedTuple
def automl_split_dataset_table_column_names(
    dataset_path: str,
    target_column_name: str,
    table_index: int = 0,
) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
    """Split the column specs of one dataset table into target and features.

    Lists the table specs of ``dataset_path``, selects the one at
    ``table_index``, lists its column specs and partitions them by display
    name.

    Args:
        dataset_path: Value passed to ``AutoMlClient.list_table_specs``.
        target_column_name: Display name of the column to use as the target.
        table_index: Position of the table spec to use in the listing
            (default 0; negative indices count from the end).

    Returns:
        A 2-tuple ``(target_column_path, feature_column_paths)``:
        ``target_column_path`` is the ``name`` of the first column spec whose
        display name equals ``target_column_name``; ``feature_column_paths``
        is a JSON-encoded array (a ``str``, despite the ``list`` annotation)
        with the ``name`` of every column spec that has a different display
        name.

    Raises:
        subprocess.CalledProcessError: If installing the client library fails.
        IndexError: If ``table_index`` is out of range or no column has the
            display name ``target_column_name``.
    """
    import json
    import os
    import subprocess
    import sys

    # Install the client library at run time. The pip flag is merged into the
    # inherited environment instead of replacing it, so proxy, CA-bundle and
    # pip index settings of the container still reach the pip subprocess.
    pip_env = dict(os.environ, PIP_DISABLE_PIP_VERSION_CHECK='1')
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'],
        env=pip_env,
        check=True,
    )
    # Imported only after the install above has made the package available.
    from google.cloud import automl

    client = automl.AutoMlClient()

    table_specs = list(client.list_table_specs(dataset_path))
    print('table_specs=')
    print(table_specs)
    try:
        table_spec = table_specs[table_index]
    except IndexError:
        # Same exception type as the bare lookup, but with an actionable message.
        raise IndexError(
            'table_index {} is out of range: dataset {!r} has {} table spec(s)'.format(
                table_index, dataset_path, len(table_specs)
            )
        ) from None
    table_spec_name = table_spec.name

    column_specs = list(client.list_column_specs(table_spec_name))
    print('column_specs=')
    print(column_specs)

    # First match wins, as with the original `[...][0]` lookup.
    target_column_spec = next(
        (s for s in column_specs if s.display_name == target_column_name),
        None,
    )
    if target_column_spec is None:
        # IndexError is kept so the failure type matches the original lookup.
        raise IndexError(
            'target column {!r} not found in table spec {!r}; available columns: {}'.format(
                target_column_name,
                table_spec_name,
                [s.display_name for s in column_specs],
            )
        )

    feature_column_names = [
        s.name for s in column_specs if s.display_name != target_column_name
    ]
    return (target_column_spec.name, json.dumps(feature_column_names))
# Command-line driver: parses the container arguments, calls
# automl_split_dataset_table_column_names and writes each returned value to
# the corresponding output file.
import argparse

# Sentinel default: distinguishes "flag not given" from any real value, so an
# omitted flag is dropped and the function's own default applies.
_missing_arg = object()
_parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
_parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
_parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
_parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}

# argparse stores None when ----output-paths is omitted; fall back to an empty
# list so the write loop below is skipped instead of failing on enumerate(None).
_output_files = _parsed_args.pop("_output_paths", None) or []

_outputs = automl_split_dataset_table_column_names(**_parsed_args)
# Wrap a single (non-indexable or string) result so it can be indexed below.
if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
    _outputs = [_outputs]

import os
for idx, output_file in enumerate(_output_files):
    # Create the parent directory if needed. An already-existing directory is
    # fine; any other failure is reported here rather than silently ignored.
    # A bare file name has no directory part to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w') as f:
        f.write(str(_outputs[idx]))
# Arguments appended to the command; they map onto the argparse flags declared
# by the inline program.
# NOTE(review): in the Kubeflow Pipelines component spec `args` is normally a
# sibling of `command` under implementation.container - re-nest it there when
# the surrounding indentation is restored.
args:
- --dataset-path
- inputValue: dataset_path
- --target-column-name
- inputValue: target_column_name
# Pass --table-index only when the optional input was supplied; otherwise the
# program falls back to its own default.
- if:
    cond:
      isPresent: table_index
    then:
    - --table-index
    - inputValue: table_index
# One path per declared output, in the order the program returns its values.
- '----output-paths'
- outputPath: target_column_path
- outputPath: feature_column_paths