# pipelines/components/gcp/automl/split_dataset_table_column_.../component.yaml

# Component interface: looks up the column specs of one table of an AutoML
# dataset and splits them into the target column and all remaining columns.
name: Automl split dataset table column names
inputs:
# Dataset identifier handed to `list_table_specs` by the inline program.
- name: dataset_path
  type: String
# Display name of the column to treat as the target.
- name: target_column_name
  type: String
# Index of the table spec to use among those listed for the dataset.
- name: table_index
  type: Integer
  default: '0'
  optional: true
outputs:
# `name` of the column spec whose display name equals target_column_name.
- name: target_column_path
  type: String
# JSON array holding the `name` of every other column spec.
- name: feature_column_paths
  type: JsonArray
implementation:
  container:
    image: python:3.7
    # The whole program is passed inline to `python3 -c`; `-u` keeps
    # stdout/stderr unbuffered.
    command:
    - python3
    - -u
    - -c
    - |
      from typing import NamedTuple

      def automl_split_dataset_table_column_names(
          dataset_path: str,
          target_column_name: str,
          table_index: int = 0,
      ) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
          """Split the columns of one AutoML dataset table into target and features.

          Args:
              dataset_path: Dataset identifier passed to `list_table_specs`.
              target_column_name: Display name of the column to treat as the target.
              table_index: Position of the table spec to use within the table specs
                  listed for the dataset. Defaults to the first one.

          Returns:
              A 2-tuple `(target_column_path, feature_column_paths)`: the `name` of
              the column spec whose display name equals `target_column_name`, and a
              JSON-encoded array (a string) with the `name` of every other column
              spec.

          Raises:
              IndexError: if `table_index` is out of range for the listed table specs.
              ValueError: if no column spec has display name `target_column_name`.
              subprocess.CalledProcessError: if installing the client library fails.
          """
          import json
          import os
          import subprocess
          import sys

          # The client library is installed at run time. The pip setting is layered
          # on top of the inherited environment: passing a bare dict as `env` would
          # drop PATH, HOME and any proxy / CA configuration pip needs.
          pip_env = dict(os.environ, PIP_DISABLE_PIP_VERSION_CHECK='1')
          subprocess.run(
              [sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'],
              env=pip_env,
              check=True,
          )

          from google.cloud import automl
          client = automl.AutoMlClient()

          # Materialize the listing response so it can be printed and indexed.
          table_specs = list(client.list_table_specs(dataset_path))
          print('table_specs=')
          print(table_specs)
          try:
              table_spec_name = table_specs[table_index].name
          except IndexError:
              raise IndexError(
                  'table_index {} is out of range: {} table spec(s) listed for dataset {!r}'.format(
                      table_index, len(table_specs), dataset_path)
              ) from None

          column_specs = list(client.list_column_specs(table_spec_name))
          print('column_specs=')
          print(column_specs)

          # First column spec with a matching display name wins.
          target_column_spec = next(
              (s for s in column_specs if s.display_name == target_column_name),
              None,
          )
          if target_column_spec is None:
              raise ValueError(
                  'No column with display name {!r} in table spec {!r}; available: {}'.format(
                      target_column_name,
                      table_spec_name,
                      [s.display_name for s in column_specs],
                  )
              )
          feature_column_names = [
              s.name for s in column_specs if s.display_name != target_column_name
          ]

          # Serialized to JSON text so that the value written to the output file is
          # a valid JSON array rather than a Python list repr.
          return (target_column_spec.name, json.dumps(feature_column_names))

      # Command-line entry point: parse the component arguments, run the
      # function, and write each returned value to its output file.
      import argparse
      import os

      # Sentinel default: lets arguments that were not supplied be dropped below,
      # so the function's own defaults apply.
      _missing_arg = object()
      _parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
      _parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
      _parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
      _parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
      # argparse stores None when ----output-paths is omitted; normalize that to
      # an empty list so the write loop below is simply skipped.
      _output_files = _parsed_args.pop("_output_paths", None) or []

      _outputs = automl_split_dataset_table_column_names(**_parsed_args)

      # Wrap a single (non-indexable or string) result so it can be indexed like
      # a tuple of outputs.
      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
          _outputs = [_outputs]

      for idx, output_file in enumerate(_output_files):
          # A bare file name has no directory component to create.
          output_dir = os.path.dirname(output_file)
          if output_dir:
              os.makedirs(output_dir, exist_ok=True)
          with open(output_file, 'w') as f:
              f.write(str(_outputs[idx]))
    args:
    # These arguments follow the inline program and are parsed by its
    # argparse section.
    - --dataset-path
    - inputValue: dataset_path
    - --target-column-name
    - inputValue: target_column_name
    # --table-index is only passed when the optional input was supplied.
    - if:
        cond:
          isPresent: table_index
        then:
        - --table-index
        - inputValue: table_index
    # Four leading dashes match the "----output-paths" option declared by the
    # inline program; it takes exactly two paths, in output order.
    - '----output-paths'
    - outputPath: target_column_path
    - outputPath: feature_column_paths