# Copyright 2020 kubeflow.org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp.dsl as dsl
import json
import kfp.components as comp
from collections import OrderedDict
from kubernetes import client as k8s_client


def loaddata():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
path = "data/"

PREDICTION_LABEL = 'Survived'

test_df = pd.read_csv(path + "test.csv")
train_df = pd.read_csv(path + "train.csv")
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(PREDICTION_LABEL, "PREDICTION_LABEL")
_kale_marshal_utils.save(test_df, "test_df")
_kale_marshal_utils.save(train_df, "train_df")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (block1,
              block2,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/loaddata.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('loaddata')

    _kale_mlmd_utils.call("mark_execution_complete")
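

# NOTE: every step below follows the same Kale-generated pattern: re-declare
# the notebook imports (each step runs in its own container with a fresh
# Jupyter kernel), load inputs from the shared /marshal volume, execute the
# original notebook cells through run_code, write the rendered cell output to
# an HTML artifact for the KFP UI, and save outputs back to /marshal for the
# downstream steps.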


def datapreprocessing():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
test_df = _kale_marshal_utils.load("test_df")
train_df = _kale_marshal_utils.load("train_df")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
    dataset['not_alone'] = dataset['not_alone'].astype(int)
train_df['not_alone'].value_counts()
'''

    block3 = '''
# This does not contribute to a person's survival probability
train_df = train_df.drop(['PassengerId'], axis=1)
'''

    block4 = '''
import re
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(
        lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)

# we can now drop the cabin feature
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)
'''

    block5 = '''
data = [train_df, test_df]

for dataset in data:
    mean = train_df["Age"].mean()
    std = test_df["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size=is_null)
    # fill NaN values in Age column with random values generated
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = dataset["Age"].astype(int)
train_df["Age"].isnull().sum()
'''

    block6 = '''
train_df['Embarked'].describe()
'''

    block7 = '''
# fill with most common value
common_value = 'S'
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
'''

    block8 = '''
train_df.info()
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(test_df, "test_df")
_kale_marshal_utils.save(train_df, "train_df")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              block3,
              block4,
              block5,
              block6,
              block7,
              block8,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/datapreprocessing.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('datapreprocessing')

    _kale_mlmd_utils.call("mark_execution_complete")
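
# The string names passed to _kale_marshal_utils.save() form the contract
# between steps: featureengineering below reloads "test_df" and "train_df"
# (plus the "PREDICTION_LABEL" saved by loaddata) under exactly these names
# from the shared /marshal volume.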


def featureengineering():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
PREDICTION_LABEL = _kale_marshal_utils.load("PREDICTION_LABEL")
test_df = _kale_marshal_utils.load("test_df")
train_df = _kale_marshal_utils.load("train_df")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)
'''

    block3 = '''
data = [train_df, test_df]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # extract titles
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',\\
                                                 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # convert titles into numbers
    dataset['Title'] = dataset['Title'].map(titles)
    # filling NaN with 0, to be safe
    dataset['Title'] = dataset['Title'].fillna(0)
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
'''

    block4 = '''
genders = {"male": 0, "female": 1}
data = [train_df, test_df]

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)
'''

    block5 = '''
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)
'''

    block6 = '''
ports = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)
'''

    block7 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6

# let's see how it's distributed
train_df['Age'].value_counts()
'''

    block8 = '''
data = [train_df, test_df]

for dataset in data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99), 'Fare'] = 3
    dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250), 'Fare'] = 4
    dataset.loc[ dataset['Fare'] > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)
'''

    block9 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']
'''

    block10 = '''
for dataset in data:
    dataset['Fare_Per_Person'] = dataset['Fare'] / (dataset['relatives'] + 1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)

# Let's take a last look at the training set, before we start training the models.
train_df.head(10)
'''

    block11 = '''
train_labels = train_df[PREDICTION_LABEL]
train_df = train_df.drop(PREDICTION_LABEL, axis=1)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(train_df, "train_df")
_kale_marshal_utils.save(train_labels, "train_labels")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              block3,
              block4,
              block5,
              block6,
              block7,
              block8,
              block9,
              block10,
              block11,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/featureengineering.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('featureengineering')

    _kale_mlmd_utils.call("mark_execution_complete")
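
# featureengineering is the last shared step: the five model steps below all
# load the same "train_df" and "train_labels" it saved, and the pipeline
# definition wires each of them to run directly after featureengineering_task.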


def decisiontree():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
train_df = _kale_marshal_utils.load("train_df")
train_labels = _kale_marshal_utils.load("train_labels")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_df, train_labels)

acc_decision_tree = round(decision_tree.score(train_df, train_labels) * 100, 2)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(acc_decision_tree, "acc_decision_tree")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/decisiontree.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('decisiontree')

    _kale_mlmd_utils.call("mark_execution_complete")
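
# As in the original notebook, each model step reports accuracy on the data
# it was trained on (model.score(train_df, train_labels)), so the scores
# collected in the results step measure training fit, not held-out
# performance.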


def svm():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
train_df = _kale_marshal_utils.load("train_df")
train_labels = _kale_marshal_utils.load("train_labels")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
linear_svc = SVC(gamma='auto')
linear_svc.fit(train_df, train_labels)

acc_linear_svc = round(linear_svc.score(train_df, train_labels) * 100, 2)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(acc_linear_svc, "acc_linear_svc")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/svm.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('svm')

    _kale_mlmd_utils.call("mark_execution_complete")


def naivebayes():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
train_df = _kale_marshal_utils.load("train_df")
train_labels = _kale_marshal_utils.load("train_labels")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
gaussian = GaussianNB()
gaussian.fit(train_df, train_labels)

acc_gaussian = round(gaussian.score(train_df, train_labels) * 100, 2)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(acc_gaussian, "acc_gaussian")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/naivebayes.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('naivebayes')

    _kale_mlmd_utils.call("mark_execution_complete")


def logisticregression():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
train_df = _kale_marshal_utils.load("train_df")
train_labels = _kale_marshal_utils.load("train_labels")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
logreg = LogisticRegression(solver='lbfgs', max_iter=110)
logreg.fit(train_df, train_labels)

acc_log = round(logreg.score(train_df, train_labels) * 100, 2)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(acc_log, "acc_log")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/logisticregression.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('logisticregression')

    _kale_mlmd_utils.call("mark_execution_complete")


def randomforest():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
train_df = _kale_marshal_utils.load("train_df")
train_labels = _kale_marshal_utils.load("train_labels")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_df, train_labels)

acc_random_forest = round(random_forest.score(train_df, train_labels) * 100, 2)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(acc_random_forest, "acc_random_forest")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              data_saving_block)
    html_artifact = _kale_run_code(blocks)
    with open("/randomforest.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('randomforest')

    _kale_mlmd_utils.call("mark_execution_complete")


def results():
    from kale.utils import mlmd_utils as _kale_mlmd_utils
    _kale_mlmd_utils.init_metadata()

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
acc_decision_tree = _kale_marshal_utils.load("acc_decision_tree")
acc_gaussian = _kale_marshal_utils.load("acc_gaussian")
acc_linear_svc = _kale_marshal_utils.load("acc_linear_svc")
acc_log = _kale_marshal_utils.load("acc_log")
acc_random_forest = _kale_marshal_utils.load("acc_random_forest")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'Logistic Regression', 'Random Forest',
              'Naive Bayes', 'Decision Tree'],
    'Score': [acc_linear_svc, acc_log, acc_random_forest,
              acc_gaussian, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
print(result_df)
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.kfp_utils import \
        update_uimetadata as _kale_update_uimetadata
    blocks = (data_loading_block,
              block1,
              block2,
              )
    html_artifact = _kale_run_code(blocks)
    with open("/results.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('results')

    _kale_mlmd_utils.call("mark_execution_complete")


loaddata_op = comp.func_to_container_op(loaddata)


datapreprocessing_op = comp.func_to_container_op(datapreprocessing)


featureengineering_op = comp.func_to_container_op(featureengineering)


decisiontree_op = comp.func_to_container_op(decisiontree)


svm_op = comp.func_to_container_op(svm)


naivebayes_op = comp.func_to_container_op(naivebayes)


logisticregression_op = comp.func_to_container_op(logisticregression)


randomforest_op = comp.func_to_container_op(randomforest)


results_op = comp.func_to_container_op(results)
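
# comp.func_to_container_op converts each function above into a factory for
# KFP ContainerOps; this is why every function re-imports its dependencies
# inside its own body instead of relying on this module's top-level imports.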


@dsl.pipeline(
    name='titanic-ml-gxj28',
    description='Predict which passengers survived the Titanic shipwreck'
)
def auto_generated_pipeline():
    pvolumes_dict = OrderedDict()
    volume_step_names = []
    volume_name_parameters = []

    marshal_vop = dsl.VolumeOp(
        name="kale-marshal-volume",
        resource_name="kale-marshal-pvc",
        modes=dsl.VOLUME_MODE_RWM,
        size="1Gi"
    )
    volume_step_names.append(marshal_vop.name)
    volume_name_parameters.append(marshal_vop.outputs["name"].full_name)
    pvolumes_dict['/marshal'] = marshal_vop.volume

    volume_step_names.sort()
    volume_name_parameters.sort()

    loaddata_task = loaddata_op()\
        .add_pvolumes(pvolumes_dict)\
        .after()
    loaddata_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    loaddata_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'loaddata': '/loaddata.html'})
    loaddata_task.output_artifact_paths.update(output_artifacts)
    loaddata_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = loaddata_task.dependent_names + volume_step_names
    loaddata_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        loaddata_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    datapreprocessing_task = datapreprocessing_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(loaddata_task)
    datapreprocessing_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    datapreprocessing_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'datapreprocessing': '/datapreprocessing.html'})
    datapreprocessing_task.output_artifact_paths.update(output_artifacts)
    datapreprocessing_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = datapreprocessing_task.dependent_names + volume_step_names
    datapreprocessing_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        datapreprocessing_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    featureengineering_task = featureengineering_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(datapreprocessing_task)
    featureengineering_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    featureengineering_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'featureengineering': '/featureengineering.html'})
    featureengineering_task.output_artifact_paths.update(output_artifacts)
    featureengineering_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = featureengineering_task.dependent_names + volume_step_names
    featureengineering_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        featureengineering_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))
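
    # The five model tasks below all depend only on featureengineering_task,
    # so the workflow engine is free to schedule them concurrently.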

    decisiontree_task = decisiontree_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    decisiontree_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    decisiontree_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'decisiontree': '/decisiontree.html'})
    decisiontree_task.output_artifact_paths.update(output_artifacts)
    decisiontree_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = decisiontree_task.dependent_names + volume_step_names
    decisiontree_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        decisiontree_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    svm_task = svm_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    svm_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    svm_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'svm': '/svm.html'})
    svm_task.output_artifact_paths.update(output_artifacts)
    svm_task.add_pod_label("pipelines.kubeflow.org/metadata_written", "true")
    dep_names = svm_task.dependent_names + volume_step_names
    svm_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        svm_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    naivebayes_task = naivebayes_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    naivebayes_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    naivebayes_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'naivebayes': '/naivebayes.html'})
    naivebayes_task.output_artifact_paths.update(output_artifacts)
    naivebayes_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = naivebayes_task.dependent_names + volume_step_names
    naivebayes_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        naivebayes_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    logisticregression_task = logisticregression_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    logisticregression_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    logisticregression_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'logisticregression': '/logisticregression.html'})
    logisticregression_task.output_artifact_paths.update(output_artifacts)
    logisticregression_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = logisticregression_task.dependent_names + volume_step_names
    logisticregression_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        logisticregression_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    randomforest_task = randomforest_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    randomforest_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    randomforest_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'randomforest': '/randomforest.html'})
    randomforest_task.output_artifact_paths.update(output_artifacts)
    randomforest_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = randomforest_task.dependent_names + volume_step_names
    randomforest_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        randomforest_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))

    results_task = results_op()\
        .add_pvolumes(pvolumes_dict)\
        .after(randomforest_task, logisticregression_task,
               naivebayes_task, svm_task, decisiontree_task)
    results_task.container.working_dir = "/Users/animeshsingh/go/src/github.com/kubeflow/kale/examples/titanic-ml-dataset"
    results_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    output_artifacts = {}
    output_artifacts.update(
        {'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json'})
    output_artifacts.update({'results': '/results.html'})
    results_task.output_artifact_paths.update(output_artifacts)
    results_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    dep_names = results_task.dependent_names + volume_step_names
    results_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(dep_names))
    if volume_name_parameters:
        results_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(volume_name_parameters))


if __name__ == "__main__":
    pipeline_func = auto_generated_pipeline
    pipeline_filename = pipeline_func.__name__ + '.pipeline.tar.gz'
    import kfp.compiler as compiler
    compiler.Compiler().compile(pipeline_func, pipeline_filename)

    # Get or create an experiment and submit a pipeline run
    import kfp
    client = kfp.Client()
    experiment = client.create_experiment('titanic')

    # Submit a pipeline run
    from kale.utils.kfp_utils import generate_run_name
    run_name = generate_run_name('titanic-ml-gxj28')
    run_result = client.run_pipeline(
        experiment.id, run_name, pipeline_filename, {})
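
    # Minimal sketch of submitting the compiled archive to an explicit KFP
    # endpoint rather than the in-cluster default; the host URL and run name
    # below are assumptions for illustration, not defined by this example:
    #
    #     client = kfp.Client(host="http://localhost:8080")  # hypothetical endpoint
    #     experiment = client.create_experiment('titanic')
    #     client.run_pipeline(experiment.id, 'titanic-run',
    #                         pipeline_filename, {})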