{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: kfp in /home/jovyan/.local/lib/python3.6/site-packages (1.8.11)\n",
"Collecting kfp\n",
" Using cached kfp-1.8.12.tar.gz (301 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: absl-py<2,>=0.9 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.11.0)\n",
"Requirement already satisfied: PyYAML<6,>=5.3 in /usr/local/lib/python3.6/dist-packages (from kfp) (5.4.1)\n",
"Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.31.5)\n",
"Requirement already satisfied: google-cloud-storage<2,>=1.20.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.44.0)\n",
"Requirement already satisfied: kubernetes<19,>=8.0.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (12.0.1)\n",
"Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.12.8)\n",
"Requirement already satisfied: google-auth<2,>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.35.0)\n",
"Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.9.1)\n",
"Requirement already satisfied: cloudpickle<3,>=2.0.0 in /home/jovyan/.local/lib/python3.6/site-packages (from kfp) (2.0.0)\n",
"Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.7.1)\n",
"Requirement already satisfied: jsonschema<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
"Requirement already satisfied: tabulate<1,>=0.8.6 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.8.9)\n",
"Requirement already satisfied: click<9,>=7.1.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (7.1.2)\n",
"Requirement already satisfied: Deprecated<2,>=1.2.7 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.2.13)\n",
"Requirement already satisfied: strip-hints<1,>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.1.10)\n",
"Requirement already satisfied: docstring-parser<1,>=0.7.3 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.13)\n",
"Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.13 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.1.13)\n",
"Requirement already satisfied: protobuf<4,>=3.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.19.3)\n",
"Requirement already satisfied: fire<1,>=0.3.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.4.0)\n",
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from kfp) (0.8)\n",
"Requirement already satisfied: uritemplate<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.0.1)\n",
"Requirement already satisfied: typer<1.0,>=0.3.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.4.0)\n",
"Requirement already satisfied: typing-extensions<4,>=3.7.4 in /home/jovyan/.local/lib/python3.6/site-packages (from kfp) (3.7.4.3)\n",
"Requirement already satisfied: pydantic<2,>=1.8.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.9.0)\n",
"Requirement already satisfied: six in /home/jovyan/.local/lib/python3.6/site-packages (from absl-py<2,>=0.9->kfp) (1.15.0)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /home/jovyan/.local/lib/python3.6/site-packages (from Deprecated<2,>=1.2.7->kfp) (1.12.1)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from fire<1,>=0.3.1->kfp) (1.1.0)\n",
"Requirement already satisfied: google-auth-httplib2>=0.0.3 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client<2,>=1.7.8->kfp) (0.1.0)\n",
"Requirement already satisfied: httplib2<1dev,>=0.15.0 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client<2,>=1.7.8->kfp) (0.20.2)\n",
"Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (59.6.0)\n",
"Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (4.2.4)\n",
"Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (4.8)\n",
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (0.2.8)\n",
"Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.27.1)\n",
"Requirement already satisfied: google-cloud-core<3.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.2.1)\n",
"Requirement already satisfied: google-resumable-media<3.0dev,>=1.3.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.1.0)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (4.8.3)\n",
"Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (0.18.0)\n",
"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (20.3.0)\n",
"Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (2.8.2)\n",
"Requirement already satisfied: urllib3>=1.15 in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (1.26.8)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (2021.10.8)\n",
"Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<19,>=8.0.0->kfp) (1.2.3)\n",
"Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<19,>=8.0.0->kfp) (1.3.0)\n",
"Requirement already satisfied: wheel in /home/jovyan/.local/lib/python3.6/site-packages (from strip-hints<1,>=0.1.8->kfp) (0.37.1)\n",
"Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (2021.3)\n",
"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (1.54.0)\n",
"Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (21.3)\n",
"Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.6/dist-packages (from google-resumable-media<3.0dev,>=1.3.0->google-cloud-storage<2,>=1.20.0->kfp) (1.3.0)\n",
"Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/local/lib/python3.6/dist-packages (from httplib2<1dev,>=0.15.0->google-api-python-client<2,>=1.7.8->kfp) (3.0.6)\n",
"Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.1->kfp) (0.4.8)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage<2,>=1.20.0->kfp) (3.3)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage<2,>=1.20.0->kfp) (2.0.10)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata->jsonschema<4,>=3.0.1->kfp) (3.6.0)\n",
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<19,>=8.0.0->kfp) (3.1.1)\n"
]
}
],
"source": [
"!pip install kfp --upgrade "
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Import kubeflow pipeline libraries\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import kfp\n",
"import kfp.components as comp\n",
"import kfp.dsl as dsl\n",
"from kfp.components import InputPath, OutputPath"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"## Download Dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#Download data\n",
"def download_data(download_link: str, data_path: OutputPath(str)):\n",
" import zipfile\n",
" import wget\n",
" import os\n",
"\n",
" if not os.path.exists(data_path):\n",
" os.makedirs(data_path)\n",
"\n",
" # download files\n",
" wget.download(download_link.format(file='train'), f'{data_path}/train_csv.zip')\n",
" wget.download(download_link.format(file='test'), f'{data_path}/test_csv.zip')\n",
" \n",
" with zipfile.ZipFile(f\"{data_path}/train_csv.zip\",\"r\") as zip_ref:\n",
" zip_ref.extractall(data_path)\n",
" \n",
" with zipfile.ZipFile(f\"{data_path}/test_csv.zip\",\"r\") as zip_ref:\n",
" zip_ref.extractall(data_path)\n",
" \n",
" return(print('Done!'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"download_op = kfp.components.create_component_from_func(download_data,base_image=\"python:3.7\", packages_to_install=['wget', 'zipfile36'])"
]
},
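{
"cell_type": "markdown",
"metadata": {},
"source": [
"The component above expects `download_link` to contain a `{file}` placeholder that is expanded once for the train archive and once for the test archive. A quick illustration with a hypothetical URL (the real link is set near the end of the notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: how the {file} placeholder in download_link expands.\n",
"# The URL below is a hypothetical pattern, not the real data location.\n",
"link = 'https://example.com/data/{file}.csv.zip?raw=true'\n",
"print(link.format(file='train'))  # .../train.csv.zip?raw=true\n",
"print(link.format(file='test'))   # .../test.csv.zip?raw=true"
]
},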
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Load Data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def load_data(data_path:comp.InputPath(str),load_data_path: comp.OutputPath()):\n",
" import pandas as pd\n",
" import os, pickle\n",
" \n",
" train_data_path = data_path + '/train.csv'\n",
" test_data_path = data_path + '/test.csv'\n",
" tweet_df= pd.read_csv(train_data_path)\n",
" test_df=pd.read_csv(test_data_path)\n",
" df=pd.concat([tweet_df,test_df])\n",
" \n",
" #creating the preprocess directory\n",
" os.makedirs(load_data_path, exist_ok = True)\n",
" \n",
" # join train and test together\n",
" ntrain = tweet_df.shape[0]\n",
" ntest = test_df.shape[0]\n",
" with open(f'{load_data_path}/df', 'wb') as f:\n",
" pickle.dump((ntrain, df, tweet_df), f)\n",
" return(print('Done!'))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"load_data_step = kfp.components.create_component_from_func(\n",
" func=load_data,\n",
" output_component_file='load_data_component.yaml', # This is optional. It saves the component spec for future use.\n",
" base_image='python:3.7',\n",
" packages_to_install=['pandas','pickle5'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_data(load_data_path:comp.InputPath(str), preprocess_data_path:comp.OutputPath(str)):\n",
" \n",
" import re\n",
" import pandas as pd\n",
" import os, pickle\n",
" import string\n",
" \n",
" #loading the train data\n",
" with open(f'{load_data_path}/df', 'rb') as f:\n",
" ntrain, df, tweet_df = pickle.load(f)\n",
" \n",
" \n",
" def remove_URL(text):\n",
" url = re.compile(r'https?://\\S+|www\\.\\S+')\n",
" return url.sub(r'',text)\n",
" def remove_html(text):\n",
" html=re.compile(r'<.*?>')\n",
" return html.sub(r'',text)\n",
" def remove_emoji(text):\n",
" emoji_pattern = re.compile(\"[\"\n",
" u\"\\U0001F600-\\U0001F64F\" # emoticons\n",
" u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\n",
" u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\n",
" u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\n",
" u\"\\U00002702-\\U000027B0\"\n",
" u\"\\U000024C2-\\U0001F251\"\n",
" \"]+\", flags=re.UNICODE)\n",
" return emoji_pattern.sub(r'', text)\n",
" def remove_punct(text):\n",
" table=str.maketrans('','',string.punctuation)\n",
" return text.translate(table)\n",
" \n",
"\n",
" \n",
" \n",
" df['text'] = df['text'].apply(lambda x : remove_URL(x))\n",
" df['text'] = df['text'].apply(lambda x: remove_html(x))\n",
" df['text'] = df['text'].apply(lambda x: remove_emoji(x))\n",
" df['text'] = df['text'].apply(lambda x: remove_punct(x))\n",
"\n",
" \n",
" #creating the preprocess directory\n",
" os.makedirs(preprocess_data_path, exist_ok = True)\n",
" \n",
" with open(f'{preprocess_data_path}/df', 'wb') as f:\n",
" pickle.dump((ntrain, df, tweet_df), f)\n",
" return(print('Done!'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"preprocess_data_step = kfp.components.create_component_from_func(preprocess_data, packages_to_install=['pandas', 'regex', 'pickle5'], output_component_file='preprocess_data_component.yaml')"
]
},
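{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal local sanity check of the cleaning helpers defined inside `preprocess_data`, repeated here on a single invented tweet:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch of the same cleaning steps, on one made-up example string.\n",
"import re, string\n",
"\n",
"def remove_URL(text):\n",
"    return re.compile(r'https?://\\S+|www\\.\\S+').sub(r'', text)\n",
"\n",
"def remove_html(text):\n",
"    return re.compile(r'<.*?>').sub(r'', text)\n",
"\n",
"def remove_punct(text):\n",
"    return text.translate(str.maketrans('', '', string.punctuation))\n",
"\n",
"sample = 'Forest fire near La Ronge! <b>updates</b> at https://example.com/news'\n",
"print(remove_punct(remove_html(remove_URL(sample))))"
]
},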
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def corpus_creation(preprocess_data_path: comp.InputPath(str), corpus_path: comp.OutputPath(str)):\n",
" import os, pickle\n",
" import pandas as pd\n",
" import nltk\n",
" nltk.download('stopwords')\n",
" nltk.download('punkt')\n",
" from nltk.corpus import stopwords\n",
" from nltk.util import ngrams\n",
" from nltk.tokenize import word_tokenize\n",
" stop=set(stopwords.words('english'))\n",
" from tqdm import tqdm\n",
" \n",
" with open(f'{preprocess_data_path}/df', 'rb') as f:\n",
" ntrain, df, tweet_df = pickle.load(f)\n",
" \n",
" def create_corpus(df):\n",
" corpus=[]\n",
" for tweet in tqdm(df['text']):\n",
" words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]\n",
" corpus.append(words)\n",
" return corpus\n",
" \n",
" #creating the preprocess directory\n",
" os.makedirs(corpus_path, exist_ok = True)\n",
" \n",
" corpus=create_corpus(df)\n",
" with open(f'{corpus_path}/corpus', 'wb') as f:\n",
" pickle.dump((corpus,tweet_df), f)\n",
" return(print('Done!'))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"corpus_creation_step = kfp.components.create_component_from_func(corpus_creation, packages_to_install=['pandas', 'pickle5', 'nltk','tqdm'], output_component_file='corpus_creation_component.yaml')"
]
},
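{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the tokenize-and-filter rule used by `create_corpus`, applied to one sample sentence; this sketch assumes the `punkt` and `stopwords` NLTK data can be downloaded:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Local sketch of the corpus rule: lowercased alphabetic tokens minus stopwords.\n",
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"stop = set(stopwords.words('english'))\n",
"sentence = 'Our Deeds are the Reason of this earthquake'  # sample text\n",
"print([w.lower() for w in word_tokenize(sentence) if w.isalpha() and w not in stop])"
]
},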
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def embedding_step(download_link_glove: str,corpus_path: comp.InputPath(str), embedded_path: comp.OutputPath(str)):\n",
" \n",
" import os, pickle\n",
" import pandas as pd\n",
" import zipfile\n",
" import wget\n",
" import os\n",
" from keras.preprocessing.text import Tokenizer\n",
" from keras.utils import pad_sequences\n",
" import numpy as np\n",
" from tqdm import tqdm\n",
" \n",
" with open(f'{corpus_path}/corpus', 'rb') as f:\n",
" corpus, tweet_df = pickle.load(f)\n",
" \n",
" if not os.path.exists(embedded_path):\n",
" os.makedirs(embedded_path)\n",
" # download files\n",
" wget.download(download_link_glove, f'{embedded_path}/glove.6B.zip')\n",
" \n",
" with zipfile.ZipFile(f'{embedded_path}/glove.6B.zip', 'r') as zip_ref:\n",
" zip_ref.extractall(embedded_path)\n",
" \n",
" embedding_dict={}\n",
" \"\"\"path_to_glove_file = os.path.join(\n",
" os.path.expanduser(\"~\"), f\"{embedded_path}/glove.6B.100d.txt\"\n",
" )\"\"\"\n",
" with open(f\"{embedded_path}/glove.6B.100d.txt\",'r') as f:\n",
" for line in f:\n",
" values=line.split()\n",
" word=values[0]\n",
" vectors=np.asarray(values[1:],'float32')\n",
" embedding_dict[word]=vectors\n",
" f.close()\n",
" \n",
" MAX_LEN=50\n",
" tokenizer_obj=Tokenizer()\n",
" tokenizer_obj.fit_on_texts(corpus)\n",
" sequences=tokenizer_obj.texts_to_sequences(corpus)\n",
" \n",
" tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')\n",
" word_index=tokenizer_obj.word_index\n",
" num_words=len(word_index)+1\n",
" embedding_matrix=np.zeros((num_words,100))\n",
"\n",
" for word,i in tqdm(word_index.items()):\n",
" if i > num_words:\n",
" continue\n",
"\n",
" emb_vec=embedding_dict.get(word)\n",
" if emb_vec is not None:\n",
" embedding_matrix[i]=emb_vec\n",
" \n",
" with open(f'{embedded_path}/embedding', 'wb') as f:\n",
" pickle.dump((embedding_matrix, num_words, tweet_pad, tweet_df, MAX_LEN), f)\n",
" return(print('Done!'))\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"embedding_creation_step = kfp.components.create_component_from_func(embedding_step, packages_to_install=['pandas', 'zipfile36', 'wget','tqdm','keras','numpy','tensorflow', 'pickle5'], output_component_file='embedding_component.yaml')"
]
},
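{
"cell_type": "markdown",
"metadata": {},
"source": [
"The core of `embedding_step` is turning the tokenizer's `word_index` into a matrix whose row *i* holds the GloVe vector of word *i*. A toy reconstruction with made-up 3-d vectors in place of the real 100-d GloVe vectors:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy version of the embedding-matrix construction (vectors are made up).\n",
"import numpy as np\n",
"\n",
"embedding_dict = {'fire': np.array([0.1, 0.2, 0.3], dtype='float32'),\n",
"                  'flood': np.array([0.4, 0.5, 0.6], dtype='float32')}\n",
"word_index = {'fire': 1, 'flood': 2, 'unseen': 3}  # 1-based, as Keras produces\n",
"\n",
"num_words = len(word_index) + 1\n",
"embedding_matrix = np.zeros((num_words, 3))  # row 0 stays zero for padding\n",
"for word, i in word_index.items():\n",
"    vec = embedding_dict.get(word)\n",
"    if vec is not None:  # out-of-vocabulary words keep an all-zero row\n",
"        embedding_matrix[i] = vec\n",
"print(embedding_matrix)"
]
},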
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def model_building_and_training(embedded_path: comp.InputPath(str), model_path: comp.OutputPath(str)):\n",
" \n",
" import os, pickle;\n",
" import pandas as pd\n",
" import numpy as np\n",
" from keras.models import Sequential\n",
" from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D\n",
" from keras.initializers import Constant\n",
" from sklearn.model_selection import train_test_split\n",
" from tensorflow.keras.optimizers import Adam\n",
" \n",
" with open(f'{embedded_path}/embedding', 'rb') as f:\n",
" embedding_matrix, num_words, tweet_pad, tweet_df, MAX_LEN = pickle.load(f)\n",
" \n",
" train=tweet_pad[:tweet_df.shape[0]]\n",
" final_test=tweet_pad[tweet_df.shape[0]:]\n",
" X_train,X_test,y_train,y_test=train_test_split(train,tweet_df['target'].values,test_size=0.15)\n",
" \n",
" #defining model\n",
" model=Sequential()\n",
"\n",
" embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),\n",
" input_length=MAX_LEN,trainable=False)\n",
"\n",
" model.add(embedding)\n",
" model.add(SpatialDropout1D(0.2))\n",
" model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\n",
" model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"\n",
" optimzer=Adam(learning_rate=1e-5)\n",
" \n",
" #Compiling the classifier model with Adam optimizer\n",
" model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])\n",
" \n",
" #fitting the model\n",
" history=model.fit(X_train,y_train,batch_size=4,epochs=5,validation_data=(X_test,y_test),verbose=2)\n",
"\n",
" #evaluate model\n",
" test_loss, test_acc = model.evaluate(np.array(X_test), np.array(y_test), verbose=0)\n",
" print(\"Test_loss: {}, Test_accuracy: {} \".format(test_loss,test_acc))\n",
" \n",
" #creating the preprocess directory\n",
" os.makedirs(model_path, exist_ok = True)\n",
" \n",
" #saving the model\n",
" model.save(f'{model_path}/model.h5')\n",
" \n",
" #dumping other values\n",
" with open(f'{model_path}/values', 'wb') as f:\n",
" pickle.dump((test_loss, test_acc), f)\n",
" return(print('Done!'))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"model_building_step = kfp.components.create_component_from_func(model_building_and_training, packages_to_install=['pandas', 'zipfile36', 'wget','tqdm','keras','numpy','tensorflow','sklearn','pickle5'], output_component_file='model_creation_component.yaml')"
]
},
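{
"cell_type": "markdown",
"metadata": {},
"source": [
"After a run finishes, the trained network saved as `model.h5` can be reloaded for inference. A hedged sketch, assuming the component's output directory has been copied somewhere locally accessible (the path below is a placeholder):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: reload the trained model. 'model' is a placeholder path; inside\n",
"# the pipeline the file lives in the component's output volume.\n",
"from keras.models import load_model\n",
"\n",
"model_path = 'model'  # hypothetical local copy of the component output\n",
"model = load_model(f'{model_path}/model.h5')\n",
"model.summary()"
]
},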
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(\n",
" name='Trial pipeline',\n",
" description='An example pipeline that performs pd formation and plotting class distibution.'\n",
")\n",
"def trial_pipeline(\n",
" download_link: str,\n",
" data_path: str,\n",
" load_data_path: str, \n",
" preprocess_data_path: str,\n",
" corpus_path:str,\n",
" download_link_glove:str,\n",
" model_path:str,\n",
"):\n",
" download_container = download_op(download_link)\n",
" output1 = load_data_step(download_container.output)\n",
" output2 = preprocess_data_step(output1.output)\n",
" output3 = corpus_creation_step(output2.output)\n",
" output4 = embedding_creation_step(download_link_glove, output3.output)\n",
" output5 = model_building_step(output4.output)"
]
},
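{
"cell_type": "markdown",
"metadata": {},
"source": [
"Besides running the pipeline directly from this notebook, the same function can be compiled to a reusable YAML definition with the KFP v1 compiler:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compile the pipeline to a YAML package that can be uploaded via the KFP UI.\n",
"kfp.compiler.Compiler().compile(trial_pipeline, 'trial_pipeline.yaml')"
]
},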
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# replace download_link with the repo link where the data is stored https:github-repo/data-dir/{file}.csv.zip?raw=true\n",
"download_link = 'https://github.com/AnkitRai-22/natural-language-processing-with-disaster-tweets-kaggle-competition/blob/main/data/{file}.csv.zip?raw=true'\n",
"data_path = \"/mnt\"\n",
"load_data_path = \"load\"\n",
"preprocess_data_path = \"preprocess\"\n",
"corpus_path = \"corpus\",\n",
"download_link_glove = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n",
"model_path = \"model\""
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/experiments/details/6954af44-cdf8-47d7-bda1-211cbabdedec\" target=\"_blank\" >Experiment details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/runs/details/ba2fe0ed-6351-4b8c-bd7b-649fea61a5d9\" target=\"_blank\" >Run details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"RunPipelineResult(run_id=ba2fe0ed-6351-4b8c-bd7b-649fea61a5d9)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client = kfp.Client()\n",
"arguments = {\"download_link\": download_link,\n",
" \"data_path\": data_path,\n",
" \"load_data_path\": load_data_path,\n",
" \"preprocess_data_path\": preprocess_data_path,\n",
" \"corpus_path\":corpus_path,\n",
" \"download_link_glove\":download_link_glove,\n",
" \"model_path\":model_path,\n",
" }\n",
"client.create_run_from_pipeline_func(trial_pipeline, arguments=arguments)"
]
},
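{
"cell_type": "markdown",
"metadata": {},
"source": [
"To block until the run finishes, capture the result of `create_run_from_pipeline_func` and poll with `wait_for_run_completion`. A sketch (note that running this cell starts a new run):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: launch a run and wait for it to finish (timeout in seconds).\n",
"run_result = client.create_run_from_pipeline_func(trial_pipeline, arguments=arguments)\n",
"client.wait_for_run_completion(run_result.run_id, timeout=3600)"
]
},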
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"kubeflow_notebook": {
"autosnapshot": true,
"docker_image": "",
"experiment": {
"id": "",
"name": ""
},
"experiment_name": "",
"katib_metadata": {
"algorithm": {
"algorithmName": "grid"
},
"maxFailedTrialCount": 3,
"maxTrialCount": 12,
"objective": {
"objectiveMetricName": "",
"type": "minimize"
},
"parallelTrialCount": 3,
"parameters": []
},
"katib_run": false,
"pipeline_description": "",
"pipeline_name": "",
"snapshot_volumes": true,
"steps_defaults": [
"label:access-ml-pipeline:true",
"label:access-rok:true"
],
"volume_access_mode": "rwm",
"volumes": [
{
"annotations": [],
"mount_point": "/home/jovyan",
"name": "nlp-getting-started-vanilla-final-workspace-wwnkd",
"size": 14,
"size_type": "Gi",
"snapshot": false,
"type": "clone"
}
]
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}