mirror of https://github.com/kubeflow/examples.git
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: kfp in /home/jovyan/.local/lib/python3.6/site-packages (1.8.11)\n",
"Collecting kfp\n",
"  Using cached kfp-1.8.12.tar.gz (301 kB)\n",
"  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: absl-py<2,>=0.9 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.11.0)\n",
"Requirement already satisfied: PyYAML<6,>=5.3 in /usr/local/lib/python3.6/dist-packages (from kfp) (5.4.1)\n",
"Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.31.5)\n",
"Requirement already satisfied: google-cloud-storage<2,>=1.20.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.44.0)\n",
"Requirement already satisfied: kubernetes<19,>=8.0.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (12.0.1)\n",
"Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.12.8)\n",
"Requirement already satisfied: google-auth<2,>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.35.0)\n",
"Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.9.1)\n",
"Requirement already satisfied: cloudpickle<3,>=2.0.0 in /home/jovyan/.local/lib/python3.6/site-packages (from kfp) (2.0.0)\n",
"Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.7.1)\n",
"Requirement already satisfied: jsonschema<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
"Requirement already satisfied: tabulate<1,>=0.8.6 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.8.9)\n",
"Requirement already satisfied: click<9,>=7.1.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (7.1.2)\n",
"Requirement already satisfied: Deprecated<2,>=1.2.7 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.2.13)\n",
"Requirement already satisfied: strip-hints<1,>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.1.10)\n",
"Requirement already satisfied: docstring-parser<1,>=0.7.3 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.13)\n",
"Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.13 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.1.13)\n",
"Requirement already satisfied: protobuf<4,>=3.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.19.3)\n",
"Requirement already satisfied: fire<1,>=0.3.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.4.0)\n",
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from kfp) (0.8)\n",
"Requirement already satisfied: uritemplate<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.0.1)\n",
"Requirement already satisfied: typer<1.0,>=0.3.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.4.0)\n",
"Requirement already satisfied: typing-extensions<4,>=3.7.4 in /home/jovyan/.local/lib/python3.6/site-packages (from kfp) (3.7.4.3)\n",
"Requirement already satisfied: pydantic<2,>=1.8.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.9.0)\n",
"Requirement already satisfied: six in /home/jovyan/.local/lib/python3.6/site-packages (from absl-py<2,>=0.9->kfp) (1.15.0)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /home/jovyan/.local/lib/python3.6/site-packages (from Deprecated<2,>=1.2.7->kfp) (1.12.1)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from fire<1,>=0.3.1->kfp) (1.1.0)\n",
"Requirement already satisfied: google-auth-httplib2>=0.0.3 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client<2,>=1.7.8->kfp) (0.1.0)\n",
"Requirement already satisfied: httplib2<1dev,>=0.15.0 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client<2,>=1.7.8->kfp) (0.20.2)\n",
"Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (59.6.0)\n",
"Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (4.2.4)\n",
"Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (4.8)\n",
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (0.2.8)\n",
"Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.27.1)\n",
"Requirement already satisfied: google-cloud-core<3.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.2.1)\n",
"Requirement already satisfied: google-resumable-media<3.0dev,>=1.3.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.1.0)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (4.8.3)\n",
"Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (0.18.0)\n",
"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (20.3.0)\n",
"Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (2.8.2)\n",
"Requirement already satisfied: urllib3>=1.15 in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (1.26.8)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (2021.10.8)\n",
"Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<19,>=8.0.0->kfp) (1.2.3)\n",
"Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<19,>=8.0.0->kfp) (1.3.0)\n",
"Requirement already satisfied: wheel in /home/jovyan/.local/lib/python3.6/site-packages (from strip-hints<1,>=0.1.8->kfp) (0.37.1)\n",
"Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (2021.3)\n",
"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (1.54.0)\n",
"Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (21.3)\n",
"Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.6/dist-packages (from google-resumable-media<3.0dev,>=1.3.0->google-cloud-storage<2,>=1.20.0->kfp) (1.3.0)\n",
"Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/local/lib/python3.6/dist-packages (from httplib2<1dev,>=0.15.0->google-api-python-client<2,>=1.7.8->kfp) (3.0.6)\n",
"Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.1->kfp) (0.4.8)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage<2,>=1.20.0->kfp) (3.3)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage<2,>=1.20.0->kfp) (2.0.10)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata->jsonschema<4,>=3.0.1->kfp) (3.6.0)\n",
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<19,>=8.0.0->kfp) (3.1.1)\n"
]
}
],
"source": [
"!pip install kfp --upgrade "
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Import Kubeflow Pipelines libraries\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import kfp\n",
"import kfp.components as comp\n",
"import kfp.dsl as dsl\n",
"from kfp.components import InputPath, OutputPath"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"## Download Dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Download data\n",
"def download_data(download_link: str, data_path: OutputPath(str)):\n",
"    import zipfile\n",
"    import wget\n",
"    import os\n",
"\n",
"    if not os.path.exists(data_path):\n",
"        os.makedirs(data_path)\n",
"\n",
"    # download the train and test archives\n",
"    wget.download(download_link.format(file='train'), f'{data_path}/train_csv.zip')\n",
"    wget.download(download_link.format(file='test'), f'{data_path}/test_csv.zip')\n",
"\n",
"    with zipfile.ZipFile(f\"{data_path}/train_csv.zip\", \"r\") as zip_ref:\n",
"        zip_ref.extractall(data_path)\n",
"\n",
"    with zipfile.ZipFile(f\"{data_path}/test_csv.zip\", \"r\") as zip_ref:\n",
"        zip_ref.extractall(data_path)\n",
"\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"download_op = kfp.components.create_component_from_func(download_data, base_image=\"python:3.7\", packages_to_install=['wget', 'zipfile36'])"
]
},
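{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before wiring `download_data` into the pipeline, it can help to smoke-test the plain Python function directly. The cell below is a minimal sketch, not part of the pipeline: it assumes the `wget` package is installed in this notebook environment (`pip install wget`), and `/tmp/tweets_data` is just a hypothetical scratch directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional local smoke test (illustrative). Assumes the wget package is installed\n",
"# in this environment; /tmp/tweets_data is a hypothetical scratch path.\n",
"import os\n",
"\n",
"sample_link = 'https://github.com/AnkitRai-22/natural-language-processing-with-disaster-tweets-kaggle-competition/blob/main/data/{file}.csv.zip?raw=true'\n",
"download_data(sample_link, '/tmp/tweets_data')\n",
"print(os.listdir('/tmp/tweets_data'))"
]
},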
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Load Data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def load_data(data_path: comp.InputPath(str), load_data_path: comp.OutputPath()):\n",
"    import pandas as pd\n",
"    import os, pickle\n",
"\n",
"    train_data_path = data_path + '/train.csv'\n",
"    test_data_path = data_path + '/test.csv'\n",
"    tweet_df = pd.read_csv(train_data_path)\n",
"    test_df = pd.read_csv(test_data_path)\n",
"\n",
"    # join train and test together so they are cleaned consistently\n",
"    df = pd.concat([tweet_df, test_df])\n",
"\n",
"    # creating the output directory\n",
"    os.makedirs(load_data_path, exist_ok=True)\n",
"\n",
"    # record the original train/test sizes so the frames can be split again later\n",
"    ntrain = tweet_df.shape[0]\n",
"    ntest = test_df.shape[0]\n",
"    with open(f'{load_data_path}/df', 'wb') as f:\n",
"        pickle.dump((ntrain, df, tweet_df), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"load_data_step = kfp.components.create_component_from_func(\n",
"    func=load_data,\n",
"    output_component_file='load_data_component.yaml',  # optional: saves the component spec for future use\n",
"    base_image='python:3.7',\n",
"    packages_to_install=['pandas', 'pickle5'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_data(load_data_path: comp.InputPath(str), preprocess_data_path: comp.OutputPath(str)):\n",
"    import re\n",
"    import pandas as pd\n",
"    import os, pickle\n",
"    import string\n",
"\n",
"    # loading the combined train+test data\n",
"    with open(f'{load_data_path}/df', 'rb') as f:\n",
"        ntrain, df, tweet_df = pickle.load(f)\n",
"\n",
"    def remove_URL(text):\n",
"        url = re.compile(r'https?://\\S+|www\\.\\S+')\n",
"        return url.sub(r'', text)\n",
"\n",
"    def remove_html(text):\n",
"        html = re.compile(r'<.*?>')\n",
"        return html.sub(r'', text)\n",
"\n",
"    def remove_emoji(text):\n",
"        emoji_pattern = re.compile(\"[\"\n",
"            u\"\\U0001F600-\\U0001F64F\"  # emoticons\n",
"            u\"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n",
"            u\"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n",
"            u\"\\U0001F1E0-\\U0001F1FF\"  # flags (iOS)\n",
"            u\"\\U00002702-\\U000027B0\"\n",
"            u\"\\U000024C2-\\U0001F251\"\n",
"            \"]+\", flags=re.UNICODE)\n",
"        return emoji_pattern.sub(r'', text)\n",
"\n",
"    def remove_punct(text):\n",
"        table = str.maketrans('', '', string.punctuation)\n",
"        return text.translate(table)\n",
"\n",
"    df['text'] = df['text'].apply(remove_URL)\n",
"    df['text'] = df['text'].apply(remove_html)\n",
"    df['text'] = df['text'].apply(remove_emoji)\n",
"    df['text'] = df['text'].apply(remove_punct)\n",
"\n",
"    # creating the preprocess directory\n",
"    os.makedirs(preprocess_data_path, exist_ok=True)\n",
"\n",
"    with open(f'{preprocess_data_path}/df', 'wb') as f:\n",
"        pickle.dump((ntrain, df, tweet_df), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"preprocess_data_step = kfp.components.create_component_from_func(preprocess_data, packages_to_install=['pandas', 'regex', 'pickle5'], output_component_file='preprocess_data_component.yaml')"
]
},
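{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see what the cleaning helpers inside `preprocess_data` actually do, here is a quick standalone illustration: two of the helpers are re-defined outside the component and applied to a made-up sample tweet."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Standalone re-definitions of two cleaning helpers from preprocess_data,\n",
"# applied to a made-up sample tweet (illustration only).\n",
"import re\n",
"import string\n",
"\n",
"def remove_URL(text):\n",
"    url = re.compile(r'https?://\\S+|www\\.\\S+')\n",
"    return url.sub(r'', text)\n",
"\n",
"def remove_punct(text):\n",
"    table = str.maketrans('', '', string.punctuation)\n",
"    return text.translate(table)\n",
"\n",
"sample = 'Forest fire near La Ronge!! see https://example.com/news'\n",
"print(remove_punct(remove_URL(sample)))"
]
},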
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def corpus_creation(preprocess_data_path: comp.InputPath(str), corpus_path: comp.OutputPath(str)):\n",
"    import os, pickle\n",
"    import pandas as pd\n",
"    import nltk\n",
"    from nltk.corpus import stopwords\n",
"    from nltk.tokenize import word_tokenize\n",
"    from tqdm import tqdm\n",
"\n",
"    nltk.download('stopwords')\n",
"    nltk.download('punkt')\n",
"    stop = set(stopwords.words('english'))\n",
"\n",
"    with open(f'{preprocess_data_path}/df', 'rb') as f:\n",
"        ntrain, df, tweet_df = pickle.load(f)\n",
"\n",
"    def create_corpus(df):\n",
"        corpus = []\n",
"        for tweet in tqdm(df['text']):\n",
"            words = [word.lower() for word in word_tokenize(tweet) if word.isalpha() and word not in stop]\n",
"            corpus.append(words)\n",
"        return corpus\n",
"\n",
"    # creating the corpus directory\n",
"    os.makedirs(corpus_path, exist_ok=True)\n",
"\n",
"    corpus = create_corpus(df)\n",
"    with open(f'{corpus_path}/corpus', 'wb') as f:\n",
"        pickle.dump((corpus, tweet_df), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"corpus_creation_step = kfp.components.create_component_from_func(corpus_creation, packages_to_install=['pandas', 'pickle5', 'nltk', 'tqdm'], output_component_file='corpus_creation_component.yaml')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def embedding_step(download_link_glove: str, corpus_path: comp.InputPath(str), embedded_path: comp.OutputPath(str)):\n",
"    import os, pickle\n",
"    import zipfile\n",
"    import wget\n",
"    import numpy as np\n",
"    import pandas as pd\n",
"    from keras.preprocessing.text import Tokenizer\n",
"    from keras.utils import pad_sequences\n",
"    from tqdm import tqdm\n",
"\n",
"    with open(f'{corpus_path}/corpus', 'rb') as f:\n",
"        corpus, tweet_df = pickle.load(f)\n",
"\n",
"    if not os.path.exists(embedded_path):\n",
"        os.makedirs(embedded_path)\n",
"\n",
"    # download and unpack the pre-trained GloVe vectors\n",
"    wget.download(download_link_glove, f'{embedded_path}/glove.6B.zip')\n",
"\n",
"    with zipfile.ZipFile(f'{embedded_path}/glove.6B.zip', 'r') as zip_ref:\n",
"        zip_ref.extractall(embedded_path)\n",
"\n",
"    # build a word -> vector lookup from the 100-d GloVe file\n",
"    embedding_dict = {}\n",
"    with open(f\"{embedded_path}/glove.6B.100d.txt\", 'r') as f:\n",
"        for line in f:\n",
"            values = line.split()\n",
"            word = values[0]\n",
"            vectors = np.asarray(values[1:], 'float32')\n",
"            embedding_dict[word] = vectors\n",
"\n",
"    # tokenize the corpus and pad every tweet to a fixed length\n",
"    MAX_LEN = 50\n",
"    tokenizer_obj = Tokenizer()\n",
"    tokenizer_obj.fit_on_texts(corpus)\n",
"    sequences = tokenizer_obj.texts_to_sequences(corpus)\n",
"\n",
"    tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')\n",
"    word_index = tokenizer_obj.word_index\n",
"    num_words = len(word_index) + 1\n",
"    embedding_matrix = np.zeros((num_words, 100))\n",
"\n",
"    for word, i in tqdm(word_index.items()):\n",
"        if i > num_words:\n",
"            continue\n",
"        emb_vec = embedding_dict.get(word)\n",
"        if emb_vec is not None:\n",
"            embedding_matrix[i] = emb_vec\n",
"\n",
"    with open(f'{embedded_path}/embedding', 'wb') as f:\n",
"        pickle.dump((embedding_matrix, num_words, tweet_pad, tweet_df, MAX_LEN), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"embedding_creation_step = kfp.components.create_component_from_func(embedding_step, packages_to_install=['pandas', 'zipfile36', 'wget', 'tqdm', 'keras', 'numpy', 'tensorflow', 'pickle5'], output_component_file='embedding_component.yaml')"
]
},
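{
"cell_type": "markdown",
"metadata": {},
"source": [
"The heart of `embedding_step` is the loop that turns the tokenizer word index plus the GloVe vectors into a dense matrix. Below is a toy-scale sketch of that same logic with a made-up two-word vocabulary and 4-dimensional vectors, so the shapes are easy to inspect (the real component uses 100-dimensional GloVe vectors)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy-scale illustration of the embedding-matrix construction in embedding_step.\n",
"# The vocabulary and 4-d vectors are made up for demonstration.\n",
"import numpy as np\n",
"\n",
"embedding_dict = {'fire': np.array([0.1, 0.2, 0.3, 0.4], dtype='float32')}\n",
"word_index = {'fire': 1, 'flood': 2}  # Keras-style word index, 1-based\n",
"num_words = len(word_index) + 1       # row 0 stays all-zero (padding index)\n",
"\n",
"embedding_matrix = np.zeros((num_words, 4))\n",
"for word, i in word_index.items():\n",
"    emb_vec = embedding_dict.get(word)  # None for out-of-vocabulary words\n",
"    if emb_vec is not None:\n",
"        embedding_matrix[i] = emb_vec\n",
"\n",
"print(embedding_matrix)  # row 1 holds the 'fire' vector; rows 0 and 2 are zeros"
]
},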
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def model_building_and_training(embedded_path: comp.InputPath(str), model_path: comp.OutputPath(str)):\n",
"    import os, pickle\n",
"    import pandas as pd\n",
"    import numpy as np\n",
"    from keras.models import Sequential\n",
"    from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D\n",
"    from keras.initializers import Constant\n",
"    from sklearn.model_selection import train_test_split\n",
"    from tensorflow.keras.optimizers import Adam\n",
"\n",
"    with open(f'{embedded_path}/embedding', 'rb') as f:\n",
"        embedding_matrix, num_words, tweet_pad, tweet_df, MAX_LEN = pickle.load(f)\n",
"\n",
"    # the first rows are the labelled training tweets; the rest is the unlabelled test set\n",
"    train = tweet_pad[:tweet_df.shape[0]]\n",
"    final_test = tweet_pad[tweet_df.shape[0]:]\n",
"    X_train, X_test, y_train, y_test = train_test_split(train, tweet_df['target'].values, test_size=0.15)\n",
"\n",
"    # defining the model\n",
"    model = Sequential()\n",
"\n",
"    embedding = Embedding(num_words, 100, embeddings_initializer=Constant(embedding_matrix),\n",
"                          input_length=MAX_LEN, trainable=False)\n",
"\n",
"    model.add(embedding)\n",
"    model.add(SpatialDropout1D(0.2))\n",
"    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\n",
"    model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"    optimizer = Adam(learning_rate=1e-5)\n",
"\n",
"    # compiling the classifier with the Adam optimizer\n",
"    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])\n",
"\n",
"    # fitting the model\n",
"    history = model.fit(X_train, y_train, batch_size=4, epochs=5, validation_data=(X_test, y_test), verbose=2)\n",
"\n",
"    # evaluating the model\n",
"    test_loss, test_acc = model.evaluate(np.array(X_test), np.array(y_test), verbose=0)\n",
"    print(\"Test_loss: {}, Test_accuracy: {}\".format(test_loss, test_acc))\n",
"\n",
"    # creating the model directory\n",
"    os.makedirs(model_path, exist_ok=True)\n",
"\n",
"    # saving the model\n",
"    model.save(f'{model_path}/model.h5')\n",
"\n",
"    # dumping the evaluation metrics\n",
"    with open(f'{model_path}/values', 'wb') as f:\n",
"        pickle.dump((test_loss, test_acc), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"model_building_step = kfp.components.create_component_from_func(model_building_and_training, packages_to_install=['pandas', 'zipfile36', 'wget', 'tqdm', 'keras', 'numpy', 'tensorflow', 'scikit-learn', 'pickle5'], output_component_file='model_creation_component.yaml')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(\n",
"    name='Trial pipeline',\n",
"    description='An example NLP pipeline that downloads the disaster-tweets data, cleans it, builds GloVe embeddings and trains an LSTM classifier.'\n",
")\n",
"def trial_pipeline(\n",
"    download_link: str,\n",
"    data_path: str,\n",
"    load_data_path: str,\n",
"    preprocess_data_path: str,\n",
"    corpus_path: str,\n",
"    download_link_glove: str,\n",
"    model_path: str,\n",
"):\n",
"    download_container = download_op(download_link)\n",
"    output1 = load_data_step(download_container.output)\n",
"    output2 = preprocess_data_step(output1.output)\n",
"    output3 = corpus_creation_step(output2.output)\n",
"    output4 = embedding_creation_step(download_link_glove, output3.output)\n",
"    output5 = model_building_step(output4.output)"
]
},
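{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of (or in addition to) submitting the run straight from the SDK in the cells below, the pipeline can be compiled into a portable package and uploaded through the Kubeflow Pipelines UI. A minimal sketch with the v1 SDK compiler; the output filename is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: compile the pipeline function to a reusable package that can be\n",
"# uploaded via the Kubeflow Pipelines UI. The filename is arbitrary.\n",
"kfp.compiler.Compiler().compile(trial_pipeline, 'trial_pipeline.yaml')"
]
},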
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# replace download_link with the repo link where the data is stored: https://github-repo/data-dir/{file}.csv.zip?raw=true\n",
"download_link = 'https://github.com/AnkitRai-22/natural-language-processing-with-disaster-tweets-kaggle-competition/blob/main/data/{file}.csv.zip?raw=true'\n",
"data_path = \"/mnt\"\n",
"load_data_path = \"load\"\n",
"preprocess_data_path = \"preprocess\"\n",
"corpus_path = \"corpus\"\n",
"download_link_glove = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n",
"model_path = \"model\""
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/experiments/details/6954af44-cdf8-47d7-bda1-211cbabdedec\" target=\"_blank\" >Experiment details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/runs/details/ba2fe0ed-6351-4b8c-bd7b-649fea61a5d9\" target=\"_blank\" >Run details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"RunPipelineResult(run_id=ba2fe0ed-6351-4b8c-bd7b-649fea61a5d9)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client = kfp.Client()\n",
"arguments = {\"download_link\": download_link,\n",
"             \"data_path\": data_path,\n",
"             \"load_data_path\": load_data_path,\n",
"             \"preprocess_data_path\": preprocess_data_path,\n",
"             \"corpus_path\": corpus_path,\n",
"             \"download_link_glove\": download_link_glove,\n",
"             \"model_path\": model_path,\n",
"             }\n",
"client.create_run_from_pipeline_func(trial_pipeline, arguments=arguments)"
]
},
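{
"cell_type": "markdown",
"metadata": {},
"source": [
"`create_run_from_pipeline_func` returns immediately while the run executes in the cluster. If you want the notebook to block until the run finishes, the v1 SDK client can poll for completion. A minimal sketch; note it submits a new run, and the timeout (in seconds) is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: submit a run and block until it completes (this starts a new run;\n",
"# the 3600-second timeout is arbitrary).\n",
"run_result = client.create_run_from_pipeline_func(trial_pipeline, arguments=arguments)\n",
"run_detail = client.wait_for_run_completion(run_result.run_id, timeout=3600)\n",
"print(run_detail.run.status)"
]
},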
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"kubeflow_notebook": {
"autosnapshot": true,
"docker_image": "",
"experiment": {
"id": "",
"name": ""
},
"experiment_name": "",
"katib_metadata": {
"algorithm": {
"algorithmName": "grid"
},
"maxFailedTrialCount": 3,
"maxTrialCount": 12,
"objective": {
"objectiveMetricName": "",
"type": "minimize"
},
"parallelTrialCount": 3,
"parameters": []
},
"katib_run": false,
"pipeline_description": "",
"pipeline_name": "",
"snapshot_volumes": true,
"steps_defaults": [
"label:access-ml-pipeline:true",
"label:access-rok:true"
],
"volume_access_mode": "rwm",
"volumes": [
{
"annotations": [],
"mount_point": "/home/jovyan",
"name": "nlp-getting-started-vanilla-final-workspace-wwnkd",
"size": 14,
"size_type": "Gi",
"snapshot": false,
"type": "clone"
}
]
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}