mirror of https://github.com/kubeflow/examples.git
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: kfp in /home/jovyan/.local/lib/python3.6/site-packages (1.8.11)\n",
"Collecting kfp\n",
"  Using cached kfp-1.8.12.tar.gz (301 kB)\n",
"  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: absl-py<2,>=0.9 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.11.0)\n",
"Requirement already satisfied: PyYAML<6,>=5.3 in /usr/local/lib/python3.6/dist-packages (from kfp) (5.4.1)\n",
"Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.31.5)\n",
"Requirement already satisfied: google-cloud-storage<2,>=1.20.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.44.0)\n",
"Requirement already satisfied: kubernetes<19,>=8.0.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (12.0.1)\n",
"Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.12.8)\n",
"Requirement already satisfied: google-auth<2,>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.35.0)\n",
"Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.9.1)\n",
"Requirement already satisfied: cloudpickle<3,>=2.0.0 in /home/jovyan/.local/lib/python3.6/site-packages (from kfp) (2.0.0)\n",
"Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.7.1)\n",
"Requirement already satisfied: jsonschema<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
"Requirement already satisfied: tabulate<1,>=0.8.6 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.8.9)\n",
"Requirement already satisfied: click<9,>=7.1.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (7.1.2)\n",
"Requirement already satisfied: Deprecated<2,>=1.2.7 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.2.13)\n",
"Requirement already satisfied: strip-hints<1,>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.1.10)\n",
"Requirement already satisfied: docstring-parser<1,>=0.7.3 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.13)\n",
"Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.13 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.1.13)\n",
"Requirement already satisfied: protobuf<4,>=3.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.19.3)\n",
"Requirement already satisfied: fire<1,>=0.3.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.4.0)\n",
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from kfp) (0.8)\n",
"Requirement already satisfied: uritemplate<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.0.1)\n",
"Requirement already satisfied: typer<1.0,>=0.3.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (0.4.0)\n",
"Requirement already satisfied: typing-extensions<4,>=3.7.4 in /home/jovyan/.local/lib/python3.6/site-packages (from kfp) (3.7.4.3)\n",
"Requirement already satisfied: pydantic<2,>=1.8.2 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.9.0)\n",
"Requirement already satisfied: six in /home/jovyan/.local/lib/python3.6/site-packages (from absl-py<2,>=0.9->kfp) (1.15.0)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /home/jovyan/.local/lib/python3.6/site-packages (from Deprecated<2,>=1.2.7->kfp) (1.12.1)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (from fire<1,>=0.3.1->kfp) (1.1.0)\n",
"Requirement already satisfied: google-auth-httplib2>=0.0.3 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client<2,>=1.7.8->kfp) (0.1.0)\n",
"Requirement already satisfied: httplib2<1dev,>=0.15.0 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client<2,>=1.7.8->kfp) (0.20.2)\n",
"Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (59.6.0)\n",
"Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (4.2.4)\n",
"Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (4.8)\n",
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.1->kfp) (0.2.8)\n",
"Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.27.1)\n",
"Requirement already satisfied: google-cloud-core<3.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.2.1)\n",
"Requirement already satisfied: google-resumable-media<3.0dev,>=1.3.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage<2,>=1.20.0->kfp) (2.1.0)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (4.8.3)\n",
"Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (0.18.0)\n",
"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema<4,>=3.0.1->kfp) (20.3.0)\n",
"Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (2.8.2)\n",
"Requirement already satisfied: urllib3>=1.15 in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (1.26.8)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp) (2021.10.8)\n",
"Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<19,>=8.0.0->kfp) (1.2.3)\n",
"Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<19,>=8.0.0->kfp) (1.3.0)\n",
"Requirement already satisfied: wheel in /home/jovyan/.local/lib/python3.6/site-packages (from strip-hints<1,>=0.1.8->kfp) (0.37.1)\n",
"Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (2021.3)\n",
"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (1.54.0)\n",
"Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.6/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp) (21.3)\n",
"Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.6/dist-packages (from google-resumable-media<3.0dev,>=1.3.0->google-cloud-storage<2,>=1.20.0->kfp) (1.3.0)\n",
"Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/local/lib/python3.6/dist-packages (from httplib2<1dev,>=0.15.0->google-api-python-client<2,>=1.7.8->kfp) (3.0.6)\n",
"Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.1->kfp) (0.4.8)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage<2,>=1.20.0->kfp) (3.3)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage<2,>=1.20.0->kfp) (2.0.10)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata->jsonschema<4,>=3.0.1->kfp) (3.6.0)\n",
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<19,>=8.0.0->kfp) (3.1.1)\n"
]
}
],
"source": [
"!pip install kfp --upgrade "
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Import Kubeflow Pipelines libraries\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import kfp\n",
"import kfp.components as comp\n",
"import kfp.dsl as dsl\n",
"from kfp.components import InputPath, OutputPath"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"## Download Dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Download data\n",
"def download_data(download_link: str, data_path: OutputPath(str)):\n",
"    import zipfile\n",
"    import wget\n",
"    import os\n",
"\n",
"    if not os.path.exists(data_path):\n",
"        os.makedirs(data_path)\n",
"\n",
"    # download the train and test archives\n",
"    wget.download(download_link.format(file='train'), f'{data_path}/train_csv.zip')\n",
"    wget.download(download_link.format(file='test'), f'{data_path}/test_csv.zip')\n",
"\n",
"    with zipfile.ZipFile(f\"{data_path}/train_csv.zip\", \"r\") as zip_ref:\n",
"        zip_ref.extractall(data_path)\n",
"\n",
"    with zipfile.ZipFile(f\"{data_path}/test_csv.zip\", \"r\") as zip_ref:\n",
"        zip_ref.extractall(data_path)\n",
"\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"download_op = kfp.components.create_component_from_func(download_data, base_image=\"python:3.7\", packages_to_install=['wget', 'zipfile36'])"
]
},
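{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before wiring `download_data` into the pipeline, it can help to smoke-test the plain Python function directly. The cell below is a minimal sketch, not part of the pipeline: it assumes the `wget` package is installed in this notebook environment (`pip install wget`), and `/tmp/tweets_data` is just a hypothetical scratch directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional local smoke test (illustrative). Assumes the wget package is installed\n",
"# in this environment; /tmp/tweets_data is a hypothetical scratch path.\n",
"import os\n",
"\n",
"sample_link = 'https://github.com/AnkitRai-22/natural-language-processing-with-disaster-tweets-kaggle-competition/blob/main/data/{file}.csv.zip?raw=true'\n",
"download_data(sample_link, '/tmp/tweets_data')\n",
"print(os.listdir('/tmp/tweets_data'))"
]
},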
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Load Data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def load_data(data_path: comp.InputPath(str), load_data_path: comp.OutputPath()):\n",
"    import pandas as pd\n",
"    import os, pickle\n",
"\n",
"    train_data_path = data_path + '/train.csv'\n",
"    test_data_path = data_path + '/test.csv'\n",
"    tweet_df = pd.read_csv(train_data_path)\n",
"    test_df = pd.read_csv(test_data_path)\n",
"\n",
"    # join train and test together so they are cleaned consistently\n",
"    df = pd.concat([tweet_df, test_df])\n",
"\n",
"    # creating the output directory\n",
"    os.makedirs(load_data_path, exist_ok=True)\n",
"\n",
"    # record the original train/test sizes so the frames can be split again later\n",
"    ntrain = tweet_df.shape[0]\n",
"    ntest = test_df.shape[0]\n",
"    with open(f'{load_data_path}/df', 'wb') as f:\n",
"        pickle.dump((ntrain, df, tweet_df), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"load_data_step = kfp.components.create_component_from_func(\n",
"    func=load_data,\n",
"    output_component_file='load_data_component.yaml',  # optional: saves the component spec for future use\n",
"    base_image='python:3.7',\n",
"    packages_to_install=['pandas', 'pickle5'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_data(load_data_path: comp.InputPath(str), preprocess_data_path: comp.OutputPath(str)):\n",
"    import re\n",
"    import pandas as pd\n",
"    import os, pickle\n",
"    import string\n",
"\n",
"    # loading the combined train+test data\n",
"    with open(f'{load_data_path}/df', 'rb') as f:\n",
"        ntrain, df, tweet_df = pickle.load(f)\n",
"\n",
"    def remove_URL(text):\n",
"        url = re.compile(r'https?://\\S+|www\\.\\S+')\n",
"        return url.sub(r'', text)\n",
"\n",
"    def remove_html(text):\n",
"        html = re.compile(r'<.*?>')\n",
"        return html.sub(r'', text)\n",
"\n",
"    def remove_emoji(text):\n",
"        emoji_pattern = re.compile(\"[\"\n",
"            u\"\\U0001F600-\\U0001F64F\"  # emoticons\n",
"            u\"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n",
"            u\"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n",
"            u\"\\U0001F1E0-\\U0001F1FF\"  # flags (iOS)\n",
"            u\"\\U00002702-\\U000027B0\"\n",
"            u\"\\U000024C2-\\U0001F251\"\n",
"            \"]+\", flags=re.UNICODE)\n",
"        return emoji_pattern.sub(r'', text)\n",
"\n",
"    def remove_punct(text):\n",
"        table = str.maketrans('', '', string.punctuation)\n",
"        return text.translate(table)\n",
"\n",
"    df['text'] = df['text'].apply(remove_URL)\n",
"    df['text'] = df['text'].apply(remove_html)\n",
"    df['text'] = df['text'].apply(remove_emoji)\n",
"    df['text'] = df['text'].apply(remove_punct)\n",
"\n",
"    # creating the preprocess directory\n",
"    os.makedirs(preprocess_data_path, exist_ok=True)\n",
"\n",
"    with open(f'{preprocess_data_path}/df', 'wb') as f:\n",
"        pickle.dump((ntrain, df, tweet_df), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"preprocess_data_step = kfp.components.create_component_from_func(preprocess_data, packages_to_install=['pandas', 'regex', 'pickle5'], output_component_file='preprocess_data_component.yaml')"
]
},
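{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see what the cleaning helpers inside `preprocess_data` actually do, here is a quick standalone illustration: two of the helpers are re-defined outside the component and applied to a made-up sample tweet."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Standalone re-definitions of two cleaning helpers from preprocess_data,\n",
"# applied to a made-up sample tweet (illustration only).\n",
"import re\n",
"import string\n",
"\n",
"def remove_URL(text):\n",
"    url = re.compile(r'https?://\\S+|www\\.\\S+')\n",
"    return url.sub(r'', text)\n",
"\n",
"def remove_punct(text):\n",
"    table = str.maketrans('', '', string.punctuation)\n",
"    return text.translate(table)\n",
"\n",
"sample = 'Forest fire near La Ronge!! see https://example.com/news'\n",
"print(remove_punct(remove_URL(sample)))"
]
},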
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def corpus_creation(preprocess_data_path: comp.InputPath(str), corpus_path: comp.OutputPath(str)):\n",
"    import os, pickle\n",
"    import pandas as pd\n",
"    import nltk\n",
"    from nltk.corpus import stopwords\n",
"    from nltk.tokenize import word_tokenize\n",
"    from tqdm import tqdm\n",
"\n",
"    nltk.download('stopwords')\n",
"    nltk.download('punkt')\n",
"    stop = set(stopwords.words('english'))\n",
"\n",
"    with open(f'{preprocess_data_path}/df', 'rb') as f:\n",
"        ntrain, df, tweet_df = pickle.load(f)\n",
"\n",
"    def create_corpus(df):\n",
"        corpus = []\n",
"        for tweet in tqdm(df['text']):\n",
"            words = [word.lower() for word in word_tokenize(tweet) if word.isalpha() and word not in stop]\n",
"            corpus.append(words)\n",
"        return corpus\n",
"\n",
"    # creating the corpus directory\n",
"    os.makedirs(corpus_path, exist_ok=True)\n",
"\n",
"    corpus = create_corpus(df)\n",
"    with open(f'{corpus_path}/corpus', 'wb') as f:\n",
"        pickle.dump((corpus, tweet_df), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"corpus_creation_step = kfp.components.create_component_from_func(corpus_creation, packages_to_install=['pandas', 'pickle5', 'nltk', 'tqdm'], output_component_file='corpus_creation_component.yaml')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def embedding_step(download_link_glove: str, corpus_path: comp.InputPath(str), embedded_path: comp.OutputPath(str)):\n",
"    import os, pickle\n",
"    import zipfile\n",
"    import wget\n",
"    import numpy as np\n",
"    import pandas as pd\n",
"    from keras.preprocessing.text import Tokenizer\n",
"    from keras.utils import pad_sequences\n",
"    from tqdm import tqdm\n",
"\n",
"    with open(f'{corpus_path}/corpus', 'rb') as f:\n",
"        corpus, tweet_df = pickle.load(f)\n",
"\n",
"    if not os.path.exists(embedded_path):\n",
"        os.makedirs(embedded_path)\n",
"\n",
"    # download and unpack the pre-trained GloVe vectors\n",
"    wget.download(download_link_glove, f'{embedded_path}/glove.6B.zip')\n",
"\n",
"    with zipfile.ZipFile(f'{embedded_path}/glove.6B.zip', 'r') as zip_ref:\n",
"        zip_ref.extractall(embedded_path)\n",
"\n",
"    # build a word -> vector lookup from the 100-d GloVe file\n",
"    embedding_dict = {}\n",
"    with open(f\"{embedded_path}/glove.6B.100d.txt\", 'r') as f:\n",
"        for line in f:\n",
"            values = line.split()\n",
"            word = values[0]\n",
"            vectors = np.asarray(values[1:], 'float32')\n",
"            embedding_dict[word] = vectors\n",
"\n",
"    # tokenize the corpus and pad every tweet to a fixed length\n",
"    MAX_LEN = 50\n",
"    tokenizer_obj = Tokenizer()\n",
"    tokenizer_obj.fit_on_texts(corpus)\n",
"    sequences = tokenizer_obj.texts_to_sequences(corpus)\n",
"\n",
"    tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')\n",
"    word_index = tokenizer_obj.word_index\n",
"    num_words = len(word_index) + 1\n",
"    embedding_matrix = np.zeros((num_words, 100))\n",
"\n",
"    for word, i in tqdm(word_index.items()):\n",
"        if i > num_words:\n",
"            continue\n",
"        emb_vec = embedding_dict.get(word)\n",
"        if emb_vec is not None:\n",
"            embedding_matrix[i] = emb_vec\n",
"\n",
"    with open(f'{embedded_path}/embedding', 'wb') as f:\n",
"        pickle.dump((embedding_matrix, num_words, tweet_pad, tweet_df, MAX_LEN), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"embedding_creation_step = kfp.components.create_component_from_func(embedding_step, packages_to_install=['pandas', 'zipfile36', 'wget', 'tqdm', 'keras', 'numpy', 'tensorflow', 'pickle5'], output_component_file='embedding_component.yaml')"
]
},
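{
"cell_type": "markdown",
"metadata": {},
"source": [
"The heart of `embedding_step` is the loop that turns the tokenizer word index plus the GloVe vectors into a dense matrix. Below is a toy-scale sketch of that same logic with a made-up two-word vocabulary and 4-dimensional vectors, so the shapes are easy to inspect (the real component uses 100-dimensional GloVe vectors)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy-scale illustration of the embedding-matrix construction in embedding_step.\n",
"# The vocabulary and 4-d vectors are made up for demonstration.\n",
"import numpy as np\n",
"\n",
"embedding_dict = {'fire': np.array([0.1, 0.2, 0.3, 0.4], dtype='float32')}\n",
"word_index = {'fire': 1, 'flood': 2}  # Keras-style word index, 1-based\n",
"num_words = len(word_index) + 1       # row 0 stays all-zero (padding index)\n",
"\n",
"embedding_matrix = np.zeros((num_words, 4))\n",
"for word, i in word_index.items():\n",
"    emb_vec = embedding_dict.get(word)  # None for out-of-vocabulary words\n",
"    if emb_vec is not None:\n",
"        embedding_matrix[i] = emb_vec\n",
"\n",
"print(embedding_matrix)  # row 1 holds the 'fire' vector; rows 0 and 2 are zeros"
]
},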
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def model_building_and_training(embedded_path: comp.InputPath(str), model_path: comp.OutputPath(str)):\n",
"    import os, pickle\n",
"    import pandas as pd\n",
"    import numpy as np\n",
"    from keras.models import Sequential\n",
"    from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D\n",
"    from keras.initializers import Constant\n",
"    from sklearn.model_selection import train_test_split\n",
"    from tensorflow.keras.optimizers import Adam\n",
"\n",
"    with open(f'{embedded_path}/embedding', 'rb') as f:\n",
"        embedding_matrix, num_words, tweet_pad, tweet_df, MAX_LEN = pickle.load(f)\n",
"\n",
"    # the first rows are the labelled training tweets; the rest is the unlabelled test set\n",
"    train = tweet_pad[:tweet_df.shape[0]]\n",
"    final_test = tweet_pad[tweet_df.shape[0]:]\n",
"    X_train, X_test, y_train, y_test = train_test_split(train, tweet_df['target'].values, test_size=0.15)\n",
"\n",
"    # defining the model\n",
"    model = Sequential()\n",
"\n",
"    embedding = Embedding(num_words, 100, embeddings_initializer=Constant(embedding_matrix),\n",
"                          input_length=MAX_LEN, trainable=False)\n",
"\n",
"    model.add(embedding)\n",
"    model.add(SpatialDropout1D(0.2))\n",
"    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\n",
"    model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"    optimizer = Adam(learning_rate=1e-5)\n",
"\n",
"    # compiling the classifier with the Adam optimizer\n",
"    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])\n",
"\n",
"    # fitting the model\n",
"    history = model.fit(X_train, y_train, batch_size=4, epochs=5, validation_data=(X_test, y_test), verbose=2)\n",
"\n",
"    # evaluating the model\n",
"    test_loss, test_acc = model.evaluate(np.array(X_test), np.array(y_test), verbose=0)\n",
"    print(\"Test_loss: {}, Test_accuracy: {}\".format(test_loss, test_acc))\n",
"\n",
"    # creating the model directory\n",
"    os.makedirs(model_path, exist_ok=True)\n",
"\n",
"    # saving the model\n",
"    model.save(f'{model_path}/model.h5')\n",
"\n",
"    # dumping the evaluation metrics\n",
"    with open(f'{model_path}/values', 'wb') as f:\n",
"        pickle.dump((test_loss, test_acc), f)\n",
"    print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"model_building_step = kfp.components.create_component_from_func(model_building_and_training, packages_to_install=['pandas', 'zipfile36', 'wget', 'tqdm', 'keras', 'numpy', 'tensorflow', 'scikit-learn', 'pickle5'], output_component_file='model_creation_component.yaml')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"@dsl.pipeline(\n",
"    name='Trial pipeline',\n",
"    description='An example NLP pipeline that downloads the disaster-tweets data, cleans it, builds GloVe embeddings and trains an LSTM classifier.'\n",
")\n",
"def trial_pipeline(\n",
"    download_link: str,\n",
"    data_path: str,\n",
"    load_data_path: str,\n",
"    preprocess_data_path: str,\n",
"    corpus_path: str,\n",
"    download_link_glove: str,\n",
"    model_path: str,\n",
"):\n",
"    download_container = download_op(download_link)\n",
"    output1 = load_data_step(download_container.output)\n",
"    output2 = preprocess_data_step(output1.output)\n",
"    output3 = corpus_creation_step(output2.output)\n",
"    output4 = embedding_creation_step(download_link_glove, output3.output)\n",
"    output5 = model_building_step(output4.output)"
]
},
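{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of (or in addition to) submitting the run straight from the SDK in the cells below, the pipeline can be compiled into a portable package and uploaded through the Kubeflow Pipelines UI. A minimal sketch with the v1 SDK compiler; the output filename is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: compile the pipeline function to a reusable package that can be\n",
"# uploaded via the Kubeflow Pipelines UI. The filename is arbitrary.\n",
"kfp.compiler.Compiler().compile(trial_pipeline, 'trial_pipeline.yaml')"
]
},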
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# replace download_link with the repo link where the data is stored: https://github-repo/data-dir/{file}.csv.zip?raw=true\n",
"download_link = 'https://github.com/AnkitRai-22/natural-language-processing-with-disaster-tweets-kaggle-competition/blob/main/data/{file}.csv.zip?raw=true'\n",
"data_path = \"/mnt\"\n",
"load_data_path = \"load\"\n",
"preprocess_data_path = \"preprocess\"\n",
"corpus_path = \"corpus\"\n",
"download_link_glove = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n",
"model_path = \"model\""
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/experiments/details/6954af44-cdf8-47d7-bda1-211cbabdedec\" target=\"_blank\" >Experiment details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<a href=\"/pipeline/#/runs/details/ba2fe0ed-6351-4b8c-bd7b-649fea61a5d9\" target=\"_blank\" >Run details</a>."
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"RunPipelineResult(run_id=ba2fe0ed-6351-4b8c-bd7b-649fea61a5d9)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client = kfp.Client()\n",
"arguments = {\"download_link\": download_link,\n",
"             \"data_path\": data_path,\n",
"             \"load_data_path\": load_data_path,\n",
"             \"preprocess_data_path\": preprocess_data_path,\n",
"             \"corpus_path\": corpus_path,\n",
"             \"download_link_glove\": download_link_glove,\n",
"             \"model_path\": model_path,\n",
"             }\n",
"client.create_run_from_pipeline_func(trial_pipeline, arguments=arguments)"
]
},
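{
"cell_type": "markdown",
"metadata": {},
"source": [
"`create_run_from_pipeline_func` returns immediately while the run executes in the cluster. If you want the notebook to block until the run finishes, the v1 SDK client can poll for completion. A minimal sketch; note it submits a new run, and the timeout (in seconds) is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: submit a run and block until it completes (this starts a new run;\n",
"# the 3600-second timeout is arbitrary).\n",
"run_result = client.create_run_from_pipeline_func(trial_pipeline, arguments=arguments)\n",
"run_detail = client.wait_for_run_completion(run_result.run_id, timeout=3600)\n",
"print(run_detail.run.status)"
]
},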
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"kubeflow_notebook": {
"autosnapshot": true,
"docker_image": "",
"experiment": {
"id": "",
"name": ""
},
"experiment_name": "",
"katib_metadata": {
"algorithm": {
"algorithmName": "grid"
},
"maxFailedTrialCount": 3,
"maxTrialCount": 12,
"objective": {
"objectiveMetricName": "",
"type": "minimize"
},
"parallelTrialCount": 3,
"parameters": []
},
"katib_run": false,
"pipeline_description": "",
"pipeline_name": "",
"snapshot_volumes": true,
"steps_defaults": [
"label:access-ml-pipeline:true",
"label:access-rok:true"
],
"volume_access_mode": "rwm",
"volumes": [
{
"annotations": [],
"mount_point": "/home/jovyan",
"name": "nlp-getting-started-vanilla-final-workspace-wwnkd",
"size": 14,
"size_type": "Gi",
"snapshot": false,
"type": "clone"
}
]
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}