{"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.0"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Basic Intro \n\nIn this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which one’s aren’t.\n\n","metadata":{}},{"cell_type":"markdown","source":"## What's in this kernel?\n- Basic EDA\n- Data Cleaning\n- Baseline Model","metadata":{}},{"cell_type":"markdown","source":"### Importing required Libraries.","metadata":{}},{"cell_type":"code","source":"import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nfrom nltk.corpus import stopwords\nfrom nltk.util import ngrams\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom collections import defaultdict\nfrom collections import Counter\nplt.style.use('ggplot')\nstop=set(stopwords.words('english'))\nimport re\nfrom nltk.tokenize import word_tokenize\nimport gensim\nimport string\nfrom keras.preprocessing.text import Tokenizer\nfrom keras.preprocessing.sequence import pad_sequences\nfrom tqdm import tqdm\nfrom keras.models import Sequential\nfrom keras.layers import Embedding,LSTM,Dense,SpatialDropout1D\nfrom keras.initializers import Constant\nfrom sklearn.model_selection import train_test_split\nfrom keras.optimizers import Adam\n\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import os\n#os.listdir('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Loading the data and getting basic idea ","metadata":{}},{"cell_type":"code","source":"tweet= pd.read_csv('../input/nlp-getting-started/train.csv')\ntest=pd.read_csv('../input/nlp-getting-started/test.csv')\ntweet.head(3)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print('There are {} rows and {} columns in train'.format(tweet.shape[0],tweet.shape[1]))\nprint('There are {} rows and {} columns in train'.format(test.shape[0],test.shape[1]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Class distribution","metadata":{}},{"cell_type":"markdown","source":"Before we begin with anything else,let's check the class distribution.There are only two classes 0 and 1.","metadata":{}},{"cell_type":"code","source":"x=tweet.target.value_counts()\nsns.barplot(x.index,x)\nplt.gca().set_ylabel('samples')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"ohh,as expected ! 
,{"cell_type":"markdown","source":"Oh, as expected! There is a class imbalance: there are more tweets with class 0 (no disaster) than with class 1 (disaster tweets).","metadata":{}},{"cell_type":"markdown","source":"## Exploratory Data Analysis of tweets","metadata":{}},{"cell_type":"markdown","source":"First, we will do a very basic analysis at the character, word and sentence level.","metadata":{}},{"cell_type":"markdown","source":"### Number of characters in tweets","metadata":{}},{"cell_type":"code","source":"fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))\ntweet_len=tweet[tweet['target']==1]['text'].str.len()\nax1.hist(tweet_len,color='red')\nax1.set_title('disaster tweets')\ntweet_len=tweet[tweet['target']==0]['text'].str.len()\nax2.hist(tweet_len,color='green')\nax2.set_title('Not disaster tweets')\nfig.suptitle('Characters in tweets')\nplt.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"The distributions of the two classes look almost the same: 120 to 140 characters per tweet is the most common length in both.","metadata":{}},{"cell_type":"markdown","source":"### Number of words in a tweet","metadata":{}},{"cell_type":"code","source":"fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))\ntweet_len=tweet[tweet['target']==1]['text'].str.split().map(lambda x: len(x))\nax1.hist(tweet_len,color='red')\nax1.set_title('disaster tweets')\ntweet_len=tweet[tweet['target']==0]['text'].str.split().map(lambda x: len(x))\nax2.hist(tweet_len,color='green')\nax2.set_title('Not disaster tweets')\nfig.suptitle('Words in a tweet')\nplt.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Average word length in a tweet","metadata":{}},{"cell_type":"code","source":"fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))\nword=tweet[tweet['target']==1]['text'].str.split().apply(lambda x : [len(i) for i in x])\nsns.distplot(word.map(lambda x: np.mean(x)),ax=ax1,color='red')\nax1.set_title('disaster')\nword=tweet[tweet['target']==0]['text'].str.split().apply(lambda x : [len(i) for i in x])\nsns.distplot(word.map(lambda x: np.mean(x)),ax=ax2,color='green')\nax2.set_title('Not disaster')\nfig.suptitle('Average word length in each tweet')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def create_corpus(target):\n    # Collect all whitespace-separated tokens from tweets with the given target label\n    corpus=[]\n    for x in tweet[tweet['target']==target]['text'].str.split():\n        for i in x:\n            corpus.append(i)\n    return corpus","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Common stopwords in tweets","metadata":{}},{"cell_type":"markdown","source":"First, we will analyze tweets with class 0.","metadata":{}},{"cell_type":"code","source":"corpus=create_corpus(0)\n\n# Count how often each stopword appears in the class 0 corpus\ndic=defaultdict(int)\nfor word in corpus:\n    if word in stop:\n        dic[word]+=1\n\ntop=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"x,y=zip(*top)\nplt.bar(x,y)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Now, we will analyze tweets with class 1.","metadata":{}},{"cell_type":"code","source":"corpus=create_corpus(1)\n\ndic=defaultdict(int)\nfor word in corpus:\n    if word in stop:\n        dic[word]+=1\n\ntop=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]\n\nx,y=zip(*top)\nplt.bar(x,y)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}
,{"cell_type":"markdown","source":"In both of them, \"the\" dominates, followed by \"a\" in class 0 and \"in\" in class 1.","metadata":{}},{"cell_type":"markdown","source":"### Analyzing punctuation","metadata":{}},{"cell_type":"markdown","source":"First, let's check tweets indicating a real disaster.","metadata":{}},{"cell_type":"code","source":"plt.figure(figsize=(10,5))\ncorpus=create_corpus(1)\n\n# Count punctuation characters that appear as standalone tokens\ndic=defaultdict(int)\nspecial = string.punctuation\nfor i in corpus:\n    if i in special:\n        dic[i]+=1\n\nx,y=zip(*dic.items())\nplt.bar(x,y)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Now, we will move on to class 0.","metadata":{}},{"cell_type":"code","source":"plt.figure(figsize=(10,5))\ncorpus=create_corpus(0)\n\ndic=defaultdict(int)\nspecial = string.punctuation\nfor i in corpus:\n    if i in special:\n        dic[i]+=1\n\nx,y=zip(*dic.items())\nplt.bar(x,y,color='green')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Common words?","metadata":{}},{"cell_type":"code","source":"counter=Counter(corpus)\nmost=counter.most_common()\nx=[]\ny=[]\nfor word,count in most[:40]:\n    if word not in stop:\n        x.append(word)\n        y.append(count)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sns.barplot(x=y,y=x)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"A lot of cleaning is needed!","metadata":{}},{"cell_type":"markdown","source":"### Ngram analysis","metadata":{}},{"cell_type":"markdown","source":"We will do a bigram (n=2) analysis over the tweets. Let's check the most common bigrams.","metadata":{}},{"cell_type":"code","source":"def get_top_tweet_bigrams(corpus, n=None):\n    # Count all bigrams in the corpus and return the n most frequent ones\n    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)\n    bag_of_words = vec.transform(corpus)\n    sum_words = bag_of_words.sum(axis=0)\n    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]\n    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)\n    return words_freq[:n]","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"plt.figure(figsize=(10,5))\ntop_tweet_bigrams=get_top_tweet_bigrams(tweet['text'])[:10]\nx,y=map(list,zip(*top_tweet_bigrams))\nsns.barplot(x=y,y=x)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"We will need a lot of cleaning here.","metadata":{}},{"cell_type":"markdown","source":"## Data Cleaning\nAs we know, tweets always have to be cleaned before we move on to modelling. So we will do some basic cleaning such as spelling correction, removing punctuation, removing HTML tags and emojis, etc. Let's start.","metadata":{}},{"cell_type":"code","source":"df=pd.concat([tweet,test])\ndf.shape","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Removing URLs","metadata":{}},{"cell_type":"code","source":"example=\"New competition launched :https://www.kaggle.com/c/nlp-getting-started\"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def remove_URL(text):\n    url = re.compile(r'https?://\\S+|www\\.\\S+')\n    return url.sub(r'',text)\n\nremove_URL(example)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['text']=df['text'].apply(lambda x : remove_URL(x))","metadata":{"trusted":true},"execution_count":null,"outputs":[]}
,{"cell_type":"markdown","source":"### Removing HTML tags","metadata":{}},{"cell_type":"code","source":"example = \"\"\"<div>\n<h1>Real or Fake</h1>\n<p>Kaggle </p>\n<a href=\"https://www.kaggle.com/c/nlp-getting-started\">getting started</a>\n</div>\"\"\"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def remove_html(text):\n    html=re.compile(r'<.*?>')\n    return html.sub(r'',text)\n\nprint(remove_html(example))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['text']=df['text'].apply(lambda x : remove_html(x))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Removing Emojis","metadata":{}},{"cell_type":"code","source":"# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b\ndef remove_emoji(text):\n    emoji_pattern = re.compile(\"[\"\n        u\"\\U0001F600-\\U0001F64F\"  # emoticons\n        u\"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n        u\"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n        u\"\\U0001F1E0-\\U0001F1FF\"  # flags (iOS)\n        u\"\\U00002702-\\U000027B0\"\n        u\"\\U000024C2-\\U0001F251\"\n        \"]+\", flags=re.UNICODE)\n    return emoji_pattern.sub(r'', text)\n\nremove_emoji(\"Omg another Earthquake 😔😔\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['text']=df['text'].apply(lambda x: remove_emoji(x))\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Removing punctuation","metadata":{}},{"cell_type":"code","source":"def remove_punct(text):\n    table=str.maketrans('','',string.punctuation)\n    return text.translate(table)\n\nexample=\"I am a #king\"\nprint(remove_punct(example))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df['text']=df['text'].apply(lambda x : remove_punct(x))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Spelling Correction\n","metadata":{}},{"cell_type":"markdown","source":"Even if I'm not good at spelling, I can correct it with Python :) I will use `pyspellchecker` to do that.","metadata":{}},{"cell_type":"code","source":"!pip install pyspellchecker","metadata":{"_kg_hide-output":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from spellchecker import SpellChecker\n\nspell = SpellChecker()\ndef correct_spellings(text):\n    # Replace words the spell checker does not recognise with its best correction\n    corrected_text = []\n    misspelled_words = spell.unknown(text.split())\n    for word in text.split():\n        if word in misspelled_words:\n            corrected_text.append(spell.correction(word))\n        else:\n            corrected_text.append(word)\n    return \" \".join(corrected_text)\n\ntext = \"corect me plese\"\ncorrect_spellings(text)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Optional: apply spelling correction to the whole dataset (left commented out here)\n#df['text']=df['text'].apply(lambda x : correct_spellings(x))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## GloVe for Vectorization","metadata":{}},{"cell_type":"markdown","source":"Here we will use pretrained GloVe vectors to represent our words. The glove.6B vectors come in 50-, 100-, 200- and 300-dimensional variants; we will use the 100-dimensional ones here.","metadata":{}},{"cell_type":"code","source":"def create_corpus(df):\n    # Tokenize each tweet, keeping lowercased alphabetic tokens that are not stopwords\n    corpus=[]\n    for tweet in tqdm(df['text']):\n        words=[word.lower() for word in word_tokenize(tweet) if word.isalpha() and word not in stop]\n        corpus.append(words)\n    return corpus","metadata":{"trusted":true},"execution_count":null,"outputs":[]}
","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"corpus=create_corpus(df)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"embedding_dict={}\nwith open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r') as f:\n for line in f:\n values=line.split()\n word=values[0]\n vectors=np.asarray(values[1:],'float32')\n embedding_dict[word]=vectors\nf.close()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"MAX_LEN=50\ntokenizer_obj=Tokenizer()\ntokenizer_obj.fit_on_texts(corpus)\nsequences=tokenizer_obj.texts_to_sequences(corpus)\n\ntweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"word_index=tokenizer_obj.word_index\nprint('Number of unique words:',len(word_index))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"num_words=len(word_index)+1\nembedding_matrix=np.zeros((num_words,100))\n\nfor word,i in tqdm(word_index.items()):\n if i > num_words:\n continue\n \n emb_vec=embedding_dict.get(word)\n if emb_vec is not None:\n embedding_matrix[i]=emb_vec\n ","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Baseline Model","metadata":{}},{"cell_type":"code","source":"model=Sequential()\n\nembedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),\n input_length=MAX_LEN,trainable=False)\n\nmodel.add(embedding)\nmodel.add(SpatialDropout1D(0.2))\nmodel.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\nmodel.add(Dense(1, activation='sigmoid'))\n\n\noptimzer=Adam(learning_rate=1e-5)\n\nmodel.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])\n\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.summary()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train=tweet_pad[:tweet.shape[0]]\ntest=tweet_pad[tweet.shape[0]:]","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15)\nprint('Shape of train',X_train.shape)\nprint(\"Shape of Validation \",X_test.shape)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Making our submission","metadata":{}},{"cell_type":"code","source":"sample_sub=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_pre=model.predict(test)\ny_pre=np.round(y_pre).astype(int).reshape(3263)\nsub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})\nsub.to_csv('submission.csv',index=False)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sub.head()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"<font size='5' color='red'> if you like this kernel,please do an 
,{"cell_type":"markdown","source":"<font size='5' color='red'>If you like this kernel, please upvote.</font>","metadata":{"trusted":true}}]}