# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/sentiment-analysis-of-tweets/test_samples.txt
/kaggle/input/sentiment-analysis-of-tweets/train.txt
#importing all the libraries
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,LSTM,Embedding,Dropout,Activation,BatchNormalization
from keras.layers import Bidirectional,GlobalMaxPool1D,GlobalAvgPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints , optimizers, layers
from keras.utils import to_categorical
#getting the train data
train = pd.read_csv('/kaggle/input/sentiment-analysis-of-tweets/train.txt')
#loading the test data
test = pd.read_csv('/kaggle/input/sentiment-analysis-of-tweets/test_samples.txt')
#display the first 5 rows of train
train.head()
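#optional sanity check (not in the original notebook): inspect the class balance
#of the training labels; this uses the 'sentiment' column that the one-hot
#encoding below is built from
train['sentiment'].value_counts()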
#one hot encoding the labels
df = pd.concat([train,pd.get_dummies(train['sentiment'])],axis=1)
#df.head()
train_data = df['tweet_text']
#train_data.head()
test_data = test['tweet_text']
test_data.head()
0    @jjuueellzz down in the Atlantic city, ventnor...
1    Musical awareness: Great Big Beautiful Tomorro...
2    On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3    Kapan sih lo ngebuktiin,jan ngomong doang Susa...
4    Excuse the connectivity of this live stream, f...
Name: tweet_text, dtype: object
#creating the array of one-hot labels, aligned row by row with their respective texts
classes = ['neutral' , 'negative' , 'positive']
y = df[classes].values
y
array([[0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       ...,
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0]], dtype=uint8)
#checking for null values in train and test data
train.isnull().any()
test.isnull().any()
tweet_id      False
tweet_text    False
dtype: bool
#configuration parameters
LATENT_DIM_DECODER = 400
BATCH_SIZE = 128
EPOCHS = 20
LATENT_DIM = 400
NUM_SAMPLES = 10000
MAX_SEQUENCE_LEN = 1000
MAX_NUM_WORDS = 100000
EMBEDDING_DIM = 300
#NLTK python library for preprocessing
import nltk
#nltk.download('wordnet')
#for tokenization
from nltk.tokenize import RegexpTokenizer
#for stemming and lemmatization
from nltk.stem import WordNetLemmatizer,PorterStemmer
#for removing stopwords
from nltk.corpus import stopwords
#importing regex library of python
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
#function for performing all preprocessing steps at once
def preprocess(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('{html}', "")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '', cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    filtered_words = [w for w in tokens]  # stopword removal disabled: if not w in stopwords.words('english')
    stem_words = [stemmer.stem(w) for w in filtered_words]
    lemma_words = [lemmatizer.lemmatize(w) for w in stem_words]
    # note: the stemmed/lemmatized forms are computed but not used here;
    # the unstemmed tokens are returned, matching the cleaned text shown below
    return " ".join(filtered_words)
#add a column of preprocessed text to the dataframe
df['cleanText']=train_data.map(lambda s:preprocess(s))
test['clean_text']=test['tweet_text'].map(lambda s:preprocess(s))
test_final = test['clean_text']
test_final
0       jjuueellzz down in the atlantic city ventnor m...
1       musical awareness great big beautiful tomorrow...
2       on radio fm fri oct labour analyst shawn hatti...
3       kapan sih lo ngebuktiin jan ngomong doang susa...
4       excuse the connectivity of this live stream fr...
                              ...
5393    it s a wednesday girls night out as s band wil...
5394    night college course sorted just have to enrol...
5395    for the st time in years for your splendiferou...
5396    nurses day may nursing the heart beat of the h...
5397    we have minutes left until the nd episode of s...
Name: clean_text, Length: 5398, dtype: object
from keras.preprocessing.text import Tokenizer
#breaking the sentences into unique words/tokens
#keeping at most 40k of the most frequent tokens
train_final = df['cleanText']
max_feat = 40000
#tokenize each sentence into a list of words
tokenizer = Tokenizer(num_words=max_feat)  #setting up the tokenizer
#fitting the tokenizer on our training data
tokenizer.fit_on_texts(list(train_final))
train_final
0        gas by my house hit i um going to chapel hill ...
1        theo walcott is still shit uc watch rafa and j...
2        its not that i um a gsp fan uc i just hate nic...
3        iranian general says israel us iron dome can u...
4        tehran uc mon amour obama tried to establish t...
                               ...
21460    the day after newark ill be able to say i met ...
21461    fec hold farewell session for seven ministers ...
21462    luca di montezemolo who s last day was monday ...
21463    coffee is pretty much the answer to all questi...
21464    niki lauda just confirmed to sky that alonso w...
Name: cleanText, Length: 21465, dtype: object
#converting text into sequences of word indices to feed into the neural network
#note: the test text is converted with the same tokenizer that was fit on the
#training data, so its word indices line up with the training vocabulary and
#the embedding matrix built below
sequence_train = tokenizer.texts_to_sequences(train_final)
sequence_test = tokenizer.texts_to_sequences(test_final)
# get the word to index mapping for input language
word2idx_inputs = tokenizer.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))
Found 34302 unique input tokens.
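#optional peek (not in the original notebook) at the most frequent tokens the
#tokenizer has seen, sorted by raw count
print(sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10])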
#LOADING PRETRAINED WORD VECTORS
# store all the pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open('/kaggle/glove.6B.300d.txt', encoding="utf8") as f:
    # the GloVe file is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))
#EMBEDDING MATRIX
# prepare embedding matrix of words for embedding layer
print('Filling pre-trained embeddings...')
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    if i < MAX_NUM_WORDS:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be left as all zeros
            embedding_matrix[i] = embedding_vector
Filling pre-trained embeddings...
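#optional check (not in the original notebook): how much of the vocabulary
#actually received a pre-trained GloVe vector (the rest stays all-zero)
covered = sum(1 for w in word2idx_inputs if w in word2vec)
print('GloVe covers %d of %d vocabulary words' % (covered, len(word2idx_inputs)))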
seq_lens = [len(s) for s in sequence_train]
print(max(seq_lens))
1074
from keras.preprocessing.sequence import pad_sequences
#scaling all the sequences to a fixed length
#the input dimension of the LSTM layer has to be constant
#each tweet sequence is scaled to a fixed length of 1000
#tweets shorter than 1000 tokens are padded with zeros to reach that length
max_len = 1000
#pad the train and test sequences to a fixed length (in Keras the LSTM input must be a fixed-length sequence)
x_train=pad_sequences(sequence_train,maxlen=max_len)
x_test=pad_sequences(sequence_test,maxlen=max_len)
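#quick shape check (not in the original notebook): both matrices should now be
#(num_samples, max_len), i.e. (21465, 1000) for train and (5398, 1000) for test
print(x_train.shape, x_test.shape, y.shape)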
from keras.layers import Embedding
# create embedding layer
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=True
)
%matplotlib inline
from matplotlib import pyplot as plt
len_words = [len(words) for words in sequence_train]
#distribution of sequence lengths
plt.hist(len_words, bins = np.arange(0,400,10))
plt.show()
# we can see that most of the tweets have fewer than 50 words
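#backing up the observation above (not in the original notebook): the share of
#tweets with at most 50 tokens after cleaning
print('%.1f%% of tweets have <= 50 tokens' % (100.0 * np.mean(np.array(len_words) <= 50)))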
from keras.layers import Input
input_layer = Input(shape=(max_len,))
#feeding the input layer into the embedding layer, which converts each
#sequence of word indices into a sequence of word vectors so the network
#can pick up the relevance and context of a particular word
embed_layer = embedding_layer(input_layer)
import tensorflow as tf
from keras.layers.recurrent import LSTM
#passing the embedded sequences through two stacked bidirectional LSTM layers
LSTM_layer = tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True, name='BI_lstm_layer'))(embed_layer)
sec_LSTM_layer = tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True, name='BI2_lstm_layer'))(LSTM_layer)
batchnorm = BatchNormalization()(sec_LSTM_layer)
#dimension reduction using a global average pooling layer
red_dim_layer = tf.keras.layers.GlobalAvgPool1D()(batchnorm)
#adding a dropout layer for better generalization
#setting the rate to 0.55, which means 55% of the nodes are randomly disabled during training
drop_layer = Dropout(0.55)(red_dim_layer)
#densely connected layer
dense1 = Dense(128,activation='elu')(drop_layer)
batchnorm2 = BatchNormalization()(dense1)
dense2 = Dense(128,activation='elu')(batchnorm2)
batchnorm3 = BatchNormalization()(dense2)
dense3 = Dense(128,activation='elu')(batchnorm3)
#adding another dropout layer
drop_layer2 = Dropout(0.55)(dense3)
#adding the output dense layer with softmax activation to get the class probabilities
#3 classes as output
output_dense = Dense(3, activation='softmax')(drop_layer2)
#connecting the inputs and outputs to create a model and compiling the model
from keras.optimizers import Adagrad,Adam,RMSprop
model = Model(inputs=input_layer, outputs=output_dense)
model.compile(loss = 'categorical_crossentropy',
optimizer = RMSprop(lr=0.001),
metrics = ['accuracy'])
model.summary()
Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 1000)] 0 _________________________________________________________________ embedding (Embedding) (None, 1000, 300) 10290900 _________________________________________________________________ bidirectional (Bidirectional (None, 1000, 512) 1140736 _________________________________________________________________ bidirectional_1 (Bidirection (None, 1000, 512) 1574912 _________________________________________________________________ batch_normalization (BatchNo (None, 1000, 512) 2048 _________________________________________________________________ global_average_pooling1d (Gl (None, 512) 0 _________________________________________________________________ dropout (Dropout) (None, 512) 0 _________________________________________________________________ dense (Dense) (None, 128) 65664 _________________________________________________________________ batch_normalization_1 (Batch (None, 128) 512 _________________________________________________________________ dense_1 (Dense) (None, 128) 16512 _________________________________________________________________ batch_normalization_2 (Batch (None, 128) 512 _________________________________________________________________ dense_2 (Dense) (None, 128) 16512 _________________________________________________________________ dropout_1 (Dropout) (None, 128) 0 _________________________________________________________________ dense_3 (Dense) (None, 3) 387 ================================================================= Total params: 13,108,695 Trainable params: 13,107,159 Non-trainable params: 1,536 _________________________________________________________________
#Fitting the model
batch_size=64
epochs = 30
model.fit(x_train,y,batch_size=batch_size,epochs = epochs,validation_split=0.2)
Epoch 1/30 - 164s 609ms/step - loss: 1.0977 - accuracy: 0.4971 - val_loss: 1.9855 - val_accuracy: 0.3988
Epoch 2/30 - 162s 602ms/step - loss: 0.7904 - accuracy: 0.6585 - val_loss: 8.8136 - val_accuracy: 0.4281
Epoch 3/30 - 162s 601ms/step - loss: 0.6312 - accuracy: 0.7466 - val_loss: 28.4012 - val_accuracy: 0.3988
Epoch 4/30 - 162s 602ms/step - loss: 0.5122 - accuracy: 0.8026 - val_loss: 32.2878 - val_accuracy: 0.4281
Epoch 5/30 - 162s 603ms/step - loss: 0.4010 - accuracy: 0.8530 - val_loss: 22.5790 - val_accuracy: 0.3990
Epoch 6/30 - 162s 602ms/step - loss: 0.3176 - accuracy: 0.8894 - val_loss: 36.6653 - val_accuracy: 0.1731
Epoch 7/30 - 162s 602ms/step - loss: 0.2551 - accuracy: 0.9094 - val_loss: 16.2503 - val_accuracy: 0.1731
Epoch 8/30 - 162s 602ms/step - loss: 0.2035 - accuracy: 0.9302 - val_loss: 9.0825 - val_accuracy: 0.4281
Epoch 9/30 - 162s 602ms/step - loss: 0.1675 - accuracy: 0.9442 - val_loss: 8.4932 - val_accuracy: 0.4298
Epoch 10/30 - 162s 601ms/step - loss: 0.1493 - accuracy: 0.9499 - val_loss: 7.6688 - val_accuracy: 0.4030
Epoch 11/30 - 162s 602ms/step - loss: 0.1296 - accuracy: 0.9580 - val_loss: 3.0278 - val_accuracy: 0.2786
Epoch 12/30 - 162s 600ms/step - loss: 0.1127 - accuracy: 0.9630 - val_loss: 3.5251 - val_accuracy: 0.3084
Epoch 13/30 - 162s 601ms/step - loss: 0.1118 - accuracy: 0.9636 - val_loss: 3.1608 - val_accuracy: 0.4642
Epoch 14/30 - 162s 603ms/step - loss: 0.0992 - accuracy: 0.9686 - val_loss: 1.9886 - val_accuracy: 0.5153
Epoch 15/30 - 162s 602ms/step - loss: 0.0879 - accuracy: 0.9723 - val_loss: 3.5870 - val_accuracy: 0.3450
Epoch 16/30 - 162s 601ms/step - loss: 0.0818 - accuracy: 0.9746 - val_loss: 7.3300 - val_accuracy: 0.4223
Epoch 17/30 - 162s 602ms/step - loss: 0.0752 - accuracy: 0.9773 - val_loss: 15.3390 - val_accuracy: 0.1731
Epoch 18/30 - 162s 601ms/step - loss: 0.0668 - accuracy: 0.9790 - val_loss: 2.5116 - val_accuracy: 0.5236
Epoch 19/30 - 162s 601ms/step - loss: 0.0665 - accuracy: 0.9790 - val_loss: 7.4534 - val_accuracy: 0.5022
Epoch 20/30 - 162s 601ms/step - loss: 0.0641 - accuracy: 0.9797 - val_loss: 5.5781 - val_accuracy: 0.4985
Epoch 21/30 - 162s 601ms/step - loss: 0.0575 - accuracy: 0.9826 - val_loss: 4.2691 - val_accuracy: 0.3487
Epoch 23/30 - 162s 601ms/step - loss: 0.0547 - accuracy: 0.9842 - val_loss: 3.4779 - val_accuracy: 0.4540
Epoch 24/30 - 162s 601ms/step - loss: 0.0531 - accuracy: 0.9843 - val_loss: 3.9039 - val_accuracy: 0.4642
Epoch 25/30 - 161s 600ms/step - loss: 0.0478 - accuracy: 0.9863 - val_loss: 3.6402 - val_accuracy: 0.4999
Epoch 26/30 - 162s 601ms/step - loss: 0.0530 - accuracy: 0.9841 - val_loss: 3.6919 - val_accuracy: 0.4854
Epoch 27/30 - 161s 599ms/step - loss: 0.0460 - accuracy: 0.9871 - val_loss: 2.6603 - val_accuracy: 0.5157
Epoch 28/30 - 161s 600ms/step - loss: 0.0465 - accuracy: 0.9870 - val_loss: 3.4747 - val_accuracy: 0.5073
Epoch 29/30 - 161s 600ms/step - loss: 0.0416 - accuracy: 0.9883 - val_loss: 4.9956 - val_accuracy: 0.5183
Epoch 30/30 - 161s 599ms/step - loss: 0.0426 - accuracy: 0.9877 - val_loss: 9.7257 - val_accuracy: 0.1938
<tensorflow.python.keras.callbacks.History at 0x7f27a8051690>
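#a minimal sketch (not part of the original notebook) of turning the trained
#model into per-tweet predictions for the test set; the submission column
#names and file name are assumptions, while 'tweet_id' comes from the test dataframe
pred_probs = model.predict(x_test, batch_size=batch_size)
pred_labels = [classes[i] for i in pred_probs.argmax(axis=1)]
submission = pd.DataFrame({'tweet_id': test['tweet_id'], 'sentiment': pred_labels})
submission.to_csv('submission.csv', index=False)
submission.head()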