# fyp_structured_sentiment_analysis.py

# -*- coding: utf-8 -*-
"""FYP Structured sentiment analysis.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QaQA8EPvuV2CpKmEPft7TxgyQ53Xpp4d
"""

# Notebook shell escapes: the leading "!" is IPython/Colab syntax, so these
# lines are not valid in a plain Python interpreter.
# Install the +cu113 builds of torch, torchvision and torchaudio from the PyTorch wheel index.
!pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

# Install transformers, pandas and numpy.
!pip install transformers pandas numpy

# Install nltk.
!pip3 install nltk

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
import nltk

"""Uploading train data.csv file """

# Colab helper module; also used further down for files.download(...).
from google.colab import files

# Ask for files to upload into the runtime. Presumably this is where the CSVs
# read below by relative path come from -- TODO confirm.
uploaded = files.upload()

"""Preprocessing test data"""

# Load the test split and print its schema before any trimming.
dftest = pd.read_csv("Test_Data.csv")
dftest.info()

# Drop the columns at positions 36..98. Passing the column labels themselves
# (rather than a DataFrame slice that pandas has to iterate) states the intent
# directly; an out-of-range slice is simply empty.
dftest = dftest.drop(columns=dftest.columns[36:99])

# Drop the sentence id and the listed per-opinion columns. Every label must
# exist in the frame, otherwise pandas raises KeyError.
dftest = dftest.drop(
    columns=[
        'sent_id',
        'opinions/0/Target/1/0',
        'opinions/0/Polar_expression/1/0',
        'opinions/1/Target/1/0',
        'opinions/1/Polar_expression/1/0',
        'opinions/2/Target/1/0',
        'opinions/2/Polar_expression/1/0',
        'opinions/3/Target/1/0',
        'opinions/3/Polar_expression/1/0',
        'opinions/4/Target/1/0',
        'opinions/4/Polar_expression/1/0',
        'opinions/12/Intensity',
        'opinions/0/Source/1/0',
        'opinions/1/Source/1/0',
    ],
)

# Schema after trimming.
dftest.info()

"""Preprocessing train data"""

# Load the training split from the hosted CSV. This is the only source of `df`
# for everything below, so no separate local copy is read first.
url = 'https://raw.githubusercontent.com/anasamin26/fyp/main/Train_Data.csv'
df = pd.read_csv(url)
df.info()

# Drop the columns at positions 36..98 by label (an out-of-range slice is
# simply empty).
df = df.drop(columns=df.columns[36:99])

# Drop the sentence id and the listed per-opinion columns. Every label must
# exist in the frame, otherwise pandas raises KeyError.
df = df.drop(
    columns=[
        'sent_id',
        'opinions/0/Target/1/0',
        'opinions/0/Polar_expression/1/0',
        'opinions/1/Target/1/0',
        'opinions/1/Polar_expression/1/0',
        'opinions/2/Target/1/0',
        'opinions/2/Polar_expression/1/0',
        'opinions/3/Target/1/0',
        'opinions/3/Polar_expression/1/0',
        'opinions/4/Target/1/0',
        'opinions/0/Source/1/0',
        'opinions/1/Source/1/0',
    ],
)
df.info()

# Notebook-style display of the label column; it has no visible effect when
# the file runs as a script, but still fails fast if the column is missing.
df['opinions/0/Polarity']

"""Label Encoding"""

from sklearn import preprocessing

# Label lookup table. Nothing in the visible code reads it; the encoding below
# is applied with explicit replace() calls instead.
mappin = {'Positive': 1, 'Negative': 0, "NaN": -1}

# Encode the 'opinions/0/Polarity' column in one chain:
# missing -> 3, "Positive" -> 1, "Negative" -> 0.
df['opinions/0/Polarity'] = (
    df['opinions/0/Polarity']
    .fillna(3)
    .replace("Positive", 1)
    .replace("Negative", 0)
)

"""# **Naive Bayes Classifier**"""

import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Fit a TF-IDF vectorizer on the training text and export it. Only the fitted
# state is needed here, so no dense feature matrix is materialised.
vec = TfidfVectorizer(stop_words='english')
vec.fit(df['text'])
# Use a context manager so the pickle file is flushed and closed right away.
# Note: the same file name is written again below with the CountVectorizer
# vocabulary, so that later dump is the one that survives.
with open("vec_vocab.pkl", "wb") as vocab_file:
    pickle.dump(vec.vocabulary_, vocab_file)
joblib.dump(vec, 'tfidf_vocab.pkl')
files.download('tfidf_vocab.pkl')

# Bag-of-words features. `vec` and `inputTexts` from this step are what the
# Naive Bayes training and prediction below actually use.
vec = CountVectorizer(stop_words='english')
inputTexts = vec.fit_transform(df['text']).toarray()
with open("vec_vocab.pkl", "wb") as vocab_file:
    pickle.dump(vec.vocabulary_, vocab_file)
files.download('vec_vocab.pkl')

# Hold out 25% of the rows for evaluation; random_state=42 fixes the split.
# NOTE(review): the label column still contains the value 3 written by the
# earlier fillna(3) for rows without a polarity, so those rows are trained on
# as a class of their own -- confirm that is intended.
x, x_test, y, y_test = train_test_split(inputTexts,df['opinions/0/Polarity'], test_size=0.25, random_state=42)
naiveBayes = MultinomialNB()

# Fit the multinomial Naive Bayes model on the training split.
naiveBayes.fit(x, y)

# Score on the held-out split. The result is only displayed when this runs as
# a notebook cell; as a script the value is discarded.
naiveBayes.score(x_test,y_test)

# Smoke test: classify one hand-written sentence with the fitted vectorizer.
print(naiveBayes.predict(vec.transform(['This was not a good idea']))[0])

import joblib
# Persist the trained classifier and hand it to the Colab download helper.
joblib.dump(naiveBayes, 'naiveBayes.pkl')
files.download('naiveBayes.pkl')

"""# **LSTM Classifier**"""

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
import tensorflow as tf
from keras.models import Model
import string
import re
import keras
from sklearn.model_selection import train_test_split
import keras.optimizers

# Keep only the rows that carry a real polarity label: 3 is the placeholder
# written by the earlier fillna(3) for missing labels.
input_df = df[df['opinions/0/Polarity'] != 3]

# Factorize the labels of the *filtered* frame so that the codes line up
# row-for-row with the sequences later built from input_df.text. Factorizing
# the unfiltered column would give a label array of a different length
# whenever any row has a missing polarity.
# sentiment_label[0] holds the integer codes, sentiment_label[1] the unique
# label values used to decode a prediction.
sentiment_label = input_df['opinions/0/Polarity'].factorize()

# Lowercase English stop words filtered out by remove_stopwords().
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because",
    "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
    "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
    "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
    "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should",
    "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
    "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
    "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
    "your", "yours", "yourself", "yourselves",
]


def remove_stopwords(df):
    """Add a 'review without stopwords' column to ``df`` and return ``df``.

    Each entry of ``df['text']`` is split on whitespace, tokens that appear
    in the module-level ``stopwords`` list are discarded, and the remaining
    tokens are re-joined with single spaces. The comparison is exact, so
    capitalised tokens or tokens with punctuation attached are kept.

    Args:
        df: DataFrame with a 'text' column of strings. It is modified in
            place.

    Returns:
        The same DataFrame object that was passed in.
    """
    blocked = frozenset(stopwords)

    def _strip(text):
        kept_tokens = [token for token in text.split() if token not in blocked]
        return ' '.join(kept_tokens)

    df['review without stopwords'] = df['text'].apply(_strip)
    return df

def remove_tags(string):
    """Return ``string`` with every ``<...>`` span removed.

    The pattern is non-greedy, so each tag is removed on its own and the
    text between tags is preserved. ``.`` does not match a newline here, so
    a ``<`` and ``>`` that sit on different lines are left untouched.

    Args:
        string: Text that may contain markup tags.

    Returns:
        The text with all matched spans replaced by the empty string.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
    
# Text-cleaning pipeline: stop-word removal -> tag stripping -> punctuation
# replaced by spaces. remove_stopwords() mutates and returns `df`, so
# `data_without_stopwords` is the same object as `df`.
data_without_stopwords = remove_stopwords(df)
data_without_stopwords['clean_review'] = data_without_stopwords['review without stopwords'].apply(remove_tags)

# Build the character class from an escaped copy of string.punctuation so that
# "\", "]", "^" and "-" are matched literally, and request regex mode
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave every punctuation mark in place.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace(punctuation_pattern, ' ', regex=True)

# Review texts in row order. tolist() reads the values positionally, so it
# does not depend on the frame's index labels being exactly 0..n-1.
# NOTE(review): this takes the raw 'text' column, not the 'clean_review'
# column prepared above -- confirm that is intended.
reviews_list = df['text'].tolist()

# Label column aligned with reviews_list (still includes the placeholder 3
# for rows whose polarity was missing).
sentiment = data_without_stopwords['opinions/0/Polarity']

# Sanity check: there should be exactly one label per review.
counter = len(reviews_list)
print(counter)
print(sentiment.size)

# Fit a Keras Tokenizer (configured with num_words=5000) on the review texts.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews_list)

# Word -> integer index mapping of the fitted tokenizer; used below to place
# each word's pre-trained vector in the embedding matrix.
words_to_index = tokenizer.word_index

def read_glove_vector(glove_vec):
    """Load word vectors from a GloVe-style text file.

    Every non-empty line is expected to contain a word followed by its
    whitespace-separated vector components.

    Args:
        glove_vec: Path of the UTF-8 encoded text file to read.

    Returns:
        A dict mapping each word to a 1-D ``np.float64`` array holding its
        vector. If a word occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        for line in f:
            w_line = line.split()
            if not w_line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no vector; skip them instead of crashing.
                continue
            word_to_vec_map[w_line[0]] = np.array(w_line[1:], dtype=np.float64)

    return word_to_vec_map

# Pre-trained 50-dimensional GloVe vectors, keyed by word.
word_to_vec_map = read_glove_vector('glove.6B.50d.txt')

# Fixed sequence length fed to the embedding layer.
maxLen = 150

# Keras Tokenizer word indices start at 1; index 0 is never assigned to a word
# and is what pad_sequences fills with. The embedding table therefore needs
# len(word_index) + 1 rows, with row 0 left as the all-zero padding vector.
vocab_len = len(words_to_index) + 1
# Vector width, read from an arbitrary word expected to be in the GloVe file.
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        # Row i must hold the vector of the word whose token id is i, because
        # the Embedding layer looks rows up by token id. Words missing from
        # GloVe keep their zero row.
        emb_matrix[index, :] = embedding_vector

# Frozen embedding layer initialised with the GloVe matrix.
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

def imdb_rating(input_shape):
    """Assemble the stacked-LSTM model that sits on the shared embedding layer.

    The graph is: ``Input(input_shape)`` -> module-level ``embedding_layer``
    -> two blocks of ``LSTM(128, return_sequences=True)`` + ``Dropout(0.6)``
    -> ``LSTM(128)`` -> ``Dense(1, activation='sigmoid')``.

    Args:
        input_shape: Shape passed straight to ``Input``.

    Returns:
        The resulting ``Model``. It is not compiled here.
    """
    sequence_input = Input(input_shape)
    hidden = embedding_layer(sequence_input)

    # Two sequence-returning LSTM layers, each followed by dropout.
    for _ in range(2):
        hidden = LSTM(128, return_sequences=True)(hidden)
        hidden = Dropout(0.6)(hidden)

    # The last LSTM returns only its final state, which feeds the output unit.
    hidden = LSTM(128)(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=sequence_input, outputs=probability)

# Number of reviews about to be encoded.
counter = len(reviews_list)
print(counter)

# Turn each review into a list of token ids, then pad/truncate at the end to a
# fixed length of 150.
X_train_indices = tokenizer.texts_to_sequences(reviews_list)
X_train_indices = pad_sequences(X_train_indices, maxlen=150, padding='post')

# Notebook-style displays; they have no visible effect when run as a script.
X_train_indices.shape
sentiment

print(X_train_indices.shape)
print(sentiment.shape)

adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
# Input() expects the shape of ONE sample, without the batch dimension.
# X_train_indices.shape is (num_reviews, 150), so only the trailing part
# (150,) is passed; handing over the full shape would declare every sample
# to be a (num_reviews, 150) matrix.
model = imdb_rating(X_train_indices.shape[1:])
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): `sentiment` still contains the placeholder 3 for rows whose
# polarity was missing, while the loss assumes 0/1 targets -- confirm.
model.fit(X_train_indices, sentiment, batch_size=64, epochs=15)

# Second LSTM experiment: trainable embedding learned from scratch on the
# rows of input_df.
# NOTE(review): `input` shadows the built-in input(); the name is left as is
# because later code may refer to it.
input = input_df.text.values
# Rebinds `tokenizer`: from here on it is fitted on input_df's texts only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(input)
# Presumably +1 because token id 0 is not assigned to any word -- TODO confirm.
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(input)
# Fixed-length (150) integer sequences, one row per text in input_df.
padded_sequence = pad_sequences(encoded_docs, maxlen=150)


# Width of the learned embedding vectors.
embedding_vector_length = 32
# Rebinds `model`, replacing the GloVe-based model built earlier.
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,     
                                     input_length=150) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
# Single sigmoid output unit, trained with binary cross-entropy below.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', 
                           metrics=['accuracy'])
print(model.summary())
# Train on the factorized label codes, holding out the last 20% for validation.
# NOTE(review): sentiment_label[0] must contain exactly one code per row of
# padded_sequence -- verify both are derived from the same rows.
history = model.fit(padded_sequence,sentiment_label[0],
                  validation_split=0.2, epochs=20, batch_size=32)

# Hand-written sentence used as a smoke test for the model trained above.
test_word ="This is not good"

# Encode and pad it with the same tokenizer and length (150) as the training texts.
tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=150)
# Round the single sigmoid output to 0 or 1 and use it as a label code.
prediction = int(model.predict(tw).round().item())
# Decode the code via the unique label values from factorize(); the value is
# only displayed when this runs as a notebook cell.
sentiment_label[1][prediction]

"""# **Logistic Regression**"""

from sklearn.linear_model import LogisticRegression