Twitter Sentiment Analysis Classification - Exploratory Data Analysis

Libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from PIL import Image

from sklearn.feature_extraction.text import TfidfVectorizer

from preprocessing.text_cleaning import clean_text
from preprocessing.preprocessing import drop_duplicates
from preprocessing.utils import load_dataset, STOPWORDS

import html

Dataset Load

# Download the course dataset archive. -f makes curl exit with an error on an
# HTTP failure instead of saving the server's error page as dataset.zip.
!curl -fL "https://dbdmg.polito.it/dbdmg_web/wp-content/uploads/2021/12/DSL2122_january_dataset.zip" -o dataset.zip
# Chain with && so the archive is only deleted after a successful extraction;
# rm -rf tolerates archives that do not contain the macOS metadata folder.
!unzip -q dataset.zip && rm dataset.zip && rm -rf __MACOSX/

# Load the tweets through the project helper from preprocessing.utils.
# The info() output further down reports a pandas DataFrame with six columns.
tweets = load_dataset()

Data exploration

# Print how many distinct values each column holds, one line per column.
for column_name, column_values in tweets.items():
    distinct_count = len(column_values.unique())
    print(f' {column_name} has {distinct_count} unique values.')

 sentiment has 2 unique values.
 ids has 224716 unique values.
 date has 189779 unique values.
 flag has 1 unique values.
 user has 10647 unique values.
 text has 223106 unique values.

# Print the frame's schema: column dtypes, non-null counts and memory usage.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224994 entries, 0 to 224993
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   sentiment  224994 non-null  int64 
 1   ids        224994 non-null  int64 
 2   date       224994 non-null  object
 3   flag       224994 non-null  object
 4   user       224994 non-null  object
 5   text       224994 non-null  object
dtypes: int64(2), object(4)
memory usage: 10.3+ MB

# Summary statistics (count, mean, std, quartiles); in the captured output
# only the two int64 columns, sentiment and ids, are reported.
tweets.describe()

<style scoped>
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	sentiment	ids
count	224994.000000	2.249940e+05
mean	0.578491	1.988334e+09
std	0.493802	1.777980e+08
min	0.000000	1.467811e+09
25%	0.000000	1.956965e+09
50%	1.000000	1.996993e+09
75%	1.000000	2.064995e+09
max	1.000000	2.329205e+09

Graphics

# Bar-style histogram showing how many tweets fall in each sentiment class.
sns.set_theme(style='darkgrid')

tweets = load_dataset()
fig, ax = plt.subplots(figsize=(10, 8))

# Replace the numeric class codes with readable names for the x axis.
sentiment_names = {0: "Negative", 1: "Positive"}
named_sentiment = tweets["sentiment"].map(sentiment_names)

sns.histplot(
    x=named_sentiment,
    hue=tweets["sentiment"],
    hue_order=[1, 0],
    legend=None,
    alpha=0.8,
    ax=ax,
)

ax.set_title("Sentiment Distribution", fontdict={"fontsize": "xx-large"})

axis_label_font = {"fontsize": "x-large"}
ax.set_xlabel("Sentiment", fontdict=axis_label_font)
ax.set_ylabel("Count", fontdict=axis_label_font)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

#fig.savefig("./report/figures/unbalanced.svg", format='svg')

plt.show()

# Box plot of tweet length (in characters) for each sentiment class.
tweets = load_dataset()
# Decode HTML entities first so that each one counts as a single character.
tweets["text"] = tweets["text"].map(html.unescape)
tweets['char_count'] = [len(tweet_text) for tweet_text in tweets['text']]

fig, ax = plt.subplots(figsize=(10, 10))

# Replace the numeric class codes with readable names for the y axis.
sentiment_names = {0: "Negative", 1: "Positive"}
named_sentiment = tweets["sentiment"].map(sentiment_names)

sns.boxplot(
    data=tweets,
    x="char_count",
    y=named_sentiment,
    orient='h',
    linewidth=2.5,
    fliersize=10,
    ax=ax,
)

ax.set_title("Tweets Length", fontdict={"fontsize": "xx-large"})

axis_label_font = {"fontsize": "x-large"}
ax.set_xlabel("Char Count", fontdict=axis_label_font)
ax.set_ylabel("Sentiment", fontdict=axis_label_font)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

#fig.savefig("./report/figures/lenghts_distrib.svg", format='svg')

plt.show()

# Render a cloud-shaped word cloud of the corpus vocabulary and save it as a
# transparent PNG for the report.
# NOTE(review): drop_duplicates and clean_text are project helpers whose exact
# behaviour is not visible here; they are applied as pipeline steps in this order.
tweets = load_dataset().pipe(drop_duplicates).pipe(clean_text)
fig, ax = plt.subplots(figsize=(8,8))
# Read the mask inside a context manager so the image file handle is closed
# as soon as the pixel data has been copied into the array.
with Image.open("./report/figures/cloud.png") as mask_image:
    cloud_mask = np.array(mask_image)

# The vectorizer is only fitted to select a vocabulary: stop words are removed
# and min_df=0.01 drops terms that occur in fewer than 1% of the tweets.
vectorizer = TfidfVectorizer(strip_accents="ascii", stop_words = STOPWORDS, use_idf=False, min_df=0.01)
vectorizer.fit(tweets["text"])
wc = WordCloud(background_color=None, mode="RGBA", mask=cloud_mask, prefer_horizontal=True, random_state=666)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same terms.
wc.generate_from_text(" ".join(vectorizer.get_feature_names_out()))
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
fig.savefig("./report/figures/word_cloud.png", format='png', transparent=True, dpi=700)

Name		Name	Last commit message	Last commit date
Latest commit History 136 Commits
DSL2122_january_dataset		DSL2122_january_dataset
figures		figures
preprocessing		preprocessing
report		report
.gitignore		.gitignore
Data_Science_Lab___Project_Assignment_January_2022.pdf		Data_Science_Lab___Project_Assignment_January_2022.pdf
Exploratory Data Analysis.ipynb		Exploratory Data Analysis.ipynb
README.md		README.md
main.ipynb		main.ipynb
model_selection.py		model_selection.py
tuning.py		tuning.py
word_cloud.png		word_cloud.png

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

Twitter Sentiment Analysis Classification - Exploratory Data Analysis

Libraries

Dataset Load

Data exploration

Graphics

About

Releases

Packages

Contributors 2

Languages

Gio99c/dsl-project

Folders and files

Latest commit

History

Repository files navigation

Twitter Sentiment Analysis Classification - Exploratory Data Analysis

Libraries

Dataset Load

Data exploration

Graphics

About

Topics

Resources

Stars

Watchers

Forks

Releases

Packages 0

Contributors 2

Languages

Packages