# https://randerson112358.medium.com/email-spam-detection-using-python-machine-learning-abe38c889855
# https://blog.textedly.com/spam-text-message-examples
# Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
# Load the data
# dataFrame = pd.read_csv('db/emails.csv')
dataFrame = pd.read_csv('db/test.csv')
# print(dataFrame.head(5))
# Print the shape (get the number of rows and columns)
result = dataFrame.shape
# print (result)
# Get the column names
dataFrame.columns
# print(dataFrame.columns)
# Check for duplicates and remove them
dataFrame.drop_duplicates(inplace=True)
# result = dataFrame.shape
# print (result)
# Show the number of missing (NaN/NA) values for each column
result = dataFrame.isnull().sum()
# print (result)
# The stop word list must be downloaded once before first use
# nltk.download('stopwords')
# Tokenization (producing a list of tokens); process_text below is used as the analyzer
# 1. Punctuation characters are !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# 2. Stop words are common words (e.g. 'a', 'the') that carry little useful signal in natural language processing
def process_text(text):
    # 1. Remove punctuation
    text_without_punctuation = ''.join(char for char in text if char not in string.punctuation)
    # 2. Remove stop words (build the set once per call instead of re-reading the list for every word)
    stop_words = set(stopwords.words('english'))
    text_without_stop_words = [word for word in text_without_punctuation.split() if word.lower() not in stop_words]
    # 3. Return the list of clean words
    return text_without_stop_words
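# A quick illustration of process_text on a made-up message (an assumed example,
# not taken from the dataset); 'a' and 'now' are NLTK stop words, so this
# should return something like ['Hello', 'win', 'FREE', 'prize']:
# print(process_text('Hello, win a FREE prize now!'))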
# Show the tokenization (a list of tokens)
# print (dataFrame['text'].head().apply(process_text))
# Convert the text into a matrix of token counts (bag of words);
# keep the fitted vectorizer so new messages can be transformed later
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=process_text)
messages_bow = vectorizer.fit_transform(dataFrame['text'])
# print (messages_bow)
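# Optional sanity check on the learned vocabulary (commented out, mirroring the
# checks above). get_feature_names_out is the scikit-learn >= 1.0 name; older
# versions call it get_feature_names:
# print(vectorizer.get_feature_names_out()[:10])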
# Split the data into 80% training and 20% testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, dataFrame['spam'], test_size=0.20, random_state=0)
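# The split is row-wise over the sparse matrix; a quick size check (commented out)
# should show roughly an 80/20 division of the rows:
# print(X_train.shape, X_test.shape)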
# Get the shape of messages_bow
# messages_bow.shape
# print (messages_bow.shape)
# Create and train the Multinomial Naive Bayes classifier, which is suitable for
# classification with discrete features (e.g., word counts for text classification)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
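# Illustrative use of the trained model on a brand-new message (the text below is
# a made-up assumption, not from the dataset). The same fitted vectorizer must be
# used so the token counts line up with the training vocabulary:
# new_message = ['Congratulations! You have won a free ticket, call now!']
# print(classifier.predict(vectorizer.transform(new_message)))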
# # Print the predictions on the training set
# print(classifier.predict(X_train))
# print('divider')
# # Print the actual values
# print(y_train.values)
# Evaluate the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print('Confusion Matrix: \n', confusion_matrix(y_train, pred))
print()
print('Accuracy: ', accuracy_score(y_train, pred))
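# For reference, with binary 0/1 labels scikit-learn lays the confusion matrix out as:
# [[true negatives, false positives],
#  [false negatives, true positives]]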
# # Print the predictions on the test set
# print('Predicted value: ', classifier.predict(X_test))
# print('divider')
# # Print the actual labels
# print('Actual value: ', y_test.values)
# Evaluate the model on the test data set (the metrics were already imported above)
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, pred))
print()
print('Accuracy: ', accuracy_score(y_test, pred))