# test_final.py

# /usr/bin/env python
#
# GUI module generated by PAGE version 4.9
# In conjunction with Tcl version 8.6

import io
import os
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


class Test:
    """Spam/ham e-mail classifier built on a bag-of-words Naive Bayes model.

    Training data is read from the ``spams`` and ``hams`` directories
    (relative to the current working directory) by :meth:`chargement`;
    individual messages are then classified with :meth:`verifier`.
    """

    # The vectorizer and classifier are class attributes, so they are shared
    # by every instance of Test.
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    # Number of files found directly inside "hams" / "spams"; set by
    # chargement().
    nb_hams = 0
    nb_spams = 0

    def readFiles(self, path):
        """Yield ``(file_path, body)`` once for every file found under *path*.

        The directory tree is walked recursively.  For each file, everything
        up to and including the first empty line is skipped; the remaining
        lines form the body.  Lines keep their own trailing newline and are
        additionally joined with ``'\\n'``.

        Files are decoded as UTF-8; a file that is not valid UTF-8 raises
        ``UnicodeDecodeError``.
        """
        for root, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                file_path = os.path.join(root, filename)
                in_body = False
                lines = []
                # The context manager guarantees the handle is closed even
                # if decoding fails half-way through the file.
                with io.open(file_path, 'r', encoding='UTF-8') as f:
                    for line in f:
                        if in_body:
                            lines.append(line)
                        elif line == '\n':
                            in_body = True
                # Yield exactly once per file, after the whole body has been
                # collected (not once per line read).
                yield file_path, '\n'.join(lines)

    def dataFrameFromDirectory(self, path, classification):
        """Return a DataFrame with one row per file found under *path*.

        Each row has a ``'message'`` column (the body returned by
        :meth:`readFiles`) and a ``'class'`` column set to *classification*.
        The frame is indexed by file path.
        """
        rows = []
        index = []
        for filename, message in self.readFiles(path):
            rows.append({'message': message, 'class': classification})
            index.append(filename)

        return DataFrame(rows, index=index)

    @staticmethod
    def _count_top_level_files(directory):
        """Return how many files sit directly inside *directory* (0 if absent)."""
        # next() works on both Python 2 and 3 (the generator's .next() method
        # only exists on Python 2); the default covers a missing directory,
        # for which os.walk yields nothing.
        return len(next(os.walk(directory), (directory, [], []))[2])

    def chargement(self):
        """Load the training e-mails and fit the vectorizer and classifier.

        Reads every file under ``spams`` (labelled ``'spam'``) and ``hams``
        (labelled ``'ham'``), and records the number of top-level files of
        each folder in ``nb_spams`` / ``nb_hams``.

        Raises:
            ValueError: if no training e-mail was found in either folder.
        """
        # Imported locally: the module only imports DataFrame from pandas.
        from pandas import concat

        self.nb_spams = self._count_top_level_files("spams")
        self.nb_hams = self._count_top_level_files("hams")

        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stack the two frames.
        data = concat([
            self.dataFrameFromDirectory('spams', 'spam'),
            self.dataFrameFromDirectory('hams', 'ham'),
        ])
        if data.empty:
            raise ValueError("no training e-mails found in 'spams' or 'hams'")

        counts = self.vectorizer.fit_transform(data['message'].values)
        targets = data['class'].values
        self.classifier.fit(counts, targets)

    def verifier(self, message):
        """Classify *message* and return the predicted label as a string.

        The label is one of the classes seen during training (``'spam'`` or
        ``'ham'`` when trained through :meth:`chargement`).
        """
        example_counts = self.vectorizer.transform([message])
        predictions = self.classifier.predict(example_counts)
        return str(predictions[0])

    def get_repertoires(self, filename):
        """Return the full text content of the file *filename*.

        The file is decoded as UTF-8, the same encoding used for the
        training files in :meth:`readFiles`.
        """
        with io.open(filename, 'r', encoding='UTF-8') as f:
            return f.read()

    def statistiques(self):
        """Display a bar chart of the evaluation indicators, in percent.

        NOTE: the three plotted values are read from the module-level
        variables ``i``, ``j`` and ``k``, which must have been assigned
        before this method is called.
        """
        fig, ax = plt.subplots()

        valeurs = (i * 100, j * 100, k * 100)

        ind = np.arange(3)
        width = 0.7

        rects1 = ax.bar(ind, valeurs, width, color='#2c3e50')

        ax.set_ylabel('Pourcentage (%)')
        ax.set_xlabel('Indicateurs')
        ax.set_title('Evaluation du mod?le sur la base de la pr?cision et du taux d\'erreur')
        ax.set_xticks(ind)
        ax.set_xticklabels(('Pr?cision SPAMs', 'Pr?cision HAMs', 'Taux d\'erreur'))

        # Write each bar's value just above its top edge.
        for rect in rects1:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2., 1.005 * height, '%.2f' % height + "%", ha='center',
                    va='bottom')

        # Show the chart.
        plt.show()


# Evaluation script: trains the model, classifies the held-out e-mails in
# "spams_t/" and "hams_t/", then prints the confusion matrix, the per-class
# precision and the error rate.


def _list_files(folder):
    """Return the names of the regular files located directly in *folder*."""
    return [name for name in listdir(folder) if isfile(join(folder, name))]


def _count_predictions(classifier, folder, filenames):
    """Classify every file of *folder*; return ``(nb_spam, nb_ham)`` predictions."""
    nb_spam = 0
    nb_ham = 0
    for name in filenames:
        label = classifier.verifier(classifier.get_repertoires(join(folder, name)))
        if label == 'spam':
            nb_spam += 1
        elif label == 'ham':
            nb_ham += 1
    return nb_spam, nb_ham


def _ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
    if not denominator:
        return 0.0
    # float() forces true division on Python 2 as well as Python 3.
    return float(numerator) / denominator


nb_emails_tests = 0

test = Test()
test.chargement()

# Spam test set: a 'spam' prediction is a true positive (vp), a 'ham'
# prediction is a false negative (fn).
onlyfiles = _list_files('spams_t/')
nb_emails_tests = nb_emails_tests + len(onlyfiles)

print("\nBas? sur " + str(len(onlyfiles)) + " spams tests.")
vp, fn = _count_predictions(test, 'spams_t/', onlyfiles)

# Ham test set: a 'spam' prediction is a false positive (fp), a 'ham'
# prediction is a true negative (vn).
onlyfiles = _list_files('hams_t/')
nb_emails_tests = nb_emails_tests + len(onlyfiles)

print("Bas? sur " + str(len(onlyfiles)) + " hams tests.\n")
fp, vn = _count_predictions(test, 'hams_t/', onlyfiles)

# 2x2 confusion matrix: rows are the actual class (spam, ham), columns are
# the predicted class (spam, ham).
matrice = [[vp, fn],
           [fp, vn]]

print("--- Affichage de la matrice de confusion ---\n")
print(np.array(matrice))

# Precision per class.  i, j and k stay module-level names because
# Test.statistiques() reads them as globals.
i = _ratio(vp, vp + fp)
j = _ratio(vn, vn + fn)
print("\nPr?cision spams = " + str(i))
print("Pr?cision hams = " + str(j))

# Error rate: share of misclassified e-mails among all test e-mails.
k = _ratio(fp + fn, nb_emails_tests)
print("\nLe taux d'erreur est = " + str(k))