Naive Bayes spam classifier in Python

Naive Bayes Spam Classification in Python

Here's an implementation of a Naive Bayes spam classifier in Python:

main.py
import os
import io
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words in a mail directory.

    Every regular file directly inside ``root_dir`` is read as Latin-1 text
    and split on whitespace. Only purely alphabetic tokens longer than one
    character are counted.

    Args:
        root_dir: Directory containing the e-mail files.
        vocab_size: Maximum number of words to keep. Defaults to 3000.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    word_counts = Counter()
    # Sorted so that tie-breaking between equally frequent words (first
    # occurrence wins) does not depend on the arbitrary os.listdir order.
    for name in sorted(os.listdir(root_dir)):
        path = os.path.join(root_dir, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as e-mails; skip them.
            continue
        with io.open(path, 'r', encoding='latin1') as mail:
            for line in mail:
                # Count line by line instead of first collecting every token
                # of the whole corpus in one big list.
                word_counts.update(
                    word for word in line.split()
                    if len(word) > 1 and word.isalpha()
                )
    return word_counts.most_common(vocab_size)

def extract_features(mail_dir, dictionary, num_features=3000):
    """Turn every e-mail in a directory into a word-count feature vector.

    Only the third line (index 2) of each file is used for the counts;
    presumably that is the message body in the corpus layout this script
    targets. A file is labelled spam (1) when its file name starts with
    ``"spmsg"`` and ham (0) otherwise.

    Args:
        mail_dir: Directory containing the e-mail files.
        dictionary: Sequence of ``(word, count)`` entries such as the one
            returned by ``make_Dictionary``; the position of an entry is the
            feature column of its word. Words are expected to be unique.
        num_features: Number of feature columns. Dictionary entries at or
            beyond this position are ignored. Defaults to 3000.

    Returns:
        A tuple ``(features_matrix, train_labels)`` where ``features_matrix``
        is a ``(number of files, num_features)`` float array of word counts
        and ``train_labels`` is a float array of 0/1 labels, one per file.
        Rows follow the sorted file-name order.
    """
    # Resolve word -> column once, instead of scanning the whole dictionary
    # for every word of every e-mail.
    word_to_column = {}
    for column, entry in enumerate(dictionary):
        if column >= num_features:
            break
        word_to_column.setdefault(entry[0], column)

    # Sorted for a reproducible row order; sub-directories are skipped since
    # they cannot be opened as e-mails.
    candidates = [os.path.join(mail_dir, name)
                  for name in sorted(os.listdir(mail_dir))]
    files = [path for path in candidates if os.path.isfile(path)]

    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        body = u''
        with io.open(path, 'r', encoding='latin1') as mail:
            for line_no, line in enumerate(mail):
                if line_no == 2:
                    body = line
                    break  # nothing after this line is used

        for word, occurrences in Counter(body.split()).items():
            column = word_to_column.get(word)
            if column is not None:
                features_matrix[doc_id, column] = occurrences

        # os.path.basename handles the platform's path separator; splitting
        # on '/' mislabels every file on Windows.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return features_matrix, train_labels

# The training e-mails are expected in the TRAIN_DIR folder, resolved relative
# to the directory the script is run from. Files whose names start with
# "spmsg" are labelled as spam. Adjust the paths below to match your layout.

TRAIN_DIR = "train-mails"
# NOTE: TEST_DIR is not used below; evaluation is done on a held-out split
# of the TRAIN_DIR data.
TEST_DIR = "test-mails"
dictionary = make_Dictionary(TRAIN_DIR)

# print is called with a single parenthesised argument so the script runs
# unchanged on both Python 2 and Python 3.
print("reading and processing emails from file.")
features_matrix, labels = extract_features(TRAIN_DIR, dictionary)

# Hold out 25% of the labelled e-mails for evaluation.
X_train, X_test, y_train, y_test = train_test_split(features_matrix, labels, test_size=0.25)

model = MultinomialNB()
print("Training model.")
model.fit(X_train, y_train)
predicted_labels = model.predict(X_test)
print("FINISHED classifying. accuracy score: ")
print(accuracy_score(y_test, predicted_labels))
2685 chars
73 lines

In this implementation, we first define a helper function make_Dictionary that builds a dictionary of the 3,000 most commonly occurring words in the training data, keeping only purely alphabetic words longer than one character. We then process each email in the training set with the extract_features function, which counts how often each dictionary word occurs in the email and returns a matrix of feature vectors together with a vector of labels (1 for spam, 0 for ham).

We then split the feature vectors and labels into training and testing sets using train_test_split from the sklearn library. We create an instance of the Multinomial Naive Bayes classifier using MultinomialNB() and train it on the training data using the fit method.

Finally, we use the trained classifier to make predictions on the test data and compute the accuracy score of our model using accuracy_score from sklearn.metrics.

gistlib by LogSnag