#!/usr/bin/env python3
# coding: utf-8

import numpy as np
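
# A from-scratch naive Bayes text classifier: documents are encoded as
# set-of-words (presence/absence) vectors over a shared vocabulary,
# per-class word probabilities are estimated from a handful of labelled
# posts, and new documents are classified by comparing class
# log-posterior scores.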


def load_data_set():
    """Return six toy forum posts and their labels (1 = abusive, 0 = not)."""
    posting_list = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                    ['maybe', 'not', 'take', 'him', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec


def create_vocab_list(data_set):
    """Build a list of every unique word seen across all documents."""
    vocab_set = set()
    for document in data_set:
        vocab_set = vocab_set | set(document)  # union with this document's words
    return list(vocab_set)
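
# Converting the set to a list fixes an arbitrary but stable index for each
# word; set_of_words2_vec relies on that ordering via vocab_list.index(word).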


def set_of_words2_vec(vocab_list, input_set):
    """Encode a document as a 0/1 vector: 1 if the vocab word is present."""
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] = 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return return_vec
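
# Illustrative only (actual indices depend on vocabulary order):
#   set_of_words2_vec(['dog', 'my', 'stupid'], ['my', 'dog', 'my'])  ->  [1, 1, 0]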


# Naive Bayes classifier training function
def train_nb0(train_matrix, train_category):
    """Estimate per-class word log-probabilities and the abusive prior.

    Returns (log P(w|class 0), log P(w|class 1), P(abusive)).
    """
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    # Prior probability that a document is abusive (labels are 0/1).
    p_abusive = sum(train_category) / float(num_train_docs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so a word
    # unseen in one class cannot zero out the whole product.
    p0_num = np.ones(num_words)
    p1_num = np.ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])
    # Return log-probabilities: classify_nb sums these (plus the log prior),
    # so logs are required for correctness and also prevent underflow.
    p1_vect = np.log(p1_num / p1_denom)
    p0_vect = np.log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_abusive
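
# Classification compares log-posteriors: since
#   P(c | doc)  ∝  P(c) * prod_i P(w_i | c)^x_i,
# taking logs turns the product into the dot products computed below and
# keeps long documents from underflowing to zero.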


def classify_nb(vec2_classify, p0_vec, p1_vec, p_class1):
    """Return 1 (abusive) or 0 by comparing class log-posterior scores."""
    p1 = sum(vec2_classify * p1_vec) + np.log(p_class1)
    p0 = sum(vec2_classify * p0_vec) + np.log(1.0 - p_class1)
    if p1 > p0:
        return 1
    else:
        return 0


def testing_nb():
    """Train on the toy posts, then classify two held-out word lists."""
    list0_posts, list_classes = load_data_set()
    my_vocab_list = create_vocab_list(list0_posts)
    train_mat = []
    for postin_doc in list0_posts:
        train_mat.append(set_of_words2_vec(my_vocab_list, postin_doc))
    p0_v, p1_v, p_ab = train_nb0(np.array(train_mat), np.array(list_classes))
    test_entry = ['love', 'my', 'dalmation']
    this_doc = np.array(set_of_words2_vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0_v, p1_v, p_ab))
    test_entry = ['stupid', 'garbage']
    this_doc = np.array(set_of_words2_vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0_v, p1_v, p_ab))


if __name__ == '__main__':
    testing_nb()
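
# Expected output (list formatting aside):
#   ['love', 'my', 'dalmation'] classified as: 0
#   ['stupid', 'garbage'] classified as: 1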