ml/bayes/bayes.py

#!/usr/bin/env python3
# coding:utf-8
import numpy as np

def load_data_set():
    posting_list = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                    ['maybe', 'not', 'take', 'him', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Class labels for the posts above: 1 = abusive/insulting, 0 = normal.
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec

def create_vocab_list(data_set):
    vocab_set = set([])
    for document in data_set:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)

# Convert a document into a 0/1 presence vector over the vocabulary
# (set-of-words model).
def set_of_words2_vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] = 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return return_vec
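
# Illustrative sketch, not in the original file: a bag-of-words variant of the
# featurizer above that counts how often each word occurs instead of recording
# 0/1 presence. The name bag_of_words2_vec is an assumption for illustration.
def bag_of_words2_vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec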

# Naive Bayes classifier training function.
def train_nb0(train_matrix, train_category):
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    # Prior probability that a document is abusive (class 1).
    p_abusive = sum(train_category) / float(num_train_docs)
    # Initialise counts to 1 and denominators to 2 (Laplace smoothing) so that
    # words unseen in a class do not produce zero probabilities.
    p0_num = np.ones(num_words)
    p1_num = np.ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])
    # Return log conditional probabilities so classify_nb can sum them
    # without numerical underflow.
    p1_vect = np.log(p1_num / p1_denom)
    p0_vect = np.log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_abusive
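
# Background note (added for clarity): naive Bayes scores a document w against
# class c via p(c | w) ∝ p(c) * Π_i p(w_i | c). Working in log space turns the
# product into a sum and avoids underflow:
#     score(c) = log p(c) + Σ_i log p(w_i | c)
# classify_nb below computes exactly this score for each class, using the
# log-probability vectors returned by train_nb0, and picks the larger one.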
def classify_nb(vec2_classify, p0_vec, p1_vec, p_class1):
    p1 = sum(vec2_classify * p1_vec) + np.log(p_class1)
    p0 = sum(vec2_classify * p0_vec) + np.log(1.0 - p_class1)
    if p1 > p0:
        return 1
    else:
        return 0

def testing_nb():
    list0_posts, list_classes = load_data_set()
    my_vocab_list = create_vocab_list(list0_posts)
    train_mat = []
    for postin_doc in list0_posts:
        train_mat.append(set_of_words2_vec(my_vocab_list, postin_doc))
    p0_v, p1_v, p_ab = train_nb0(np.array(train_mat), np.array(list_classes))
    test_entry = ['love', 'my', 'dalmation']
    this_doc = np.array(set_of_words2_vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0_v, p1_v, p_ab))
    test_entry = ['stupid', 'garbage']
    this_doc = np.array(set_of_words2_vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0_v, p1_v, p_ab))
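
# Illustrative usage sketch (an addition, not part of the original script):
# train on the toy corpus once and classify an arbitrary sentence. The helper
# name classify_sentence is an assumption; out-of-vocabulary words are simply
# skipped by the featurizer (it only prints a warning).
def classify_sentence(sentence):
    list_posts, list_classes = load_data_set()
    vocab = create_vocab_list(list_posts)
    train_mat = [set_of_words2_vec(vocab, doc) for doc in list_posts]
    p0_v, p1_v, p_ab = train_nb0(np.array(train_mat), np.array(list_classes))
    doc_vec = np.array(set_of_words2_vec(vocab, sentence.lower().split()))
    return classify_nb(doc_vec, p0_v, p1_v, p_ab)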

if __name__ == "__main__":
    testing_nb()