贝叶斯垃圾邮件分类,进行拉普拉斯平滑,给出结果的精确率(Precision)、召回率(Recall)、F1值
朴素贝叶斯基本原理: $$y=\arg \max _{c_{k}} P\left(Y=c_{k}\right) \prod_{j=1}^{n} P\left(X_{j}=x^{(j)} | Y=c_{k}\right)$$
基本假设:条件独立性,即在给定类别 $Y=c_k$ 的条件下,各特征 $X_j$ 相互独立
import math
import os
import re
from random import shuffle
def findFile(path):
    """Lazily yield the joined path of every ``.txt`` entry directly in *path*.

    Entries come out in the order ``os.listdir`` reports them, and nothing
    below the top level of *path* is visited.
    """
    yield from (
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if entry.endswith(".txt")
    )
def create(path):
    """Tokenize every ``.txt`` file found under *path*.

    Each file is read as UTF-8 and split on runs of non-word characters
    (the regex class ``\\W``); empty strings left over from the split are
    discarded.

    Args:
        path: Directory handed to ``findFile``.

    Returns:
        A list holding one list of tokens per file, in the order
        ``findFile`` yields the files.
    """
    documents = []
    for file in findFile(path):
        # Debug aid: one sample file had a different format. Printing `file`
        # here locates it so the file can be converted.
        with open(file, "r", encoding="UTF-8") as f:
            # Raw string: "\W" inside a plain literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            tokens = re.split(r"\W+", f.read())
        # Drop the empty strings produced by the split.
        documents.append([token for token in tokens if token])
    return documents
class Bayes:
    """Naive Bayes spam/ham classifier with add-one (Laplace) smoothing.

    Every row given to ``fit`` or ``predict`` is a sequence of tokens whose
    last element is the label: truthy means spam, falsy means ham.
    """

    def __init__(self):
        self.y_label = {}  # spam token counts, key: word, value: count
        self.n_label = {}  # ham token counts, key: word, value: count
        self.n0 = 0  # number of ham rows seen by fit
        self.n1 = 0  # number of spam rows seen by fit

    def fit(self, data):
        """Accumulate per-class row counts and token counts from *data*.

        Calling ``fit`` again adds to the existing counts; nothing is reset.

        Args:
            data: Iterable of rows (tokens followed by the label).
        """
        for row in data:
            if row[-1]:  # spam
                self.n1 += 1
                counts = self.y_label
            else:
                self.n0 += 1
                counts = self.n_label
            for word in row[:-1]:
                counts[word] = counts.get(word, 0) + 1

    def predict(self, data):
        """Classify one row; the last element of *data* is ignored.

        Scores are accumulated as sums of logarithms instead of products of
        probabilities, so long messages no longer underflow both scores to
        0.0 (which made every such message compare as a tie and be reported
        as spam).

        Args:
            data: Tokens followed by one trailing element (the label slot).

        Returns:
            True when the spam score is greater than or equal to the ham
            score, otherwise False.

        Raises:
            ZeroDivisionError: If no row has been fitted yet.
        """
        total = self.n0 + self.n1
        prior_yes = self.n1 / total
        prior_no = self.n0 / total
        # A class never seen in training has prior 0; log(0) is undefined,
        # so represent it as -inf to keep that class from ever winning.
        log_yes = math.log(prior_yes) if prior_yes else -math.inf
        log_no = math.log(prior_no) if prior_no else -math.inf
        # NOTE(review): the counts are per token occurrence while the
        # denominators are message counts, so a ratio can exceed 1. Left
        # unchanged to keep classification results; confirm intended model.
        for word in data[:-1]:
            # Laplace (add-one) smoothing.
            log_yes += math.log((self.y_label.get(word, 0) + 1) / (self.n1 + 2))
            log_no += math.log((self.n_label.get(word, 0) + 1) / (self.n0 + 2))
        return log_yes >= log_no
def data_split(data, train_ratio=0.8):
    """Randomly partition *data* into a training list and a test list.

    The split is sized from ``len(data)`` rather than a fixed 50 rows, so
    every row lands in exactly one of the two lists whatever the dataset
    size. With 50 rows and the default ratio the result is 40 / 10.

    Args:
        data: Indexable sequence of rows.
        train_ratio: Fraction of rows placed in the training list.

    Returns:
        A ``(train, test)`` tuple of lists.

    Raises:
        ValueError: If *train_ratio* is outside ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1")
    indices = list(range(len(data)))
    shuffle(indices)
    cut = int(len(indices) * train_ratio)
    return [data[i] for i in indices[:cut]], [data[i] for i in indices[cut:]]
if __name__ == "__main__":
    # Train and evaluate the classifier on ten independent random splits,
    # printing per-message correctness plus precision, recall and F1.
    hl = create("ham")
    sl = create("spam")
    # Append an int label to every token list: 0 = ham, 1 = spam.
    data = [row + [0] for row in hl] + [row + [1] for row in sl]
    for i in range(10):
        print("{:=^80}".format(" 第"+ str(i+1) +"次训练 "))
        train, test = data_split(data)
        tp, fp, tn, fn = 0, 0, 0, 0
        correct = []  # one flag per test message: was the prediction right?
        model = Bayes()
        model.fit(train)
        for t in test:
            is_spam = model.predict(t)
            correct.append(is_spam == t[-1])
            if is_spam:
                if t[-1] == 1:
                    tp += 1
                else:
                    fp += 1
            else:
                if t[-1] == 1:
                    fn += 1
                else:
                    tn += 1
        # A random split can contain no predicted or no actual spam; report
        # 0.0 in that case instead of dividing by zero.
        P = tp / (tp + fp) if tp + fp else 0.0
        R = tp / (tp + fn) if tp + fn else 0.0
        F1 = 2 * P * R / (P + R) if P + R else 0.0
        print("\n判断对了:", correct)
        print("\nP:{} R:{} F1:{}\n".format(P, R, F1))
==================================== 第1次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第2次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第3次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第4次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第5次训练 ===================================== 判断对了: [True, True, True, True, True, False, True, True, True, True] P:1.0 R:0.75 F1:0.8571428571428571 ==================================== 第6次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第7次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第8次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第9次训练 ===================================== 判断对了: [True, True, True, True, True, True, True, True, True, True] P:1.0 R:1.0 F1:1.0 ==================================== 第10次训练 ==================================== 判断对了: [False, True, True, True, True, True, True, True, True, True] P:1.0 R:0.8 F1:0.888888888888889