Requirements
Implement a naive Bayes classifier by hand.
Use a dataset bundled with sklearn (the Python machine-learning library), vectorize the text features with TF-IDF, and classify the text with multinomial naive Bayes.
Basic Principle
$$y=\arg\max_{c_{k}}P(Y=c_{k})\prod_{j=1}^{n}P(X_{j}=x^{(j)}|Y=c_{k})$$
After training, the model holds nothing more than the probabilities estimated from the training data.
What the naive Bayes classifier does at prediction time is apply the formula above to each test example: multiply the class prior by the per-feature conditional probabilities (the conditional-independence assumption) and output the class $c_k$ with the largest score.
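The hand-written implementation below estimates these probabilities with Laplace (add-one) smoothing. Writing $S_j$ for the number of distinct values of feature $j$ (the ping_hua list in the code) and $N_{c_k}$ for the number of training examples in class $c_k$, the estimates the code computes are:

$$\hat{P}(Y=c_{k})=\frac{N_{c_{k}}}{N},\qquad \hat{P}(X_{j}=x^{(j)}|Y=c_{k})=\frac{\#\{X_{j}=x^{(j)},\,Y=c_{k}\}+1}{N_{c_{k}}+S_{j}}$$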
Code Implementation
from random import shuffle
from collections import Counter
ping_hua = [3, 3, 2, 2]  # number of distinct values per feature (Outlook, Temperature, Humidity, Wind), used for Laplace smoothing
data = [['Sunny', 'Hot', 'High', 'Weak', 0],
['Sunny', 'Hot', 'High', 'Strong', 0],
['Overcast', 'Hot', 'High', 'Weak', 1],
['Rain', 'Mild', 'High', 'Weak', 1],
['Rain', 'Cool', 'Normal', 'Weak', 1],
['Rain', 'Cool', 'Normal', 'Strong', 0],
['Overcast', 'Cool', 'Normal', 'Strong', 1],
['Sunny', 'Mild', 'High', 'Weak', 0],
['Sunny', 'Cool', 'Normal', 'Weak', 1],
['Rain', 'Mild', 'Normal', 'Weak', 1],
['Sunny', 'Mild', 'Normal', 'Strong', 1],
['Overcast', 'Mild', 'High', 'Strong', 1],
['Overcast', 'Hot', 'Normal', 'Weak', 1],
['Rain', 'Mild', 'High', 'Strong', 0]]
class NaiveBayes:
    def __init__(self):
        self.y_label = {}  # feature-value counts among positive (yes) examples
        self.n_label = {}  # feature-value counts among negative (no) examples
        self.n0 = 0        # number of negative examples
        self.n1 = 0        # number of positive examples

    def fit(self, data):
        for row in data:
            if row[4]:  # positive example
                self.n1 += 1
                for i, d in enumerate(row[:4]):
                    if d not in self.y_label:
                        self.y_label[d] = 1
                    else:
                        self.y_label[d] += 1
            else:       # negative example
                self.n0 += 1
                for i, d in enumerate(row[:4]):
                    if d not in self.n_label:
                        self.n_label[d] = 1
                    else:
                        self.n_label[d] += 1

    def predict(self, data):
        p_yes = self.n1 / (self.n0 + self.n1)
        p_no = self.n0 / (self.n0 + self.n1)
        for i, d in enumerate(data):
            # Laplace smoothing: add 1 to the count and add the number of
            # possible values of feature i (ping_hua[i]) to the denominator
            p_yes *= (self.y_label.get(d, 0) + 1) / (self.n1 + ping_hua[i])
            p_no *= (self.n_label.get(d, 0) + 1) / (self.n0 + ping_hua[i])
        print("p(yes):", p_yes)
        print("p(no):", p_no)
        return "yes" if p_yes >= p_no else "no"

def data_split(data):
    # shuffle the indices, then take 10 rows for training and the remaining 4 for testing
    nums = [_ for _ in range(len(data))]
    shuffle(nums)
    return [data[_] for _ in nums[:10]], [data[_] for _ in nums[10:]]

if __name__ == "__main__":
    train, test = data_split(data)
    # print(train, test)
    model = NaiveBayes()
    model.fit(train)
    # print(model.y_label, model.n_label)
    for t in test:
        print(t, " the predict:", model.predict(t[:4]))
        print("\n")
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB                 # multinomial model, for text classification
from pprint import pprint                                     # data pretty printer
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF text feature extraction

newsgroups_train = fetch_20newsgroups(subset='train')
pprint(list(newsgroups_train.target_names))

select = ['alt.atheism', 'comp.graphics', 'misc.forsale', 'rec.autos',
          'sci.crypt', 'soc.religion.christian', 'talk.politics.guns']
train = fetch_20newsgroups(subset='train', categories=select)  # 1. load the dataset
test = fetch_20newsgroups(subset='test', categories=select)

vectorizer = TfidfVectorizer()
train_v = vectorizer.fit_transform(train.data)                 # 2. vectorize the text (TF-IDF)
test_v = vectorizer.transform(test.data)

model = MultinomialNB()
model.fit(train_v, train.target)                               # 3. train the model
print("Accuracy:", model.score(test_v, test.target))           # 4-5. evaluate on the test set and report the result
References:
- python, 20Newsgroups text classification
- Official docs: sklearn user guide
- sklearn multinomial naive Bayes documentation
- sklearn.feature_extraction.text.TfidfVectorizer
Application Examples
Jun 2021 update
Short-text sentiment analysis, code reference
Spam email classification, code reference (a minimal sketch follows below)
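The spam example follows the same TF-IDF + MultinomialNB recipe as the 20 Newsgroups code above; a minimal sketch under assumed data (the toy messages and labels here are placeholders, not from the original post):

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# hypothetical toy corpus; in practice this comes from a labeled spam dataset
messages = ["Win a free prize now", "Meeting rescheduled to 3pm",
            "Cheap meds, click here", "Lunch tomorrow?"]
labels = [1, 0, 1, 0]  # 1 = spam, 0 = ham

X_train, X_test, y_train, y_test = train_test_split(
    messages, labels, test_size=0.5, random_state=0, stratify=labels)

vec = TfidfVectorizer()
clf = MultinomialNB()
clf.fit(vec.fit_transform(X_train), y_train)
print("Accuracy:", clf.score(vec.transform(X_test), y_test))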