任务描述: 基于机器学习方法实现电影评论文本的情感分类。 最终机器能够从大规模标注文本中获得经验,判定某一指定文本的情感极性:正面/负面
数据集: 数据集已经切分为训练集和测试集, 其中训练集 25000 条, 测试集 25000 条;训练集和测试集中正面(positive) 评价和负面(negative)评价各占一半。
目录结构如下:训练集和测试集分别位于 train 和 test 两个目录,每个目录下均有 pos 和 neg 两个子目录,分别代表正面评价的文本和负面评价的文本;每一个训练样例一个文件,文本命名方式为: id_rating.txt,其中 id 为样例唯一 id, rating 为该文本的情感极性评分,正面评价为 7-10分,负面评价为 0-4 分; 例如: [test/pos/200_8.txt], 表示测试集中 id 为 200、评分为 8 的正面评价实例(正例)
提示:
import math
import os
import re
def findFile(path):
    """Yield the full path of every .txt file directly inside *path*."""
    txt_names = (name for name in os.listdir(path) if name.endswith(".txt"))
    yield from (os.path.join(path, name) for name in txt_names)
def create(path):
    """Read every .txt file under *path* and tokenize it on single spaces.

    Returns a list with one entry per successfully-read file; each entry
    is the list of non-empty space-separated tokens of that file.  Files
    that cannot be opened or decoded as UTF-8 are counted and skipped.
    """
    ans = []
    com_cnt = 0  # files read successfully
    err_cnt = 0  # files skipped due to I/O or decoding errors
    for file in findFile(path):
        # Catch only the errors reading can realistically raise; the
        # original bare `except:` would also hide genuine bugs.
        try:
            with open(file, "r", encoding='UTF-8') as f:
                # split(' ') reproduces re.split(' ', ...) exactly: only
                # single spaces separate tokens, and the empty strings
                # produced by runs of spaces are filtered out below.
                tokens = f.read().split(' ')
            ans.append([t for t in tokens if t])
            com_cnt += 1
        except (OSError, UnicodeDecodeError):
            err_cnt += 1
    print("正确读取文件:{} 错误读取文件:{}".format(com_cnt, err_cnt))
    return ans
# Load the tokenized corpora and tag each training document with its label
# (0 = negative, 1 = positive) as the final list element.
neg = create(r"dataset\train\neg")
pos = create(r"dataset\train\pos")
data = [doc + [label] for docs, label in ((neg, 0), (pos, 1)) for doc in docs]
# Test-set documents stay unlabelled; the evaluation loop knows their class.
testn = create(r"dataset\test\neg")
testp = create(r"dataset\test\pos")
正确读取文件:12500 错误读取文件:0 正确读取文件:12500 错误读取文件:0 正确读取文件:12500 错误读取文件:0 正确读取文件:12500 错误读取文件:0
class Bayes:
    """Hand-rolled naive-Bayes-style classifier over token lists.

    Training rows are token lists whose LAST element is the label
    (1 = positive, 0 = negative); ``predict`` takes a plain token list
    and returns True for positive, False for negative.
    """

    def __init__(self):
        self.y_label = {}  # token -> count over positive documents
        self.n_label = {}  # token -> count over negative documents
        self.n0 = 0        # number of negative training documents
        self.n1 = 0        # number of positive training documents

    def fit(self, data):
        """Tally per-class token occurrences; row[-1] is the class label."""
        for row in data:
            if row[-1]:  # positive document
                self.n1 += 1
                counts = self.y_label
            else:
                self.n0 += 1
                counts = self.n_label
            for d in row[:-1]:
                counts[d] = counts.get(d, 0) + 1

    def predict(self, data):
        """Classify a token list; ties go to the positive class.

        Scores are accumulated in log space: multiplying thousands of
        per-token factors underflows float to 0.0 on long reviews, which
        made the original comparison degenerate (always True).  Summing
        logs is numerically safe and preserves the ordering exactly.
        """
        log_yes = math.log(self.n1 / (self.n0 + self.n1))
        log_no = math.log(self.n0 / (self.n0 + self.n1))
        for d in data:
            # Add-2 smoothing (a Laplace-style additive constant) keeps
            # unseen tokens from zeroing out a class score.
            log_yes += math.log((self.y_label.get(d, 0) + 2) / (self.n1 + 2))
            log_no += math.log((self.n_label.get(d, 0) + 2) / (self.n0 + 2))
        return log_yes >= log_no
# Train on the labelled corpus, then score the held-out test halves.
model = Bayes()
model.fit(data)
# Confusion-matrix cells: positive test docs feed tp/fn, negative ones fp/tn.
tp = sum(1 for t in testp if model.predict(t))
fn = len(testp) - tp
fp = sum(1 for t in testn if model.predict(t))
tn = len(testn) - fp
print("正确率:{}".format((tp+tn)/(tp+fp+tn+fn)))
print("准确率:{}".format(tp/(tp+fp)))
正确率:0.77656 准确率:0.7312683971099813
平滑 | 正确率 | 准确率 |
---|---|---|
未平滑 | 0.51324 | 0.5067958773046441 |
+1 | 0.77308 | 0.7278552833589212 |
+2 | 0.77656 | 0.7312683971099813 |
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
def findFile(path):
    """Generate full paths for the .txt files located directly in *path*."""
    for entry in os.listdir(path):
        if not entry.endswith(".txt"):
            continue
        yield os.path.join(path, entry)
def create(path):
    """Build one stop-word-filtered vocabulary list per .txt file.

    Each file is split on '.' into pseudo-sentences, a TF-IDF vocabulary
    is fitted on them, and the resulting feature names (unique lowercased
    tokens with English stop words removed) become that file's token list.
    """
    ans = []
    # One vectorizer suffices: fit() relearns the vocabulary from scratch
    # on every call, so per-file construction was wasted work.
    vectorizer = TfidfVectorizer(stop_words="english")
    for file in findFile(path):
        with open(file, "r", encoding='UTF-8') as f:
            sentences = f.read().split('.')
        vectorizer.fit(sentences)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # modern API and fall back only on old versions.  Cast to list so
        # callers can concatenate with label lists (row + [0]).
        if hasattr(vectorizer, "get_feature_names_out"):
            ans.append(list(vectorizer.get_feature_names_out()))
        else:
            ans.append(list(vectorizer.get_feature_names()))
    return ans
neg = create(r"dataset\train\neg")
pos = create(r"dataset\train\pos")
# Append the class label (0 = negative, 1 = positive) as each row's last item.
data = [row + [0] for row in neg]
data += [row + [1] for row in pos]
testn = create(r"dataset\test\neg")
testp = create(r"dataset\test\pos")
class Bayes:
    """Naive-Bayes-style classifier over per-document token lists.

    ``fit`` expects rows whose final element is the label (1 positive,
    0 negative); ``predict`` scores a bare token list.
    """

    def __init__(self):
        self.y_label = {}  # token frequencies across positive documents
        self.n_label = {}  # token frequencies across negative documents
        self.n0 = 0        # negative document count
        self.n1 = 0        # positive document count

    def fit(self, data):
        """Accumulate per-class token counts; row[-1] carries the label."""
        for row in data:
            if row[-1]:
                self.n1 += 1
                table = self.y_label
            else:
                self.n0 += 1
                table = self.n_label
            for token in row[:-1]:
                table[token] = table.get(token, 0) + 1

    def predict(self, data):
        """Return True (positive) when the smoothed positive score wins."""
        total = self.n0 + self.n1
        p_yes, p_no = self.n1 / total, self.n0 / total
        for token in data:
            # Heavy additive smoothing (+200), chosen empirically over +1.
            p_yes *= (self.y_label.get(token, 0) + 200) / (self.n1 + 2)
            p_no *= (self.n_label.get(token, 0) + 200) / (self.n0 + 2)
        return p_yes >= p_no
model = Bayes()
model.fit(data)
# Predictions on the positive half give tp/fn, on the negative half fp/tn.
pos_hits = [model.predict(doc) for doc in testp]
neg_hits = [model.predict(doc) for doc in testn]
tp, fn = sum(pos_hits), len(pos_hits) - sum(pos_hits)
fp, tn = sum(neg_hits), len(neg_hits) - sum(neg_hits)
print("正确率:{}".format((tp+tn)/(tp+fp+tn+fn)))
print("准确率:{}".format(tp/(tp+fp)))
正确率:0.8314 准确率:0.8420444224258938
平滑 | 正确率Acc | 准确率Pre |
---|---|---|
未平滑 | 0.60108 | 0.562895116730549 |
+1 | 0.80944 | 0.7831625183016105 |
+5 | 0.81656 | 0.790187738339689 |
+100 | 0.8314 | 0.8420444224258938 |
+250 | 0.83028 | 0.8463216173139837 |
对1.2的文本处理模块不变,改写模型及评测
class Bayes:
    """Naive Bayes with a tunable additive-smoothing constant.

    ``predict(data, x)`` adds *x* to every token count, letting the
    evaluation loop sweep smoothing values without refitting the model.
    """

    def __init__(self):
        self.y_label = {}  # token -> occurrences in positive documents
        self.n_label = {}  # token -> occurrences in negative documents
        self.n0 = 0        # negative document count
        self.n1 = 0        # positive document count

    def fit(self, data):
        """Accumulate per-class token counts; row[-1] is the label."""
        for row in data:
            positive = bool(row[-1])
            if positive:
                self.n1 += 1
            else:
                self.n0 += 1
            counts = self.y_label if positive else self.n_label
            for token in row[:-1]:
                counts[token] = counts.get(token, 0) + 1

    def predict(self, data, x):
        """Classify *data* with smoothing constant *x*; ties go positive."""
        total = self.n0 + self.n1
        p_yes = self.n1 / total
        p_no = self.n0 / total
        for token in data:
            p_yes *= (self.y_label.get(token, 0) + x) / (self.n1 + 2)
            p_no *= (self.n_label.get(token, 0) + x) / (self.n0 + 2)
        return p_yes >= p_no
import numpy as np
import matplotlib.pyplot as plt
import time
# Wall-clock timer for the whole smoothing sweep below.
start_time = time.time()
# Use a CJK-capable font so Chinese plot labels are not rendered as boxes.
plt.rcParams['font.family']=['Microsoft YaHei']
model = Bayes()
model.fit(data)
def main(arr):
    """Sweep smoothing values in *arr*, plot accuracy/precision curves,
    and return the two metric lists (acc, pre)."""
    x, acc, pre = [], [], []
    for smoothing in arr:
        # True counts as 1, so summing predictions yields the hit counts.
        tp = sum(model.predict(t, smoothing) for t in testp)
        fn = len(testp) - tp
        fp = sum(model.predict(t, smoothing) for t in testn)
        tn = len(testn) - fp
        x.append(smoothing)
        acc.append((tp + tn) / (tp + fp + tn + fn))
        pre.append(tp / (tp + fp))
    plt.plot(x, acc, label='正确率')
    plt.plot(x, pre, label='准确率')
    plt.title("评测曲线")
    plt.xlabel('平滑数')
    plt.legend()
    plt.show()
    return acc, pre
# Evaluate every even smoothing value from 0 to 250 and report elapsed time.
acc, pre = main(list(range(0, 251, 2)))
end_time = time.time()
print("程序运行了:{}".format(end_time - start_time))
# Tentatively, 200 looks like the best smoothing constant.
程序运行了:196.99278736114502
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def findFile(path):
    """Yield full paths of the .txt files sitting directly under *path*."""
    matches = [name for name in os.listdir(path) if name.endswith(".txt")]
    for name in matches:
        yield os.path.join(path, name)
def create(path):
    """Return the raw UTF-8 text of every .txt file under *path*.

    Each list entry holds one whole document as a single string.
    """
    docs = []
    for filename in findFile(path):
        with open(filename, "r", encoding='UTF-8') as handle:
            docs.append(handle.read())
    return docs
# Vectorize whole documents with TF-IDF and train sklearn's multinomial NB.
vectorizer = TfidfVectorizer(stop_words="english")
neg = create(r"dataset\train\neg")
pos = create(r"dataset\train\pos")
testn = create(r"dataset\test\neg")
testp = create(r"dataset\test\pos")
train = vectorizer.fit_transform(neg + pos)
test = vectorizer.transform(testn + testp)
# Label vector: the first 12500 rows are negative (0), the rest positive (1).
labels = [0] * 12500 + [1] * 12500
model = MultinomialNB()
model.fit(train, labels)
print("准确率为:", model.score(test, labels))
准确率为: 0.82992
# Sanity-check MultinomialNB + TF-IDF on seven 20-newsgroups categories.
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB  # multinomial model for text
from pprint import pprint  # data pretty printer
from sklearn.feature_extraction.text import TfidfVectorizer

newsgroups_train = fetch_20newsgroups(subset='train')
select = [
    'alt.atheism', 'comp.graphics', 'misc.forsale', 'rec.autos',
    'sci.crypt', 'soc.religion.christian', 'talk.politics.guns',
]
# 1. load the train/test splits restricted to the chosen categories
train = fetch_20newsgroups(subset='train', categories=select)
print(type(train.data), dir(train.data))
print(train.data[:1])
test = fetch_20newsgroups(subset='test', categories=select)
# 2. turn the raw documents into TF-IDF feature vectors
vectorizer = TfidfVectorizer()
train_v = vectorizer.fit_transform(train.data)
test_v = vectorizer.transform(test.data)
# 3. train and evaluate the classifier
model = MultinomialNB()
model.fit(train_v, train.target)
print("准确率为:", model.score(test_v, test.target))
<class 'list'> ['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort'] ['From: pmoloney@maths.tcd.ie (Paul Moloney)\nSubject: Re: some thoughts.\nKeywords: Dan Bissell\nOrganization: Somewhere in the Twentieth Century\nLines: 14\n\nbissda@saturn.wwc.edu (DAN LAWRENCE BISSELL) writes:\n\n>\tNiether was he a lunatic. Would more than an entire nation be drawn \n>to someone who was crazy.\n\nFind an encyclopedia. Volume H. Now look up Hitler, Adolf. He had\nmany more people than just Germans enamoured with him.\n\nP.\n-- \n moorcockpratchettdenislearydelasoulu2iainmbanksneworderheathersbatmanpjorourke\nclive p a u l m o l o n e y Come, let us retract the foreskin of misconception\njames trinity college dublin and apply the wire brush of enlightenment - GeoffM\n brownbladerunnersugarcubeselectronicblaylockpowersspikeleekatebushhamcornpizza \n'] 准确率为: 0.8714177978883861
# Record the interpreter and NumPy versions used for these experiments.
import sys
import numpy
print(sys.version)
print(numpy.__version__)
3.8.5 (default, Sep 3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)] 1.19.2