任务描述: 基于机器学习方法实现电影评论文本的情感分类。 最终机器能够从大规模标注文本中获得经验,判定某一指定文本的情感极性:正面/负面
数据集: 数据集已经切分为训练集和测试集, 其中训练集 25000 条, 测试集 25000 条;训练集和测试集中正面(positive) 评价和负面(negative)评价各占一半。
目录结构如下:训练集和测试集分别位于 train 和 test 两个目录,每个目录下均有 pos 和 neg 两个子目录,分别代表正面评价的文本和负面评价的文本;每一个训练样例一个文件,文本命名方式为: id_rating.txt,其中 id 为样例唯一 id, rating 为该文本的情感极性评分,正面评价为 7-10分,负面评价为 0-4 分; 例如: [test/pos/200_8.txt], 表示测试集中 id 为 200、评分为 8 的正面评价实例(正例)
提示:
import math
import os
import re
def findFile(path):
    """Yield the full path of every .txt file directly inside *path*."""
    txt_names = (name for name in os.listdir(path) if name.endswith(".txt"))
    yield from (os.path.join(path, name) for name in txt_names)
def create(path):
    """Read every .txt file under *path* and tokenize it on single spaces.

    Returns a list with one entry per successfully-read file; each entry
    is the list of non-empty space-separated tokens of that file.  Files
    that cannot be opened or decoded as UTF-8 are counted and skipped.
    """
    ans = []
    com_cnt = 0  # files read successfully
    err_cnt = 0  # files skipped due to I/O or decoding errors
    for file in findFile(path):
        # Catch only the errors reading can realistically raise; the
        # original bare `except:` would also hide genuine bugs.
        try:
            with open(file, "r", encoding='UTF-8') as f:
                # split(' ') reproduces re.split(' ', ...) exactly: only
                # single spaces separate tokens, and the empty strings
                # produced by runs of spaces are filtered out below.
                tokens = f.read().split(' ')
            ans.append([t for t in tokens if t])
            com_cnt += 1
        except (OSError, UnicodeDecodeError):
            err_cnt += 1
    print("正确读取文件:{} 错误读取文件:{}".format(com_cnt, err_cnt))
    return ans
# Load the tokenized corpora and tag each training document with its label
# (0 = negative, 1 = positive) as the final list element.
neg = create(r"dataset\train\neg")
pos = create(r"dataset\train\pos")
data = [doc + [label] for docs, label in ((neg, 0), (pos, 1)) for doc in docs]
# Test-set documents stay unlabelled; the evaluation loop knows their class.
testn = create(r"dataset\test\neg")
testp = create(r"dataset\test\pos")
正确读取文件:12500 错误读取文件:0 正确读取文件:12500 错误读取文件:0 正确读取文件:12500 错误读取文件:0 正确读取文件:12500 错误读取文件:0
class Bayes:
    """Hand-rolled naive-Bayes-style classifier over token lists.

    Training rows are token lists whose LAST element is the label
    (1 = positive, 0 = negative); ``predict`` takes a plain token list
    and returns True for positive, False for negative.
    """

    def __init__(self):
        self.y_label = {}  # token -> count over positive documents
        self.n_label = {}  # token -> count over negative documents
        self.n0 = 0        # number of negative training documents
        self.n1 = 0        # number of positive training documents

    def fit(self, data):
        """Tally per-class token occurrences; row[-1] is the class label."""
        for row in data:
            if row[-1]:  # positive document
                self.n1 += 1
                counts = self.y_label
            else:
                self.n0 += 1
                counts = self.n_label
            for d in row[:-1]:
                counts[d] = counts.get(d, 0) + 1

    def predict(self, data):
        """Classify a token list; ties go to the positive class.

        Scores are accumulated in log space: multiplying thousands of
        per-token factors underflows float to 0.0 on long reviews, which
        made the original comparison degenerate (always True).  Summing
        logs is numerically safe and preserves the ordering exactly.
        """
        log_yes = math.log(self.n1 / (self.n0 + self.n1))
        log_no = math.log(self.n0 / (self.n0 + self.n1))
        for d in data:
            # Add-2 smoothing (a Laplace-style additive constant) keeps
            # unseen tokens from zeroing out a class score.
            log_yes += math.log((self.y_label.get(d, 0) + 2) / (self.n1 + 2))
            log_no += math.log((self.n_label.get(d, 0) + 2) / (self.n0 + 2))
        return log_yes >= log_no
# Train on the labelled corpus, then score the held-out test halves.
model = Bayes()
model.fit(data)
# Confusion-matrix cells: positive test docs feed tp/fn, negative ones fp/tn.
tp = sum(1 for t in testp if model.predict(t))
fn = len(testp) - tp
fp = sum(1 for t in testn if model.predict(t))
tn = len(testn) - fp
print("正确率:{}".format((tp+tn)/(tp+fp+tn+fn)))
print("准确率:{}".format(tp/(tp+fp)))
正确率:0.77656 准确率:0.7312683971099813
平滑 | 正确率 | 准确率 |
---|---|---|
未平滑 | 0.51324 | 0.5067958773046441 |
+1 | 0.77308 | 0.7278552833589212 |
+2 | 0.77656 | 0.7312683971099813 |
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
def findFile(path):
    """Generate full paths for the .txt files located directly in *path*."""
    for entry in os.listdir(path):
        if not entry.endswith(".txt"):
            continue
        yield os.path.join(path, entry)
def create(path):
    """Build one stop-word-filtered vocabulary list per .txt file.

    Each file is split on '.' into pseudo-sentences, a TF-IDF vocabulary
    is fitted on them, and the resulting feature names (unique lowercased
    tokens with English stop words removed) become that file's token list.
    """
    ans = []
    # One vectorizer suffices: fit() relearns the vocabulary from scratch
    # on every call, so per-file construction was wasted work.
    vectorizer = TfidfVectorizer(stop_words="english")
    for file in findFile(path):
        with open(file, "r", encoding='UTF-8') as f:
            sentences = f.read().split('.')
        vectorizer.fit(sentences)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # modern API and fall back only on old versions.  Cast to list so
        # callers can concatenate with label lists (row + [0]).
        if hasattr(vectorizer, "get_feature_names_out"):
            ans.append(list(vectorizer.get_feature_names_out()))
        else:
            ans.append(list(vectorizer.get_feature_names()))
    return ans
neg = create(r"dataset\train\neg")
pos = create(r"dataset\train\pos")
# Append the class label (0 = negative, 1 = positive) as each row's last item.
data = [row + [0] for row in neg]
data += [row + [1] for row in pos]
testn = create(r"dataset\test\neg")
testp = create(r"dataset\test\pos")
class Bayes:
    """Naive-Bayes-style classifier over per-document token lists.

    ``fit`` expects rows whose final element is the label (1 positive,
    0 negative); ``predict`` scores a bare token list.
    """

    def __init__(self):
        self.y_label = {}  # token frequencies across positive documents
        self.n_label = {}  # token frequencies across negative documents
        self.n0 = 0        # negative document count
        self.n1 = 0        # positive document count

    def fit(self, data):
        """Accumulate per-class token counts; row[-1] carries the label."""
        for row in data:
            if row[-1]:
                self.n1 += 1
                table = self.y_label
            else:
                self.n0 += 1
                table = self.n_label
            for token in row[:-1]:
                table[token] = table.get(token, 0) + 1

    def predict(self, data):
        """Return True (positive) when the smoothed positive score wins."""
        total = self.n0 + self.n1
        p_yes, p_no = self.n1 / total, self.n0 / total
        for token in data:
            # Heavy additive smoothing (+200), chosen empirically over +1.
            p_yes *= (self.y_label.get(token, 0) + 200) / (self.n1 + 2)
            p_no *= (self.n_label.get(token, 0) + 200) / (self.n0 + 2)
        return p_yes >= p_no
model = Bayes()
model.fit(data)
# Predictions on the positive half give tp/fn, on the negative half fp/tn.
pos_hits = [model.predict(doc) for doc in testp]
neg_hits = [model.predict(doc) for doc in testn]
tp, fn = sum(pos_hits), len(pos_hits) - sum(pos_hits)
fp, tn = sum(neg_hits), len(neg_hits) - sum(neg_hits)
print("正确率:{}".format((tp+tn)/(tp+fp+tn+fn)))
print("准确率:{}".format(tp/(tp+fp)))
正确率:0.8314 准确率:0.8420444224258938
平滑 | 正确率Acc | 准确率Pre |
---|---|---|
未平滑 | 0.60108 | 0.562895116730549 |
+1 | 0.80944 | 0.7831625183016105 |
+5 | 0.81656 | 0.790187738339689 |
+100 | 0.8314 | 0.8420444224258938 |
+250 | 0.83028 | 0.8463216173139837 |
对1.2的文本处理模块不变,改写模型及评测
class Bayes:
    """Naive Bayes with a tunable additive-smoothing constant.

    ``predict(data, x)`` adds *x* to every token count, letting the
    evaluation loop sweep smoothing values without refitting the model.
    """

    def __init__(self):
        self.y_label = {}  # token -> occurrences in positive documents
        self.n_label = {}  # token -> occurrences in negative documents
        self.n0 = 0        # negative document count
        self.n1 = 0        # positive document count

    def fit(self, data):
        """Accumulate per-class token counts; row[-1] is the label."""
        for row in data:
            positive = bool(row[-1])
            if positive:
                self.n1 += 1
            else:
                self.n0 += 1
            counts = self.y_label if positive else self.n_label
            for token in row[:-1]:
                counts[token] = counts.get(token, 0) + 1

    def predict(self, data, x):
        """Classify *data* with smoothing constant *x*; ties go positive."""
        total = self.n0 + self.n1
        p_yes = self.n1 / total
        p_no = self.n0 / total
        for token in data:
            p_yes *= (self.y_label.get(token, 0) + x) / (self.n1 + 2)
            p_no *= (self.n_label.get(token, 0) + x) / (self.n0 + 2)
        return p_yes >= p_no
import numpy as np
import matplotlib.pyplot as plt
import time
# Wall-clock timer for the whole smoothing sweep below.
start_time = time.time()
# Use a CJK-capable font so Chinese plot labels are not rendered as boxes.
plt.rcParams['font.family']=['Microsoft YaHei']
model = Bayes()
model.fit(data)
def main(arr):
    """Sweep smoothing values in *arr*, plot accuracy/precision curves,
    and return the two metric lists (acc, pre)."""
    x, acc, pre = [], [], []
    for smoothing in arr:
        # True counts as 1, so summing predictions yields the hit counts.
        tp = sum(model.predict(t, smoothing) for t in testp)
        fn = len(testp) - tp
        fp = sum(model.predict(t, smoothing) for t in testn)
        tn = len(testn) - fp
        x.append(smoothing)
        acc.append((tp + tn) / (tp + fp + tn + fn))
        pre.append(tp / (tp + fp))
    plt.plot(x, acc, label='正确率')
    plt.plot(x, pre, label='准确率')
    plt.title("评测曲线")
    plt.xlabel('平滑数')
    plt.legend()
    plt.show()
    return acc, pre
# Evaluate every even smoothing value from 0 to 250 and report elapsed time.
acc, pre = main(list(range(0, 251, 2)))
end_time = time.time()
print("程序运行了:{}".format(end_time - start_time))
# Tentatively, 200 looks like the best smoothing constant.
程序运行了:196.99278736114502
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def findFile(path):
    """Yield full paths of the .txt files sitting directly under *path*."""
    matches = [name for name in os.listdir(path) if name.endswith(".txt")]
    for name in matches:
        yield os.path.join(path, name)
def create(path):
    """Return the raw UTF-8 text of every .txt file under *path*.

    Each list entry holds one whole document as a single string.
    """
    docs = []
    for filename in findFile(path):
        with open(filename, "r", encoding='UTF-8') as handle:
            docs.append(handle.read())
    return docs
# Vectorize whole documents with TF-IDF and train sklearn's multinomial NB.
vectorizer = TfidfVectorizer(stop_words="english")
neg = create(r"dataset\train\neg")
pos = create(r"dataset\train\pos")
testn = create(r"dataset\test\neg")
testp = create(r"dataset\test\pos")
train = vectorizer.fit_transform(neg + pos)
test = vectorizer.transform(testn + testp)
# Label vector: the first 12500 rows are negative (0), the rest positive (1).
labels = [0] * 12500 + [1] * 12500
model = MultinomialNB()
model.fit(train, labels)
print("准确率为:", model.score(test, labels))
准确率为: 0.82992
# Sanity-check MultinomialNB + TF-IDF on seven 20-newsgroups categories.
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB  # multinomial model for text
from pprint import pprint  # data pretty printer
from sklearn.feature_extraction.text import TfidfVectorizer

newsgroups_train = fetch_20newsgroups(subset='train')
select = [
    'alt.atheism', 'comp.graphics', 'misc.forsale', 'rec.autos',
    'sci.crypt', 'soc.religion.christian', 'talk.politics.guns',
]
# 1. load the train/test splits restricted to the chosen categories
train = fetch_20newsgroups(subset='train', categories=select)
print(type(train.data), dir(train.data))
print(train.data[:1])
test = fetch_20newsgroups(subset='test', categories=select)
# 2. turn the raw documents into TF-IDF feature vectors
vectorizer = TfidfVectorizer()
train_v = vectorizer.fit_transform(train.data)
test_v = vectorizer.transform(test.data)
# 3. train and evaluate the classifier
model = MultinomialNB()
model.fit(train_v, train.target)
print("准确率为:", model.score(test_v, test.target))
<class 'list'> ['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort'] ['From: pmoloney@maths.tcd.ie (Paul Moloney)\nSubject: Re: some thoughts.\nKeywords: Dan Bissell\nOrganization: Somewhere in the Twentieth Century\nLines: 14\n\nbissda@saturn.wwc.edu (DAN LAWRENCE BISSELL) writes:\n\n>\tNiether was he a lunatic. Would more than an entire nation be drawn \n>to someone who was crazy.\n\nFind an encyclopedia. Volume H. Now look up Hitler, Adolf. He had\nmany more people than just Germans enamoured with him.\n\nP.\n-- \n moorcockpratchettdenislearydelasoulu2iainmbanksneworderheathersbatmanpjorourke\nclive p a u l m o l o n e y Come, let us retract the foreskin of misconception\njames trinity college dublin and apply the wire brush of enlightenment - GeoffM\n brownbladerunnersugarcubeselectronicblaylockpowersspikeleekatebushhamcornpizza \n'] 准确率为: 0.8714177978883861
# Record the interpreter and NumPy versions used for these experiments.
import sys
import numpy
print(sys.version)
print(numpy.__version__)
3.8.5 (default, Sep 3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)] 1.19.2