from sklearn.datasets import fetch_20newsgroups
# Seed reused by the later dataset load and by the SGD classifier.
random_state = 1234
# Load the shuffled training split of the 20 newsgroups corpus.
train_data = fetch_20newsgroups(subset="train", shuffle=True, random_state=random_state)
print("keys: ", train_data.keys(), "\n")
# List every category name together with its integer label.
print("categories(%d): " % len(train_data.target_names))
for idx, name in enumerate(train_data.target_names):
    print("%d: " % idx, name)
print("\ntrain_data 中有文章 %d 篇。" % len(train_data.data))
print("前10篇文章的 labels: ", train_data.target[:10])
keys: dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR']) categories(20): 0: alt.atheism 1: comp.graphics 2: comp.os.ms-windows.misc 3: comp.sys.ibm.pc.hardware 4: comp.sys.mac.hardware 5: comp.windows.x 6: misc.forsale 7: rec.autos 8: rec.motorcycles 9: rec.sport.baseball 10: rec.sport.hockey 11: sci.crypt 12: sci.electronics 13: sci.med 14: sci.space 15: soc.religion.christian 16: talk.politics.guns 17: talk.politics.mideast 18: talk.politics.misc 19: talk.religion.misc train_data 中有文章 11314 篇。 前10篇文章的 labels: [18 13 6 1 15 13 11 4 7 8]
# Show the type and attribute names of the dataset container and of its `data` field.
for obj in (train_data, train_data.data):
    print(type(obj), dir(obj), "\n")
# The first element shows that train_data.data is a list of strings.
print(train_data.data[:1])
<class 'sklearn.utils.Bunch'> ['DESCR', 'data', 'filenames', 'target', 'target_names'] <class 'list'> ['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort'] ["From: mwalker@novell.com (Mel Walker)\nSubject: Re: Top Ten Ways Slick Willie Could Improve His Standing With Americans\nNntp-Posting-Host: mwalker.npd.provo.novell.com\nOrganization: Novell, Inc\nLines: 23\n\nIn article <C5KMz5.Hy4@newsserver.technet.sg>, ipser@solomon.technet.sg (Ed\nIpser) wrote:\n> \n> \n> Top Ten Ways Slick Willie Could Improve His Standing With Americans\n> \n> \n[deleted for a very good reason which I'm sure you can guess]\n>\n\n0. Enact a law that bans people without a sense of humor from\n posting allegedly humorous items. If he did this, I think\n his approval rating would go through the roof!\n\n> Copyright (c) Edward A. Ipser, Jr., 1993\n\nThis means we can't quote Ed without his permission. No using these lists\nin your .sigs, folks!\n\n----------------------------------------------------------------\nMel Walker mwalker@novell.com\nAll opinions expressed are of the author.\nNovell, Inc. is not responsible for the content of this article.\n"]
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Tokenize the training documents and count token occurrences.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)
print("part of vocabulary: ")
# First ten terms in the mapping's iteration order; the number is the position
# in that iteration, not the term's feature index.
print(list(enumerate(count_vect.vocabulary_))[:10])
print("shape of tf-count: ", X_train_counts.shape)
# Term frequency only: IDF weighting is switched off.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
print("Shape of tf-vector: ", X_train_tf.shape)
# Full tf-idf weighting, fitted on the training counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print("Shape of tf-idf vector: ", X_train_tfidf.shape)
part of vocabulary: [(0, 'from'), (1, 'mwalker'), (2, 'novell'), (3, 'com'), (4, 'mel'), (5, 'walker'), (6, 'subject'), (7, 're'), (8, 'top'), (9, 'ten')] shape of tf-count: (11314, 130107) Shape of tf-vector: (11314, 130107) Shape of tf-idf vector: (11314, 130107)
`CountVectorizer`:将文本集合转换为 token 计数矩阵,行数为文章数,列数为词典中的词数,每个元素的值是对应词在该文章中出现的次数。
属性
`vocabulary_`: A mapping of terms to feature indices.
方法
fit
: Learn a vocabulary dictionary of all tokens in the raw documents.
fit_transform
: Learn the vocabulary dictionary and return document-term matrix.
`get_feature_names`: Array mapping from feature integer indices to feature name.(注意:该方法在 scikit-learn 1.0 中被弃用,并在 1.2 中移除,新版本请使用 `get_feature_names_out`。)
`TfidfTransformer`:将计数矩阵转换为标准化的 tf 或 tf-idf 表示。
`TfidfVectorizer`:将原始文本直接转换为 TF-IDF 特征矩阵,相当于先使用 `CountVectorizer` 再使用 `TfidfTransformer`。
参考资料:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the tf-idf training features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_data.target)
# Toy prediction: push two hand-written sentences through the already fitted
# vectorizer and tf-idf transformer, then classify them.
toy_examples = ['I love Machine Learning!', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(toy_examples)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
print("Predict on toy-examples: ")
for doc, label_id in zip(toy_examples, predicted):
    # Map the predicted integer label back to its category name.
    print('%r => %s' % (doc, train_data.target_names[label_id]))
Predict on toy-examples: 'I love Machine Learning!' => soc.religion.christian 'OpenGL on the GPU is fast' => rec.autos
import numpy as np
# Load the test split with the same seed used for the training split.
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=random_state)
# Reuse the vectorizer and tf-idf transformer fitted on the training data.
test_features = tfidf_transformer.transform(
    count_vect.transform(test_data.data)
)
predicted = clf.predict(test_features)
# Accuracy is the fraction of predictions equal to the true labels.
print("@naive Bayes! Evaluate on test: ", (predicted == test_data.target).mean())
@naive Bayes! Evaluate on test: 0.7738980350504514
from sklearn.linear_model import SGDClassifier
# Linear classifier trained by SGD with hinge loss and an L2 penalty.
mm_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=random_state,
    max_iter=5,
    tol=None,
)
mm_clf.fit(X_train_tfidf, train_data.target)
predicted = mm_clf.predict(test_features)
print("@max-margin! Evaluate on test: ", (predicted == test_data.target).mean())
# Report the test result with precision/recall/f1-score and a confusion matrix.
from sklearn import metrics
# Text report of the main classification metrics, one row per category.
print(
    metrics.classification_report(
        test_data.target, predicted, target_names=test_data.target_names
    )
)
# Only the first three rows of the confusion matrix are printed.
print(metrics.confusion_matrix(test_data.target, predicted)[:3])
@max-margin! Evaluate on test: 0.8248805098247477 precision recall f1-score support alt.atheism 0.73 0.71 0.72 319 comp.graphics 0.79 0.71 0.75 389 comp.os.ms-windows.misc 0.72 0.80 0.76 394 comp.sys.ibm.pc.hardware 0.74 0.69 0.71 392 comp.sys.mac.hardware 0.83 0.82 0.83 385 comp.windows.x 0.85 0.76 0.80 395 misc.forsale 0.84 0.89 0.87 390 rec.autos 0.92 0.89 0.91 396 rec.motorcycles 0.91 0.97 0.94 398 rec.sport.baseball 0.89 0.90 0.89 397 rec.sport.hockey 0.87 0.99 0.93 399 sci.crypt 0.84 0.96 0.90 396 sci.electronics 0.83 0.62 0.71 393 sci.med 0.87 0.85 0.86 396 sci.space 0.84 0.96 0.89 394 soc.religion.christian 0.75 0.94 0.83 398 talk.politics.guns 0.70 0.92 0.79 364 talk.politics.mideast 0.91 0.92 0.92 376 talk.politics.misc 0.90 0.55 0.68 310 talk.religion.misc 0.84 0.39 0.54 251 accuracy 0.82 7532 macro avg 0.83 0.81 0.81 7532 weighted avg 0.83 0.82 0.82 7532 [[226 0 0 1 0 2 1 0 1 3 1 2 1 11 5 44 2 8 1 10] [ 2 277 21 9 8 24 3 1 2 4 3 9 5 3 9 2 2 4 0 1] [ 1 10 315 20 8 11 2 0 0 6 1 7 1 1 7 2 0 1 0 1]]
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# Pipeline from raw text to prediction: token counts -> tf-idf -> SGD classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
# Candidate values, keyed as <pipeline step name>__<parameter name>.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# Grid search with 5-fold cross-validation over every parameter combination.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf.fit(train_data.data, train_data.target)
print("Mean cross-validated score of the best_estimator: ", gs_clf.best_score_)
print("best hyper-parameters configuration: ")
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
print("\n\n\n")
# Classify the toy sentences with the grid-search result.
predicted = gs_clf.predict(toy_examples)
print("Predict on toy-examples: ")
for doc, label_id in zip(toy_examples, predicted):
    print('%r => %s' % (doc, train_data.target_names[label_id]))
# Final check: accuracy of the selected model on the test split.
predicted = gs_clf.predict(test_data.data)
print("\n\n\n@Best Model! Evaluate on test: ", (predicted == test_data.target).mean())
Mean cross-validated score of the best_estimator: 0.9040120291327873 best hyper-parameters configuration: clf__alpha: 0.001 tfidf__use_idf: True vect__ngram_range: (1, 2) Predict on toy-examples: 'I love Machine Learning!' => misc.forsale 'OpenGL on the GPU is fast' => rec.autos @Best Model! Evaluate on test: 0.8353690918746681
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# Raw review texts; create() appends to this list.
x = []
# Labels in loading order: 1000 "neg" entries followed by 1000 "pos" entries.
y = [label for label in ("neg", "pos") for _ in range(1000)]
def findFile(path):
    """Yield the joined path of every entry directly inside *path*.

    Entries come in the order returned by ``os.listdir``; nothing is read
    from disk until the generator is first advanced.
    """
    yield from (os.path.join(path, entry) for entry in os.listdir(path))
def create(path):
    """Read every file under *path* as UTF-8 text and append it to the global ``x``.

    A file whose content cannot be read or decoded is skipped and counted as
    an error instead of aborting the whole load. A one-line summary of both
    counters is printed once all files have been processed.

    Args:
        path: Directory whose entries are read (listed through ``findFile``).

    Returns:
        A ``(com_cnt, err_cnt)`` tuple: files appended to ``x`` and files
        skipped. Callers can use it to keep labels aligned with ``x``.
    """
    com_cnt = 0
    err_cnt = 0
    for file in findFile(path):
        with open(file, "r", encoding="UTF-8") as f:
            try:
                text = f.read()
            except (UnicodeDecodeError, OSError):
                # Only read/decode failures count as "bad file"; anything else
                # (KeyboardInterrupt, MemoryError, ...) must propagate.
                err_cnt += 1
            else:
                x.append(text)
                com_cnt += 1
    print("正确读取文件:{} 错误读取文件:{}".format(com_cnt, err_cnt))
    return com_cnt, err_cnt
# Start from an empty corpus so re-running this cell does not duplicate reviews.
x.clear()
# Load one class at a time and derive the labels from the number of reviews
# actually loaded, so x and y stay aligned even if create() skips a file.
# os.path.join keeps the paths valid on every platform, not only Windows.
create(os.path.join("review_polarity", "neg"))
neg_count = len(x)
create(os.path.join("review_polarity", "pos"))
pos_count = len(x) - neg_count
y = ["neg"] * neg_count + ["pos"] * pos_count
# 80/20 split; no random_state is passed, so the split differs between runs.
train_data, test_data, train_label, test_label = train_test_split(x, y, train_size=0.8)
正确读取文件:1000 错误读取文件:0 正确读取文件:1000 错误读取文件:0
# Bag-of-words counts followed by TF-IDF weighting.
vec = CountVectorizer()
idf = TfidfTransformer()
# Learn the vocabulary and the IDF weights from the training split only.
train_matrix = idf.fit_transform(vec.fit_transform(train_data))
# The test split must reuse what was fitted on the training split:
# re-fitting the vectorizer would change the columns (dimension mismatch), and
# re-fitting the IDF would weight test documents with statistics the model was
# never trained on.
test_matrix = idf.transform(vec.transform(test_data))
model = MultinomialNB()
model.fit(train_matrix, train_label)
# Accuracy computed two equivalent ways: estimator score and manual mean.
print(model.score(test_matrix, test_label))
print(np.mean(model.predict(test_matrix) == test_label))
0.7825 0.7825
# Variant: the vocabulary is built from the training and test texts together.
vec = CountVectorizer()
idf = TfidfTransformer()
# NOTE(review): fitting the vocabulary on train + test lets test-only words
# become features; kept because it appears to be the point of this variant,
# but it is not a clean held-out evaluation.
vec.fit(train_data + test_data)
# IDF weights are learned from the training split only and reused for the
# test split, so both matrices are weighted consistently.
train_matrix = idf.fit_transform(vec.transform(train_data))
test_matrix = idf.transform(vec.transform(test_data))
model = MultinomialNB()
model.fit(train_matrix, train_label)
# Accuracy computed two equivalent ways: estimator score and manual mean.
print(model.score(test_matrix, test_label))
print(np.mean(model.predict(test_matrix) == test_label))
0.78 0.78
# Pipeline from raw review text to prediction: counts -> tf-idf -> SGD classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
# Candidate values, keyed as <pipeline step name>__<parameter name>.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# Grid search with 5-fold cross-validation on the training reviews.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf.fit(train_data, train_label)
print("Mean cross-validated score of the best_estimator: ", gs_clf.best_score_)
print("best hyper-parameters configuration: ")
for param_name in sorted(parameters):
    print("{}:{}".format(param_name, gs_clf.best_params_[param_name]))
Mean cross-validated score of the best_estimator: 0.8324999999999999 best hyper-parameters configuration: clf__alpha:0.001 tfidf__use_idf:True vect__ngram_range:(1, 1)
# Accuracy of the grid-search result on the held-out reviews.
predicted = gs_clf.predict(test_data)
matches = predicted == test_label
print("@Best Model! Evaluate on test: ", np.mean(matches))
@Best Model! Evaluate on test: 0.8225