from sklearn.datasets import fetch_20newsgroups
# Seed reused by later cells for dataset shuffling and the SGD classifier.
random_state = 1234

# Load the training split of the 20-newsgroups corpus.
train_data = fetch_20newsgroups(
    subset="train",
    shuffle=True,
    random_state=random_state,
)

print("keys: ", train_data.keys(), "\n")

# List every category name next to its index in target_names.
category_names = train_data.target_names
print(f"categories({len(category_names)}): ")
for label_id, label_name in enumerate(category_names):
    print(f"{label_id}: ", label_name)

# Corpus size and the labels of the first ten documents.
print(f"\ntrain_data 中有文章 {len(train_data.data)} 篇。")
print("前10篇文章的 labels: ", train_data.target[:10])

keys:  dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR']) 

categories(20): 
0:  alt.atheism
1:  comp.graphics
2:  comp.os.ms-windows.misc
3:  comp.sys.ibm.pc.hardware
4:  comp.sys.mac.hardware
5:  comp.windows.x
6:  misc.forsale
7:  rec.autos
8:  rec.motorcycles
9:  rec.sport.baseball
10:  rec.sport.hockey
11:  sci.crypt
12:  sci.electronics
13:  sci.med
14:  sci.space
15:  soc.religion.christian
16:  talk.politics.guns
17:  talk.politics.mideast
18:  talk.politics.misc
19:  talk.religion.misc

train_data 中有文章 11314 篇。
前10篇文章的 labels:  [18 13  6  1 15 13 11  4  7  8]


# Introspect the dataset container and its `data` field: print the type and
# the attribute listing of each object.
for inspected in (train_data, train_data.data):
    print(type(inspected), dir(inspected), "\n")

# The first element shows that train_data.data is a list of strings.
print(train_data.data[:1])

<class 'sklearn.utils.Bunch'> ['DESCR', 'data', 'filenames', 'target', 'target_names'] 

<class 'list'> ['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort'] 

["From: mwalker@novell.com (Mel Walker)\nSubject: Re: Top Ten Ways Slick Willie Could Improve His Standing With Americans\nNntp-Posting-Host: mwalker.npd.provo.novell.com\nOrganization: Novell, Inc\nLines: 23\n\nIn article <C5KMz5.Hy4@newsserver.technet.sg>, ipser@solomon.technet.sg (Ed\nIpser) wrote:\n> \n> \n> Top Ten Ways Slick Willie Could Improve His Standing With Americans\n> \n> \n[deleted for a very good reason which I'm sure you can guess]\n>\n\n0. Enact a law that bans people without a sense of humor from\n   posting allegedly humorous items. If he did this, I think\n   his approval rating would go through the roof!\n\n> Copyright (c) Edward A. Ipser, Jr., 1993\n\nThis means we can't quote Ed without his permission. No using these lists\nin your .sigs, folks!\n\n----------------------------------------------------------------\nMel Walker                                    mwalker@novell.com\nAll opinions expressed are of the author.\nNovell, Inc. is not responsible for the content of this article.\n"]


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Tokenize the training documents and count token occurrences.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)

# Preview the first ten vocabulary terms. The number printed next to each
# term is its position in the dict's iteration order, not the feature index
# stored as the dict value in vocabulary_.
print("part of vocabulary: ")
first_terms = list(count_vect.vocabulary_)[:10]
print(list(enumerate(first_terms)))
print("shape of tf-count: ", X_train_counts.shape)

# Term frequency only (IDF weighting switched off).
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print("Shape of tf-vector: ", X_train_tf.shape)

# Full tf-idf weighting, fitted on the training counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print("Shape of tf-idf vector: ", X_train_tfidf.shape)

part of vocabulary: 
[(0, 'from'), (1, 'mwalker'), (2, 'novell'), (3, 'com'), (4, 'mel'), (5, 'walker'), (6, 'subject'), (7, 're'), (8, 'top'), (9, 'ten')]
shape of tf-count:  (11314, 130107)
Shape of tf-vector:  (11314, 130107)
Shape of tf-idf vector:  (11314, 130107)


from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the tf-idf training features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_data.target)

# Classify two hand-written sentences, reusing the vectorizer and the tf-idf
# transformer that were fitted on the training set.
toy_examples = ['I love Machine Learning!', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(toy_examples)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

print("Predict on toy-examples: ")
for sentence, label_id in zip(toy_examples, predicted):
    print(f"{sentence!r} => {train_data.target_names[label_id]}")

Predict on toy-examples: 
'I love Machine Learning!' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


import numpy as np
# Load the test split and pass it through the vectorizer and the tf-idf
# transformer that were fitted on the training data (transform only).
test_data = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    random_state=random_state,
)
test_counts = count_vect.transform(test_data.data)
test_features = tfidf_transformer.transform(test_counts)

# Accuracy: fraction of test documents whose predicted label is correct.
predicted = clf.predict(test_features)
nb_accuracy = np.mean(predicted == test_data.target)
print("@naive Bayes! Evaluate on test: ", nb_accuracy)

@naive Bayes! Evaluate on test:  0.7738980350504514


from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent; hinge loss
# with an L2 penalty gives the max-margin model evaluated below.
mm_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=random_state,
    max_iter=5,
    tol=None,
)
mm_clf.fit(X_train_tfidf, train_data.target)

predicted = mm_clf.predict(test_features)
sgd_accuracy = np.mean(predicted == test_data.target)
print("@max-margin! Evaluate on test: ", sgd_accuracy)

# Report the test result with precision/recall/f1-score and a confusion matrix.
from sklearn import metrics

# Text report of the main per-class classification metrics.
report = metrics.classification_report(
    test_data.target,
    predicted,
    target_names=test_data.target_names,
)
print(report)

# Only the first three rows of the confusion matrix are printed.
confusion = metrics.confusion_matrix(test_data.target, predicted)
print(confusion[:3])

@max-margin! Evaluate on test:  0.8248805098247477
                          precision    recall  f1-score   support

             alt.atheism       0.73      0.71      0.72       319
           comp.graphics       0.79      0.71      0.75       389
 comp.os.ms-windows.misc       0.72      0.80      0.76       394
comp.sys.ibm.pc.hardware       0.74      0.69      0.71       392
   comp.sys.mac.hardware       0.83      0.82      0.83       385
          comp.windows.x       0.85      0.76      0.80       395
            misc.forsale       0.84      0.89      0.87       390
               rec.autos       0.92      0.89      0.91       396
         rec.motorcycles       0.91      0.97      0.94       398
      rec.sport.baseball       0.89      0.90      0.89       397
        rec.sport.hockey       0.87      0.99      0.93       399
               sci.crypt       0.84      0.96      0.90       396
         sci.electronics       0.83      0.62      0.71       393
                 sci.med       0.87      0.85      0.86       396
               sci.space       0.84      0.96      0.89       394
  soc.religion.christian       0.75      0.94      0.83       398
      talk.politics.guns       0.70      0.92      0.79       364
   talk.politics.mideast       0.91      0.92      0.92       376
      talk.politics.misc       0.90      0.55      0.68       310
      talk.religion.misc       0.84      0.39      0.54       251

                accuracy                           0.82      7532
               macro avg       0.83      0.81      0.81      7532
            weighted avg       0.83      0.82      0.82      7532

[[226   0   0   1   0   2   1   0   1   3   1   2   1  11   5  44   2   8
    1  10]
 [  2 277  21   9   8  24   3   1   2   4   3   9   5   3   9   2   2   4
    0   1]
 [  1  10 315  20   8  11   2   0   0   6   1   7   1   1   7   2   0   1
    0   1]]


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# define a pipeline from extraction of features to model building.
# Chain feature extraction and the classifier so the whole workflow can be
# fitted and tuned as a single estimator.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

# Candidate values, keyed as "<pipeline step>__<parameter name>".
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Grid search with 5-fold cross-validation on the raw training documents.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf.fit(train_data.data, train_data.target)

print("Mean cross-validated score of the best_estimator: ", gs_clf.best_score_)
print("best hyper-parameters configuration: ")
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print(f"{param_name}: {best_params[param_name]!r}")
print("\n\n\n")

# Try the tuned pipeline on the toy sentences (raw text goes straight in).
predicted = gs_clf.predict(toy_examples)
print("Predict on toy-examples: ")
for sentence, label_id in zip(toy_examples, predicted):
    print(f"{sentence!r} => {train_data.target_names[label_id]}")

# Final evaluation of the best model on the test split.
predicted = gs_clf.predict(test_data.data)
best_accuracy = np.mean(predicted == test_data.target)
print("\n\n\n@Best Model! Evaluate on test: ", best_accuracy)

Mean cross-validated score of the best_estimator:  0.9040120291327873
best hyper-parameters configuration: 
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


Predict on toy-examples: 
'I love Machine Learning!' => misc.forsale
'OpenGL on the GPU is fast' => rec.autos


@Best Model! Evaluate on test:  0.8353690918746681


import os
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# x collects the raw review texts (appended to by create());
# y holds the labels: 1000 "neg" entries followed by 1000 "pos" entries.
x = list()
y = [label for label in ("neg", "pos") for _ in range(1000)]

def findFile(path):
    """Yield the path of every entry in directory *path*, joined onto *path*.

    Entries come from ``os.listdir`` and are yielded in the order it returns
    them; nothing is filtered out.
    """
    yield from (os.path.join(path, entry) for entry in os.listdir(path))
        
def create(path):
    """Read every file under *path* as UTF-8 text and append it to ``x``.

    The files are the ones produced by ``findFile(path)``. Each file that is
    read successfully adds one string to the module-level list ``x``; a file
    whose content cannot be read or decoded is skipped and counted as an
    error. A one-line summary with both counts is printed at the end.

    NOTE: a skipped file makes ``x`` shorter than a label list built for a
    fixed number of documents, so a non-zero error count should be
    investigated before training.
    """
    com_cnt = 0  # files read successfully
    err_cnt = 0  # files skipped because reading or decoding failed
    for file in findFile(path):
        with open(file, "r", encoding="UTF-8") as f:
            try:
                text = f.read()
            except (UnicodeDecodeError, OSError):
                # Swallow only read/decoding failures; a bare ``except`` would
                # also hide KeyboardInterrupt and unrelated programming errors.
                err_cnt += 1
            else:
                x.append(text)
                com_cnt += 1
    print("正确读取文件:{}   错误读取文件:{}".format(com_cnt, err_cnt))
    
    
# Load both classes, negatives first, so the order of the texts in x matches
# the label order in y. os.path.join keeps the paths valid on non-Windows
# systems, where a backslash is not a path separator.
create(os.path.join("review_polarity", "neg"))
create(os.path.join("review_polarity", "pos"))

# Use 80% of the reviews for training and keep the rest for testing.
# NOTE: no random_state is passed, so the split differs from run to run.
train_data, test_data, train_label, test_label = train_test_split(x, y, train_size=0.8)
#print(test_label)

正确读取文件:1000   错误读取文件:0
正确读取文件:1000   错误读取文件:0


# Fit the vocabulary and the IDF weights on the training reviews only.
vec = CountVectorizer()
idf = TfidfTransformer()
train_matrix = idf.fit_transform(vec.fit_transform(train_data))

# The test reviews must only be *transformed*. Re-fitting `vec` would build a
# different vocabulary (dimension mismatch later), and re-fitting `idf` would
# weight the test features with IDF values the model was never trained on.
test_matrix = idf.transform(vec.transform(test_data))

model = MultinomialNB()
model.fit(train_matrix, train_label)

# Both lines report the same quantity: accuracy on the held-out reviews.
print(model.score(test_matrix, test_label))
print(np.mean(model.predict(test_matrix) == test_label))

0.7825
0.7825


vec = CountVectorizer()
idf = TfidfTransformer()

# Variant of the previous cell: the vocabulary is built from the training
# AND the test reviews.
# NOTE(review): this exposes test-set vocabulary while building features,
# which is a mild form of data leakage; it is kept because comparing against
# the train-only vocabulary appears to be the point of this cell.
vec.fit(train_data + test_data)

# IDF weights are learned on the training matrix only and then reused for
# the test matrix; re-fitting them on the test set would put train and test
# features on different scales.
train_matrix = idf.fit_transform(vec.transform(train_data))
test_matrix = idf.transform(vec.transform(test_data))

model = MultinomialNB()
model.fit(train_matrix, train_label)

# Both lines report the same quantity: accuracy on the held-out reviews.
print(model.score(test_matrix, test_label))
print(np.mean(model.predict(test_matrix) == test_label))

0.78
0.78


# Vectorizer -> tf-idf -> SGD classifier, wrapped as one estimator so the
# grid search can tune every stage together.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

# Candidate values, keyed as "<pipeline step>__<parameter name>".
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Grid search with 5-fold cross-validation on the training reviews.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf.fit(train_data, train_label)

print("Mean cross-validated score of the best_estimator: ", gs_clf.best_score_)
print("best hyper-parameters configuration: ")
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print(f"{param_name}:{best_params[param_name]}")

Mean cross-validated score of the best_estimator:  0.8324999999999999
best hyper-parameters configuration: 
clf__alpha:0.001
tfidf__use_idf:True
vect__ngram_range:(1, 1)


# Score the tuned pipeline on the held-out reviews: accuracy is the fraction
# of predictions that equal the true label.
predicted = gs_clf.predict(test_data)
test_accuracy = np.mean(predicted == test_label)
print("@Best Model! Evaluate on test: ", test_accuracy)

@Best Model! Evaluate on test:  0.8225

（一）20-newsgroups 数据集

利用自省函数

（二）特征提取

一些说明：

（三）朴素贝叶斯

（四）模型评估

（五）随机梯度下降的线性分类器

（六）基于网格搜索的超参数优化

（七）作业代码

1. 预处理数据

2. 构建文本分类器

3. 使用网格搜索

4. 评估模型在测试集上性能