# 实现决策树桩(decision stump)学习器的表示;
# 实现决策树桩的学习过程,并定义好接口(API)供 AdaBoost 使用;
# 使用决策树桩作为基学习器(弱学习器),实现 AdaBoost 算法以构建强学习器。
import numpy as np
from pprint import pprint
# Toy training set: 10 samples (rows), each with 3 integer-valued features
# (columns).
X = np.array([
    [0, 1, 3],
    [0, 3, 1],
    [1, 2, 2],
    [1, 1, 3],
    [1, 2, 3],
    [0, 1, 2],
    [1, 1, 2],
    [1, 1, 1],
    [1, 3, 1],
    [0, 2, 1],
])
# Class label of each sample, encoded as -1 / +1 (same row order as X).
y = np.array([-1, -1, -1, -1, -1, -1, 1, 1, -1, -1])
class stump:
    """Decision stump: a single-feature threshold classifier (weak learner).

    A fitted stump looks at one feature column, compares it with a threshold
    and labels the samples on one side of the threshold -1 and all other
    samples +1.

    Attributes:
        d: index of the feature column used for the split (None until fit).
        t: threshold applied to that column (None until fit).
        m: split mode; 'l' labels samples with value <= t as -1, any other
            value labels samples with value > t as -1.
    """

    def __init__(self):
        self.d = None  # feature index of the split
        self.t = None  # threshold of the split
        self.m = None  # split mode, 'l' or 'r'

    def __repr__(self):
        # Makes printed lists of stumps readable instead of showing addresses.
        return "stump(d={}, t={}, m={!r})".format(self.d, self.t, self.m)

    @staticmethod  # usable without an instance
    def base_estimator(X, d, t, mode):
        """Classify the rows of X by thresholding column d at t.

        Args:
            X: 2-D array, one sample per row.
            d: index of the column to threshold.
            t: threshold value.
            mode: 'l' labels rows with X[:, d] <= t as -1; any other value
                labels rows with X[:, d] > t as -1.

        Returns:
            1-D float array with one entry per row of X, each -1.0 or 1.0.
        """
        ans = np.ones(np.shape(X)[0])  # start with every sample labelled +1
        if mode == 'l':
            ans[X[:, d] <= t] = -1.0
        else:
            ans[X[:, d] > t] = -1.0
        return ans

    def get_d(self):
        """Return the feature index of the fitted split."""
        return self.d

    def get_t(self):
        """Return the threshold of the fitted split."""
        return self.t

    def get_m(self):
        """Return the mode ('l' or 'r') of the fitted split."""
        return self.m

    def predict(self, X):
        """Classify the rows of X with the fitted split.

        Raises:
            ValueError: if fit() has not selected a split yet.
        """
        if self.d is None:
            raise ValueError("stump has not been fitted")
        return self.base_estimator(X, self.d, self.t, self.m)

    def fit(self, X, y, w, n_steps=None):
        """Select the split with the smallest weighted classification error.

        For every feature column, thresholds are tried on an evenly spaced
        grid that starts one step below the column minimum and ends at the
        column maximum, in both modes. The first candidate reaching the
        minimum error is kept and stored in d / t / m.

        Args:
            X: 2-D array, one sample per row.
            y: 1-D array of labels in {-1, +1}.
            w: 1-D array of sample weights; used only for this call.
            n_steps: number of intervals the value range of each column is
                divided into. Defaults to the number of feature columns,
                which reproduces the grid used before this argument existed.

        Returns:
            (sign, e_min): the predictions of the selected split on X and its
            weighted error. sign is None when X has no feature columns.

        Raises:
            ValueError: if n_steps is given and is smaller than 1.
        """
        m, n = np.shape(X)
        if n_steps is None:
            n_steps = n
        else:
            n_steps = int(n_steps)
            if n_steps < 1:
                raise ValueError("n_steps must be a positive integer")
        e_min = np.inf
        sign = None
        for i in range(n):
            column = X[:, i]
            range_min = column.min()
            step = (column.max() - range_min) / n_steps
            # j == -1 puts one threshold below every value of the column so
            # the "all samples on one side" splits are candidates as well.
            for j in range(-1, n_steps + 1):
                t = range_min + j * step
                for mode in ('l', 'r'):
                    predict_vals = self.base_estimator(X, i, t, mode)
                    # 1.0 where the prediction is wrong, 0.0 where it is right
                    err_arr = (predict_vals != y).astype(float)
                    weighted_error = np.dot(w, err_arr)
                    if weighted_error < e_min:
                        e_min = weighted_error
                        sign = predict_vals
                        self.d = i
                        self.t = t
                        self.m = mode
        return sign, e_min
class AdaBoost:
    """AdaBoost binary classifier that boosts decision stumps.

    Attributes:
        X: training samples, one per row.
        y: training labels in {-1, +1}.
        tol: training stops once the ensemble's training error rate drops
            below this value.
        max_iter: maximum number of boosting rounds.
        G: list of fitted weak classifiers, in training order.
        alpha: list of the weak classifiers' voting weights (parallel to G).
        w: current sample weights; starts uniform and sums to 1.
    """

    def __init__(self, X, y, tol=0.05, max_iter=10):
        self.X = X
        self.y = y
        self.tol = tol  # stopping threshold on the training error rate
        self.max_iter = max_iter
        self.G = []  # weak classifiers
        self.alpha = []  # weak classifier weights
        self.w = np.full((X.shape[0]), 1 / X.shape[0])

    def updata_w(self, alpha, predict):
        """Re-weight the samples after one boosting round.

        Samples whose label disagrees with `predict` gain weight and the
        others lose weight; the result is normalised to sum to 1.

        Args:
            alpha: voting weight of the weak classifier of this round.
            predict: that weak classifier's predictions on the training set.
        """
        P = self.w * np.exp(-alpha * self.y * predict)
        self.w = P / P.sum()

    def train(self):
        """Run boosting rounds until the ensemble is accurate enough.

        Each round fits a stump on the current sample weights, appends it and
        its voting weight to G / alpha, prints the sample weights used in
        that round, and stops early when the ensemble's training error rate
        falls below tol (printing the number of rounds).
        """
        margin = np.zeros(self.y.shape[0])  # weighted vote of all stumps so far
        eps = np.finfo(float).eps
        for i in range(self.max_iter):
            best_stump = stump()
            sign, error = best_stump.fit(self.X, self.y, self.w)
            # Keep the error strictly inside (0, 1): a perfect stump would
            # otherwise divide by zero and produce an infinite alpha.
            error = min(max(error, eps), 1 - eps)
            alpha = 0.5 * np.log((1 - error) / error)
            self.G.append(best_stump)
            self.alpha.append(alpha)
            # Training error rate of the current ensemble; a tied vote
            # (sign 0) matches neither label and counts as an error.
            margin += alpha * sign
            y_predict = np.sign(margin)
            error_rate = np.mean(y_predict != self.y)
            print("\n第{}次迭代的样本权重:".format(i+1))
            pprint(self.w)
            if error_rate < self.tol:  # stopping condition met
                print("\n总共迭代次数:", i + 1)
                break
            # AdaBoost re-weights with the prediction of the weak classifier
            # fitted in this round, not with the ensemble's prediction.
            self.updata_w(alpha, sign)

    def predict(self, X):
        """Predict labels for the rows of X by weighted vote of the stumps.

        Returns:
            1-D int array holding the sign of the weighted vote for each row
            (0 when the vote is tied or no classifier has been trained).
        """
        votes = np.zeros(np.shape(X)[0])
        for clf, alpha in zip(self.G, self.alpha):
            # add each weak classifier's vote, scaled by its weight
            votes += alpha * clf.base_estimator(
                X, clf.get_d(), clf.get_t(), clf.get_m())
        return np.sign(votes).astype(int)

    def score(self, X, y):
        """Return the fraction of rows of X whose prediction equals y.

        Also prints the list of weak classifiers.
        """
        y_predict = self.predict(X)
        # a tied prediction of 0 matches neither label and counts as an error
        error_rate = np.mean(y_predict != y)
        print("\n弱分类器:")
        pprint(self.G)
        return 1 - error_rate
# Fit the boosted ensemble on the toy data set, then evaluate it on the same
# samples it was trained on.
model = AdaBoost(X=X, y=y)
model.train()
y_predict = model.predict(X=X)
score = model.score(X=X, y=y)
# Debug aid: print y and y_predict here to compare the true labels with the
# predicted ones.
print("预测正确率:{:.2%}".format(score))
# 运行输出:
#
# 第1次迭代的样本权重:
# array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
#
# 第2次迭代的样本权重:
# array([0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.25 , 0.25 , 0.0625, 0.0625])
#
# 第3次迭代的样本权重:
# array([0.16666667, 0.03846154, 0.03846154, 0.16666667, 0.03846154, 0.16666667, 0.15384615, 0.15384615, 0.03846154, 0.03846154])
#
# 第4次迭代的样本权重:
# array([0.13188406, 0.03043478, 0.03043478, 0.13188406, 0.03043478, 0.13188406, 0.33043478, 0.12173913, 0.03043478, 0.03043478])
#
# 第5次迭代的样本权重:
# array([0.09937457, 0.02293259, 0.02293259, 0.34587511, 0.02293259, 0.09937457, 0.24898243, 0.09173037, 0.02293259, 0.02293259])
#
# 第6次迭代的样本权重:
# array([0.09251592, 0.02134983, 0.02134983, 0.32200345, 0.02134983, 0.09251592, 0.23179812, 0.08539931, 0.09036799, 0.02134983])
#
# 第7次迭代的样本权重:
# array([0.06729152, 0.01552881, 0.01552881, 0.23420943, 0.01552881, 0.20521726, 0.16859852, 0.06211525, 0.20045278, 0.01552881])
#
# 第8次迭代的样本权重:
# array([0.04584103, 0.0105787 , 0.0105787 , 0.1595506 , 0.0105787 , 0.13980025, 0.11485445, 0.0423148 , 0.45532406, 0.0105787 ])
#
# 总共迭代次数: 8
#
# 弱分类器:
# [<__main__.stump object at 0x00000221775E5820>,
#  <__main__.stump object at 0x0000022178B3DA00>,
#  <__main__.stump object at 0x0000022178B3D490>,
#  <__main__.stump object at 0x00000221775E5880>,
#  <__main__.stump object at 0x0000022178B3DBB0>,
#  <__main__.stump object at 0x0000022178B3DC10>,
#  <__main__.stump object at 0x0000022178B3D4C0>,
#  <__main__.stump object at 0x0000022178B3DE20>]
#
# 预测正确率:100.00%