import numpy as np
from pprint import pprint

# Toy training set: one row per sample, three integer features per row.
X = np.array([
    [0, 1, 3],
    [0, 3, 1],
    [1, 2, 2],
    [1, 1, 3],
    [1, 2, 3],
    [0, 1, 2],
    [1, 1, 2],
    [1, 1, 1],
    [1, 3, 1],
    [0, 2, 1],
])
# Class label (-1 or +1) for each row of X, in the same order.
y = np.array([-1, -1, -1, -1, -1, -1, 1, 1, -1, -1])


class stump:
    """Decision stump: a single-feature threshold classifier with outputs -1/+1.

    Attributes (all ``None`` until ``fit`` has run):
        d: index of the feature column the stump splits on.
        t: threshold the column is compared against.
        m: split mode. ``'l'`` labels rows with value <= t as -1; any other
           value labels rows with value > t as -1. Remaining rows are +1.
    """

    def __init__(self):
        self.d = None  # feature index of the chosen split
        self.t = None  # threshold of the chosen split
        self.m = None  # mode of the chosen split ('l' or 'r')

    def __repr__(self):
        # Makes printed lists of stumps show the learned split instead of a
        # bare memory address.
        return "stump(d={!r}, t={!r}, m={!r})".format(self.d, self.t, self.m)

    @staticmethod  # usable without an instance
    def base_estimator(X, d, t, mode):
        """Classify every row of ``X`` by thresholding column ``d`` at ``t``.

        Args:
            X: 2-D array indexed as ``X[:, d]``.
            d: column index to test.
            t: threshold value.
            mode: ``'l'`` marks rows with ``X[:, d] <= t`` as -1; anything
                else marks rows with ``X[:, d] > t`` as -1.

        Returns:
            1-D float array with one entry per row of ``X``, each -1.0 or 1.0.
        """
        ans = np.ones(np.shape(X)[0])  # start with every row labelled +1
        if mode == 'l':
            ans[X[:, d] <= t] = -1.0
        else:
            ans[X[:, d] > t] = -1.0
        return ans

    def get_d(self):
        """Return the feature index of the fitted split."""
        return self.d

    def get_t(self):
        """Return the threshold of the fitted split."""
        return self.t

    def get_m(self):
        """Return the mode of the fitted split."""
        return self.m

    def fit(self, X, y, w, n_steps=None):
        """Pick the split with the smallest weighted error by exhaustive search.

        For every column, candidate thresholds are ``min + j * step`` for
        ``j = -1 .. n_steps`` where ``step = (max - min) / n_steps``; both
        modes are tried at every threshold. The best split is stored in
        ``self.d``, ``self.t`` and ``self.m``.

        Args:
            X: 2-D array of shape (samples, features).
            y: 1-D array of labels compared element-wise with the -1/+1
                predictions.
            w: 1-D array of per-sample weights; only used for this fit.
            n_steps: number of grid intervals per feature. ``None`` (the
                default) keeps the historical behaviour of using the number
                of feature columns, which is coarse for data with few
                columns; pass a larger value for a finer search.

        Returns:
            Tuple ``(predictions, error)``: the chosen split's predictions on
            ``X`` and its weighted error ``dot(w, misclassified)``.

        Raises:
            ValueError: if ``n_steps`` is smaller than 1.
        """
        n_features = np.shape(X)[1]
        if n_steps is None:
            n_steps = n_features  # backward-compatible default
        n_steps = int(n_steps)
        if n_steps < 1:
            raise ValueError("n_steps must be at least 1")

        best_error = np.inf
        best_pred = None
        for d in range(n_features):
            column = X[:, d]
            lowest = column.min()
            step = (column.max() - lowest) / n_steps
            # j = -1 gives a threshold below the minimum, so the
            # "everything on one side" splits are candidates as well.
            for j in range(-1, n_steps + 1):
                t = lowest + j * step
                for mode in ('l', 'r'):
                    pred = self.base_estimator(X, d, t, mode)
                    # 1.0 where the prediction is wrong, 0.0 where it is right
                    missed = (pred != y).astype(float)
                    weighted_error = np.dot(w, missed)
                    # Strict '<' keeps the first split found among ties.
                    if weighted_error < best_error:
                        best_error = weighted_error
                        best_pred = pred
                        self.d = d
                        self.t = t
                        self.m = mode
        return best_pred, best_error


class AdaBoost:
    """AdaBoost binary classifier that combines decision stumps.

    Attributes:
        X, y: training samples and their -1/+1 labels, as given.
        tol: training stops once the ensemble's training error rate drops
            below this value.
        max_iter: upper bound on the number of boosting rounds.
        G: list of fitted weak classifiers (``stump`` instances).
        alpha: list of weights, one per weak classifier in ``G``.
        w: current per-sample weights; starts uniform.
    """

    def __init__(self, X, y, tol=0.05, max_iter=10):
        self.X = X
        self.y = y
        self.tol = tol            # stopping threshold on the training error
        self.max_iter = max_iter
        self.G = []               # weak classifiers
        self.alpha = []           # weak-classifier weights
        self.w = np.full((X.shape[0]), 1 / X.shape[0])

    @staticmethod
    def _compute_alpha(error, eps=1e-10):
        """Return ``0.5 * log((1 - error) / error)`` with ``error`` clamped.

        Clamping to ``[eps, 1 - eps]`` keeps the result finite when a weak
        classifier has a weighted error of exactly 0 (or 1), which would
        otherwise divide by zero.
        """
        error = min(max(error, eps), 1 - eps)
        return 0.5 * np.log((1 - error) / error)

    def updata_w(self, alpha, predict):
        """Re-weight the samples after one boosting round.

        Each weight is multiplied by ``exp(-alpha * y * predict)`` and the
        result is normalised to sum to 1, so samples where ``predict``
        disagrees with ``y`` gain weight when ``alpha`` is positive.
        (The method name is kept as-is for existing callers.)

        Args:
            alpha: weight of the weak classifier just fitted.
            predict: that weak classifier's -1/+1 predictions on ``self.X``.
        """
        P = self.w * np.exp(-alpha * self.y * predict)
        self.w = P / P.sum()

    def train(self):
        """Fit the ensemble on ``self.X`` / ``self.y``.

        Every call starts from scratch (empty ensemble, uniform weights) so
        that the running score used for the stopping test always matches the
        classifiers stored in ``self.G``. Prints the sample weights of each
        round and, on early stopping, the number of rounds used.
        """
        n_samples = self.X.shape[0]
        self.G = []
        self.alpha = []
        self.w = np.full(n_samples, 1 / n_samples)

        G = np.zeros(n_samples)  # running weighted sum of weak predictions
        for i in range(self.max_iter):
            best_stump = stump()
            sign, error = best_stump.fit(self.X, self.y, self.w)

            alpha = self._compute_alpha(error)
            self.G.append(best_stump)
            self.alpha.append(alpha)

            # Training error rate of the ensemble built so far. A tie
            # (score 0, hence sign 0) matches neither label and counts as
            # an error.
            G += alpha * sign
            y_predict = np.sign(G)
            error_rate = np.mean(y_predict != self.y)

            print("\n第{}次迭代的样本权重：".format(i+1))
            pprint(self.w)

            if error_rate < self.tol:  # good enough: stop boosting
                print("\n总共迭代次数:", i + 1)
                break
            # The AdaBoost update uses the predictions of the weak classifier
            # fitted in this round, not those of the whole ensemble.
            self.updata_w(alpha, sign)

    def predict(self, X):
        """Return the ensemble's prediction for every row of ``X``.

        Returns:
            1-D int array holding the sign of the alpha-weighted sum of the
            weak classifiers' outputs (0 where that sum is exactly zero).
        """
        G = np.zeros(np.shape(X)[0])
        for clf, alpha in zip(self.G, self.alpha):
            # add this weak classifier's weighted vote
            G += alpha * stump.base_estimator(
                X, clf.get_d(), clf.get_t(), clf.get_m())
        return np.sign(G).astype(int)

    def score(self, X, y):
        """Return the fraction of rows of ``X`` predicted equal to ``y``.

        Also prints the list of weak classifiers.
        """
        y_predict = self.predict(X)
        error_rate = np.mean(y_predict != y)

        print("\n弱分类器：")
        pprint(self.G)
        return 1 - error_rate


if __name__ == "__main__":
    # Run the demo only when executed as a script, so that importing this
    # module does not train a model or print anything.
    model = AdaBoost(X, y)
    model.train()

    # Evaluated on the same data the model was trained on, so the printed
    # figure is training accuracy.
    y_predict = model.predict(X)
    score = model.score(X, y)
    # Debug aids: uncomment to compare the labels with the predictions.
    # print("原始输出:", y)
    # print("预测输出:", y_predict)
    print("预测正确率：{:.2%}".format(score))

第1次迭代的样本权重：
array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])

第2次迭代的样本权重：
array([0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.25  , 0.25  ,
       0.0625, 0.0625])

第3次迭代的样本权重：
array([0.16666667, 0.03846154, 0.03846154, 0.16666667, 0.03846154,
       0.16666667, 0.15384615, 0.15384615, 0.03846154, 0.03846154])

第4次迭代的样本权重：
array([0.13188406, 0.03043478, 0.03043478, 0.13188406, 0.03043478,
       0.13188406, 0.33043478, 0.12173913, 0.03043478, 0.03043478])

第5次迭代的样本权重：
array([0.09937457, 0.02293259, 0.02293259, 0.34587511, 0.02293259,
       0.09937457, 0.24898243, 0.09173037, 0.02293259, 0.02293259])

第6次迭代的样本权重：
array([0.09251592, 0.02134983, 0.02134983, 0.32200345, 0.02134983,
       0.09251592, 0.23179812, 0.08539931, 0.09036799, 0.02134983])

第7次迭代的样本权重：
array([0.06729152, 0.01552881, 0.01552881, 0.23420943, 0.01552881,
       0.20521726, 0.16859852, 0.06211525, 0.20045278, 0.01552881])

第8次迭代的样本权重：
array([0.04584103, 0.0105787 , 0.0105787 , 0.1595506 , 0.0105787 ,
       0.13980025, 0.11485445, 0.0423148 , 0.45532406, 0.0105787 ])

总共迭代次数: 8

弱分类器：
[<__main__.stump object at 0x00000221775E5820>,
 <__main__.stump object at 0x0000022178B3DA00>,
 <__main__.stump object at 0x0000022178B3D490>,
 <__main__.stump object at 0x00000221775E5880>,
 <__main__.stump object at 0x0000022178B3DBB0>,
 <__main__.stump object at 0x0000022178B3DC10>,
 <__main__.stump object at 0x0000022178B3D4C0>,
 <__main__.stump object at 0x0000022178B3DE20>]
预测正确率：100.00%

题目要求

基本思想