Following the exploratory analysis of Lianjia (链家) second-hand housing transaction records, I ran several rounds of machine-learning price prediction. A few conclusions up front:
- Feature engineering and data quality set a model's ceiling; all later tuning merely approaches that ceiling. Many self-styled data scientists talk a good game but do a poor job of data preprocessing. After my preprocessing, the models not only scored high but also looked strong on every evaluation metric and test.
- Data standardization does not suit every model: several linear models scored extremely low, or even negative (LinearRegression), on standardized data, and training time increased substantially.
Models before data standardization
[Figure: model scores on the raw data, before standardization]
The best performers are Bagging, DecisionTree, and ExtraTree, with scores between 0.91 and 0.97 and very stable results on the test set (I verified this against many randomly chosen test samples). Next, the evaluation metrics (from left to right: 'LGBMRegressor', 'LinearRegression', 'KNNRegressor', 'Ridge', 'Lasso', 'MLPRegressor', 'DecisionTree', 'ExtraTree', 'XGBoost', 'RandomForest', 'AdaBoost', 'GradientBoost', 'Bagging'):
[Figures: evaluation metrics for each model on the raw data]
Data standardization
Many online articles on data preprocessing are vague about the details, and most do not use production data, so they can be quite misleading; I fell into that pit more than once.
https://zhuanlan.zhihu.com/p/80428843
A lot of the standardization and normalization code I found was ambiguous, and at first my standardized data was simply wrong: the scores kept coming out negative. English-language write-ups are more reliable on this topic, with much clearer logic.
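For reference, a minimal sketch of the conventional scikit-learn pattern, producing the *_stand and *_norm variables used in the script below (the X_train/X_test split itself is assumed to exist already): fit the scaler on the training split only and reuse its statistics on the test split. Fitting on the test data, or on the whole set before splitting, leaks information and is a common source of wrong scores.

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# standardization: zero mean and unit variance per feature
scaler = StandardScaler().fit(X_train)        # fit on the training split only
X_train_stand = scaler.transform(X_train)
X_test_stand = scaler.transform(X_test)       # reuse the training statistics

# normalization: rescale every feature into [0, 1]
min_max = MinMaxScaler().fit(X_train)
X_train_norm = min_max.transform(X_train)
X_test_norm = min_max.transform(X_test)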
Training multiple models in one pass
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor,
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import (explained_variance_score, r2_score,
                             median_absolute_error, mean_squared_log_error,
                             mean_absolute_error, mean_squared_error)
import matplotlib.pyplot as plt  # needed for all plots below
import time
import random
# import multiprocessing
# raw, normalized and standardized training and testing data
trainX = [X_train, X_train_norm, X_train_stand]
testX = [X_test, X_test_norm, X_test_stand]
trainX_type = ['X_train', 'X_train_norm', 'X_train_stand']
# SVR was too slow, so it is left out for now
# LinearRegression(n_jobs=-1),
models = [LGBMRegressor(n_jobs=-1), KNeighborsRegressor(n_jobs=-1), Ridge(),
          Lasso(), MLPRegressor(), DecisionTreeRegressor(), ExtraTreeRegressor(),
          XGBRegressor(nthread=-1),
          RandomForestRegressor(n_estimators=50,
                                max_features='auto',
                                oob_score=True,
                                criterion='mse',
                                max_depth=10,
                                n_jobs=-1,
                                random_state=2,
                                min_samples_leaf=2),
          AdaBoostRegressor(), GradientBoostingRegressor(),
          BaggingRegressor(n_jobs=-1)]
# 'LinearRegression',
models_str = ['LGBMRegressor', 'KNNRegressor', 'Ridge', 'Lasso', 'MLPRegressor',
              'DecisionTree', 'ExtraTree', 'XGBoost', 'RandomForest',
              'AdaBoost', 'GradientBoost', 'Bagging']
# containers for scores, predictions, timings and names per data variant
score_, predicts, times, names = [], [], [], []
norm_score_, norm_predicts, norm_times, norm_names = [], [], [], []
stand_score_, stand_predicts, stand_times, stand_names = [], [], [], []
# model fitting and scoring on each of the three data variants
for i in range(len(trainX_type)):
    for name, model in zip(models_str, models):
        t1 = time.time()
        print('Training model: ' + name)
        try:
            if name in ['Ridge', 'Lasso', 'MLPRegressor', 'DecisionTree',
                        'ExtraTree', 'AdaBoost', 'GradientBoost']:
                # these estimators have no n_jobs parameter; wrap them so the
                # target columns are fitted in parallel (requires a 2-D y)
                model = MultiOutputRegressor(model, n_jobs=-1)
            model.fit(trainX[i], y_train)
        except Exception as e:
            print(i, name, e)
            continue  # skip scoring if training failed
        # predict and score on the matching test variant
        pred = model.predict(testX[i])
        score = model.score(testX[i], y_test)
        print(str(i) + name + ' score: ' + str(score))
        if trainX_type[i] == 'X_train':
            names.append(name)
            predicts.append(pred)
            score_.append(round(score, 4))
            t2 = time.time()
            times.append(t2 - t1)
            print('X_train' + name + ' took {} seconds'.format(t2 - t1))
        elif trainX_type[i] == 'X_train_norm':
            norm_names.append(name)
            norm_predicts.append(pred)
            norm_score_.append(round(score, 4))
            t2 = time.time()
            norm_times.append(t2 - t1)
            print('X_train_norm' + name + ' took {} seconds'.format(t2 - t1))
        elif trainX_type[i] == 'X_train_stand':
            stand_names.append(name)
            stand_predicts.append(pred)
            stand_score_.append(round(score, 4))
            t2 = time.time()
            stand_times.append(t2 - t1)
            print('X_train_stand' + name + ' took {} seconds'.format(t2 - t1))
import seaborn as sns
# fix Chinese characters not rendering in matplotlib
from matplotlib.font_manager import FontProperties
myfont = FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf', size=14)
sns.set(font=myfont.get_name())
# pd.options.display.max_columns = None  # show all columns
# plt.rcParams["font.family"] = "STSong"
# plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# MAE, mean absolute error
## How close the predictions are to the true values; smaller means a better fit.
# MSE, mean squared error
## Mean of the squared per-sample errors between fitted and original data; smaller is better.
# MedianAE, median absolute error
## Robust to outliers in the data set; smaller is better.
# EVS, explained variance score
## How much of the target's variance the features explain; usual range [0, 1],
## the closer to 1 the better (negative for a fit worse than predicting the mean).
# R2, coefficient of determination (goodness of fit)
## Also an explained-variance measure; usual range [0, 1], the closer to 1 the
## better, and likewise negative for a fit worse than predicting the mean.
def metrics_calc(modelPredict, mae_, mse_, msle_, evs_, median_ae_, r2_):
    # compute the evaluation metrics for every model's predictions
    for pre, name in zip(modelPredict, models_str):
        print(name, ':')
        MAE = mean_absolute_error(y_test, pre)
        MSE = mean_squared_error(y_test, pre)
        try:
            MSLE = mean_squared_log_error(y_test, pre)
        except Exception as e:
            # MSLE is undefined when predictions contain negative values
            print(e, name)
            MSLE = 0
        EVS = explained_variance_score(y_test, pre)
        Median_AE = median_absolute_error(y_test, pre)
        R2 = r2_score(y_test, pre)
        mae_.append(MAE)
        mse_.append(MSE)
        msle_.append(MSLE)
        evs_.append(EVS)
        median_ae_.append(Median_AE)
        r2_.append(R2)
        print('\tMAE\t\t{}\n\tMSE\t\t{}\n\tMSLE\t\t{}\n\tMedianAE\t{}\n\tEVS\t\t{}\n\tR2\t\t{}'.format(
            MAE, MSE, MSLE, Median_AE, EVS, R2))
    return (mae_, mse_, msle_, evs_, median_ae_, r2_)
# fig.tight_layout()  # adjust overall whitespace
# The numbers alone are cold; let's look at some comparison plots
def barplotting(data_type, y_plot, title, plotnum):
    # bar chart of one evaluation metric for every model
    plt.figure(figsize=(16, 8), dpi=200)
    plt.subplot(plotnum)
    # plt.ylim(0.004, 0.036)
    plt.title(data_type + title, fontsize=20)
    plt.bar(x_name, y_plot)
    # plt.subplots_adjust(wspace=0, hspace=0.3)  # adjust subplot spacing
    plt.xticks(rotation=25, fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.tick_params(axis='both', which='minor', labelsize=10)
    # save before show(): saving after the window is closed yields a blank image
    plt.savefig(data_type + '_' + 'model_evaluation_' + title + '.jpg')
    plt.show()
    plt.pause(2)
    plt.close()
def comparision_plot_predict_real(datatype, modelPredict, models_str, model_score):
    # plot predicted against real values for every model
    for pre, name, score in zip(modelPredict, models_str, model_score):
        print(pre, name)
        plt.figure(figsize=(16, 8))  # canvas size
        # 50 distinct random test indices; random.sample keeps them in range
        test_index = sorted(random.sample(range(len(y_test)), 50))
        x_ = test_index
        plt.plot(x_, y_test.iloc[test_index], label='test_target')  # true values
        plt.plot(x_, pre[test_index], label='trained_prediction')   # predictions
        plt.legend(loc='upper right', fontsize=15)
        plt.title(datatype + '_' + 'prediction_vs_real_sales_Price_' + name +
                  '_score_' + str(score), fontsize=30)
        plt.savefig(datatype + '_' + 'prediction_vs_real_sales_Price_' + name + '.jpg')
        plt.show()
        plt.pause(5)
        plt.clf()
        plt.close()
if __name__ == '__main__':
    # metric containers for raw, standardized and normalized data
    mae, mse, msle, median_ae, evs, r2 = [], [], [], [], [], []
    stand_mae, stand_mse, stand_msle, stand_median_ae, stand_evs, stand_r2 = [], [], [], [], [], []
    norm_mae, norm_mse, norm_msle, norm_median_ae, norm_evs, norm_r2 = [], [], [], [], [], []
    x_name = models_str
    metrics_calc(predicts, mae, mse, msle, evs, median_ae, r2)
    print('6 metrics:', mae, mse, msle, evs, median_ae, r2)
    metrics_calc(norm_predicts, norm_mae, norm_mse, norm_msle, norm_evs, norm_median_ae, norm_r2)
    print('6 metrics:', norm_mae, norm_mse, norm_msle, norm_evs, norm_median_ae, norm_r2)
    metrics_calc(stand_predicts, stand_mae, stand_mse, stand_msle, stand_evs, stand_median_ae, stand_r2)
    print('6 metrics:', stand_mae, stand_mse, stand_msle, stand_evs, stand_median_ae, stand_r2)
    for data_type in ['raw', 'standardized', 'normalized']:
        if data_type == 'raw':
            print(data_type)
            metric_mae, metric_mse, metric_msle = mae, mse, msle
            metric_median_ae, metric_evs, metric_r2 = median_ae, evs, r2
            runtime = times
        elif data_type == 'standardized':
            print(data_type)
            metric_mae, metric_mse, metric_msle = stand_mae, stand_mse, stand_msle
            metric_median_ae, metric_evs, metric_r2 = stand_median_ae, stand_evs, stand_r2
            runtime = stand_times
        elif data_type == 'normalized':
            print(data_type)
            metric_mae, metric_mse, metric_msle = norm_mae, norm_mse, norm_msle
            metric_median_ae, metric_evs, metric_r2 = norm_median_ae, norm_evs, norm_r2
            runtime = norm_times
    comparision_plot_predict_real('raw', predicts, models_str, score_)
    comparision_plot_predict_real('stand', stand_predicts, models_str, stand_score_)
    comparision_plot_predict_real('norm', norm_predicts, models_str, norm_score_)
    barplotting('raw', y_plot=mae, title='MAE', plotnum=111)
    barplotting('raw', y_plot=mse, title='MSE', plotnum=111)
    barplotting('raw', y_plot=median_ae, title='MedianAE', plotnum=111)
    barplotting('raw', y_plot=evs, title='EVS', plotnum=111)
    barplotting('raw', y_plot=r2, title='R-Square', plotnum=111)
    barplotting('stand', y_plot=stand_mae, title='MAE', plotnum=111)
    barplotting('stand', y_plot=stand_mse, title='MSE', plotnum=111)
    barplotting('stand', y_plot=stand_median_ae, title='MedianAE', plotnum=111)
    barplotting('stand', y_plot=stand_evs, title='EVS', plotnum=111)
    barplotting('stand', y_plot=stand_r2, title='R-Square', plotnum=111)
    barplotting('norm', y_plot=norm_mae, title='MAE', plotnum=111)
    barplotting('norm', y_plot=norm_mse, title='MSE', plotnum=111)
    barplotting('norm', y_plot=norm_median_ae, title='MedianAE', plotnum=111)
    barplotting('norm', y_plot=norm_evs, title='EVS', plotnum=111)
    barplotting('norm', y_plot=norm_r2, title='R-Square', plotnum=111)
    # compare every model's runtime on raw, standardized and normalized data
    plt.figure(figsize=(16, 8))  # canvas size
    plt.plot(x_name, times, label='runtime_raw_data')
    plt.plot(x_name, stand_times, label='runtime_standardized_data')
    plt.plot(x_name, norm_times, label='runtime_normalized_data')
    plt.legend(loc='upper right', fontsize=20)
    plt.title('runtime_raw_standardized_normalized', fontsize=30)
    plt.xticks(rotation=25, fontsize=20)
    plt.savefig('runtime_raw_standardized_normalized.jpg')
    plt.show()
    plt.pause(2)
Note that not every model supports n_jobs=-1 to use all cores; that is where MultiOutputRegressor comes in handy, and the parallel training cuts the time substantially. Here, 11 models times 3 data variants (600k rows by 87 columns), 33 fits in total, finish in a bit over an hour on an i3-8100. Of course this is not 100% parallel; reading the data certainly is not, and watching CPU utilization shows it is not pinned at 100% the whole time.
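A minimal sketch of that pattern, assuming a 2-D target (MultiOutputRegressor parallelizes across target columns, so it only helps when y has more than one output column); GradientBoostingRegressor here stands in for any estimator that lacks an n_jobs parameter:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

# GradientBoostingRegressor has no n_jobs of its own; the wrapper fits one
# copy per target column, and the columns are trained in parallel
parallel_gb = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=-1)
parallel_gb.fit(X_train, y_train)   # y_train must be 2-D: (n_samples, n_targets)
pred = parallel_gb.predict(X_test)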
If you want true multiprocessing, note that Python's multiprocessing module does not work properly in Jupyter Notebook on Windows (Linux is fine). On Windows, even guarding the entry point with if __name__ == '__main__' still raises a runtime error inside the notebook; the workaround is to download the notebook as a .py script and run it directly. By contrast, a Jupyter Notebook running on Linux has no such problem.
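A minimal, self-contained sketch of the script-level pattern (the worker function is purely illustrative): the worker must live at module top level, and the pool must be created under the __main__ guard, because Windows spawns fresh interpreter processes that re-import the module.

import multiprocessing as mp

def square(x):
    # workers must be top-level functions so Windows 'spawn' can import them by name
    return x * x

if __name__ == '__main__':  # required on Windows; run as a .py script, not in a notebook
    with mp.Pool(processes=4) as pool:
        print(pool.map(square, range(10)))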
Strangely enough, the results got worse after standardization and normalization: only a few models kept the same score, while the rest deteriorated or went negative. I had expected at least a few models to improve. I'll look into it when I have time; it is probably specific to my data.
Max depth versus training and testing error
As a decision tree's maximum depth grows, its fitting capacity keeps increasing. Once the max depth is too large, the tree fits not only the true signal in the training samples but also their noise, i.e. it overfits: predictions jitter heavily and generalization suffers. For each of the models below you can sweep the max depth to guard against overfitting, as the sketch below shows. Please ignore the mislabeled figure titles (the part before the underscore is the correct title):
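A minimal sketch of such a sweep, assuming the X_train/X_test splits from above: fit one DecisionTreeRegressor per candidate depth and compare train and test scores; the gap that opens up at large depths is the overfitting described here.

import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

depths = range(1, 21)
train_scores, test_scores = [], []
for depth in depths:
    tree = DecisionTreeRegressor(max_depth=depth, random_state=2).fit(X_train, y_train)
    train_scores.append(tree.score(X_train, y_train))  # keeps rising with depth
    test_scores.append(tree.score(X_test, y_test))     # peaks, then drops as the tree overfits

plt.plot(depths, train_scores, label='train')
plt.plot(depths, test_scores, label='test')
plt.xlabel('max_depth')
plt.legend()
plt.show()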
[Figures: training versus testing error as a function of max depth for each model]
Boosting Type / Boosting Algorithm
[Figure: score comparison of the three boosting types]
The relative strengths of the three boosting algorithms are immediately apparent, and the default is already a good choice! For parameter tuning, see the two references below and the sketch that follows them:
How to Develop a Light Gradient Boosted Machine (LightGBM) Ensemble
Gradient Boosting with Scikit-Learn, XGBoost, LightGBM, and CatBoost
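A minimal sketch of such a comparison, assuming it is LightGBM's boosting_type being varied (as in the figure above) and the same splits as before; 'gbdt' is the default (note that 'goss' moved to the data_sample_strategy parameter in LightGBM >= 4.0):

from lightgbm import LGBMRegressor

# compare the three boosting algorithms on identical data
for boosting_type in ['gbdt', 'dart', 'goss']:
    lgbm = LGBMRegressor(boosting_type=boosting_type, n_jobs=-1)
    lgbm.fit(X_train, y_train)
    print(boosting_type, 'test score:', round(lgbm.score(X_test, y_test), 4))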
Bagging: n_estimators (number of trees) versus training and testing error
[Figure: Bagging training and testing error as a function of n_estimators]
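A minimal sketch of that sweep under the same assumed splits: increase n_estimators and watch the test score stabilize once there are enough trees.

from sklearn.ensemble import BaggingRegressor

for n in [5, 10, 25, 50, 100]:
    bag = BaggingRegressor(n_estimators=n, n_jobs=-1, random_state=2).fit(X_train, y_train)
    print(n, 'train:', round(bag.score(X_train, y_train), 4),
          'test:', round(bag.score(X_test, y_test), 4))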