{r setup, include=FALSE} knitr::opts_chunk$set(eval = FALSE)
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
本文于r format(Sys.Date(), "%Y-%m-%d")更新。 如发现问题或者有建议,欢迎提交 Issue
目前的排名在top 6%。

前言
计划
- 标准化
- 箱型图、
fct_lump异常值处理,- 出现次数在1%以下的情况
fct_lump分箱使用方法
新增
- 更新 9 参考资料
- 9 参考资料
- 8 集成
- 最新结果的总结,
reg:gammain Python - 5 Pool Quality 和 Pool Area 相关
- 4 y 做 Power Transformation
- 3.2 Importances plot
- 2 EDAEDA,更新完毕
- 3 naive xgboost
- 2.2 dashboard 增加字段解释
- 2.3 y的比例
- 1.2.1 使用方式
EDA
{r} get_path <- function(x){file.path(getwd(),"required_data",x)} train <- read_csv(get_path("train.csv")) %>% rename(v1stFlrSF = `1stFlrSF`, v2ndFlrSF = `2ndFlrSF`, v3SsnPorch = `3SsnPorch`) test <- read_csv(get_path("test.csv")) %>% rename(v1stFlrSF = `1stFlrSF`, v2ndFlrSF = `2ndFlrSF`, v3SsnPorch = `3SsnPorch`) eda_data <- train %>% bind_rows(test)
变量进行了重命名 1。
Overview
EDA 主要参考 SmartEDA 包 测评和 Tidyverse使用技巧。
{r} ExpData(data=train,type=1) %>% datatable() ExpData(data=train,type=2) %>% datatable()
连续变量
table
{r} ExpNumStat(train,by="A",gp="SalePrice",Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2) %>% mutate_at(vars(Per_of_Missing),percent) %>% datatable()
Vname– Variable name - 变量名称Group– Target variable -TN– Total sample (inculded NA observations) - 样本总数nNeg– Total negative observations - 负样本数量nZero– Total zero observations - 零值数量nPos– Total positive observations - 正样本数量NegInf– Negative infinite count - 负无穷大极值PosInf– Positive infinite count - 正无穷大极值NA_value– Not Applicable count - 缺失值Per_of_Missing– Percentage of missings - 缺失率Min– minimum value - 最小值Max– maximum value - 最大值Mean– average value - 平均值Median– median value - 中位数SD– Standard deviation - 总体标准差CV– coefficient of variations (SD/mean)*100 - z scoreIQR– Inter quartile range - 四分位距 $QD = \frac{Q3-Q1}{2}$Qnt– Specified quantiles - 百分位点MesofShape– Skewness and Kurtosis - 偏度和峰度Outlier– Number of outliers - 异常值数量Cor– Correlation b/w target and independent variables - 自变量和因变量相关性
plot
{r} ExpNumViz(train,gp=NULL,nlim=10,Page=c(2,2),sample=NULL)
nlim: 少于10个样本不画图sample: 随机选择变量进行展示,NULL表示全部展示
{r} ExpNumViz(train,gp="SalePrice",nlim=4,fname=NULL,col=NULL,Page=c(2,2))
分类变量
table
{r} ExpCTable(train,Target="SalePrice",margin=1,clim=10,nlim=NULL,round=2,bin=4,per=F) %>% datatable()
plot
{r} ExpCatViz(train,gp=NULL,fname=NULL,clim=10,col=NULL,margin=2,Page = c(2,1))
WOE和IV
dashboard
{r include=FALSE} comment <- function(){ comment <- read_excel(get_path("字段翻译.xlsx")) %>% select(-X__1,-No.) names(comment) <- c("var","comment1","comment2","type","unique_value") comment[1,1] <- "SalePrice comment <- comment %>% select(var,comment2,unique_value) return(comment) } comment <- comment()
{r} dashboard <- bind_rows( skim_to_wide(train) %>% mutate(tag = 'train'), skim_to_wide(test) %>% mutate(tag = 'test') ) %>% left_join(comment, by = c("variable"="var")) %>% select(tag,hist, everything()) dashboard %>% datatable()
skim函数的展示结果参考 Stack Overflow- 字段解释来自
magicyang
使用方式
这个表格比较大,因此建议将网页向右拖拽,即可查看相关的histogram图。

缺失率最高的变量
{r} dashboard %>% mutate(missing = as.integer(missing)) %>% top_n(20,missing) %>% distinct(variable)
$y$的比例
{r} setdiff( train %>% names(), test %>% names() )
{r} train %>% ggplot(aes(x = SalePrice)) + geom_freqpoly() + scale_x_continuous(labels = c('0','200k','400k','600k','800k')) + theme_ilo() + labs( title = "pdf on SalePrice", subtitle = "positive skewness", caption = "Jiaxiang Li - jiaxiangli.netlify.com )
naive xgboost
```{r} get_watchlist <- function(data){
set.seed(456) index_train <- sample(1:nrow(data),0.80*nrow(data)) train <- data[index_train,] %>% select(-Id) test <- data[-index_train,] %>% select(-Id)
get_dmatrix <- function(data){ x <- data %>% select(-SalePrice) x_mtx <- data.matrix(x) y <- data$SalePrice ddata <- xgb.DMatrix(data = x_mtx,label=y) return(ddata) }
dtrain <- get_dmatrix(train) dtest <- get_dmatrix(test) watchlist <- list(train = dtrain, test = dtest)
return(list( train = train, test = test, dtrain = dtrain, dtest = dtest, watchlist = watchlist)) }
```
train_xgb <- get_watchlist(train)
{r eval=F} train_xgb_mod <- xgb.train( data = train_xgb$dtrain, # 1 eta = 0.1, nround = 2000, # 2 max_depth = 7, min_child_weight = 17, gamma = 0.72, # 3 subsample = 0.8, colsample_bytree = 0.95, # 评价标准 # eval.metric = "error", eval.metric = "rmse", # eval.metric = ks_value, # eval.metric = "auc", # eval.metric = "logloss", # objective objective = "reg:linear", # 这是一个回归问题 # 其他 seed = 596, watchlist = train_xgb$watchlist, # 300万数据一起用! nfold = 2, early.stop = 50, nthread = 8 )
{r eval=F} xgb.save(train_xgb_mod, file.path("required_data","train_xgb_mod.model"))
{r eval=F} train_xgb_mod <- xgb.load(file.path("required_data","train_xgb_mod.model"))
{r eval=F} dtest <- xgb.DMatrix(data = data.matrix(test %>% select(-Id)))
{r eval=F} sm_ljx_180525 <- tibble( Id = test$Id, SalePrice = predict(train_xgb_mod, dtest) ) %>% write_csv(get_path("sm_ljx_180525.csv")) %>% select(everything())
lift curve
{r} bind_rows( tibble( y = train_xgb$train$SalePrice, yhat = predict(train_xgb_mod, train_xgb$dtrain) ) %>% mutate(tag = 'train'), tibble( y = train_xgb$test$SalePrice, yhat = predict(train_xgb_mod, train_xgb$dtest) ) %>% mutate(tag = 'test') ) %>% mutate(yhat_bin = ntile(yhat,20)) %>% group_by(tag,yhat_bin) %>% summarise(y = mean(y), yhat = mean(yhat)) %>% gather(key,value,y:yhat) %>% ggplot(aes(x = yhat_bin,y = value, col = key)) + geom_line() + facet_wrap(~ tag) + theme_ilo() + labs( x = "predicted binned value", y = "actual value", title = "lift curve on train and validation set", subtitle = "binning by ntile function in ggplot2 package", caption = "Jiaxiang Li - jiaxiangli.netlify.com )
theme_ilo函数参考 ggplot2使用技巧。- 开发集过拟合了,参考训练模型 training model 使用技巧,不然效果很好,但是kaggle上的分数不高。
Importances plot
{r} xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(), model = train_xgb_mod) %>% xgb.plot.importance()
相关口径解释见 xgboost的理解。
结果
Your submission scored 0.14340, which is not an improvement of your best score. Keep trying!
Power Transformation for $y$
相关推导见 训练模型 training model 使用技巧。
寻找最好的$\lambda$
{r} boxcox_lambda <- MASS::boxcox(SalePrice ~ 1, data = train, lambda = seq(-0.25, 0.25, length = 10)) boxcox_lambda_best <- boxcox_lambda %>% as.tibble() %>% filter(y == max(y)) %>% .$x boxcox_lambda_best
{r} SP_cb <- train %>% select(SalePrice) %>% mutate(tag = 'original') %>% bind_rows( train %>% # mutate(SalePrice = log(SalePrice), # tag = 'transformed') %>% mutate(comp = SalePrice^boxcox_lambda_best, SalePrice = comp/(comp-1), tag = 'transformed') %>% select(SalePrice,tag) ) SP_cb %>% group_by(tag) %>% summarise( mean = mean(SalePrice), sd = sd(SalePrice), skew = skewness(SalePrice), kurt = kurtosis(SalePrice) ) %>% mutate_if(is.double, accounting) SP_cb %>% ggplot(aes(x = SalePrice, col = tag)) + geom_freqpoly() + facet_wrap(~ tag, scales = "free")
我们发现四项指标均有改善,因此转换有效。
naive xgboost
{r eval=F} train_xgb_trans <- get_watchlist( train %>% # mutate(SalePrice = SalePrice^boxcox_lambda_best/(SalePrice^boxcox_lambda_best-1)) mutate(SalePrice = log(SalePrice)) )
{r eval=F} train_xgb_trans_mod <- xgb.train( data = train_xgb_trans$dtrain, # 1 eta = 0.1, nround = 2000, # 2 max_depth = 7, min_child_weight = 17, gamma = 0.72, # 3 subsample = 0.8, colsample_bytree = 0.95, # 评价标准 # eval.metric = "error", eval.metric = "rmse", # eval.metric = ks_value, # eval.metric = "auc", # eval.metric = "logloss", # objective objective = "reg:linear", # 这是一个回归问题 # 其他 seed = 596, watchlist = train_xgb_trans$watchlist, # 300万数据一起用! nfold = 2, early.stop = 50, nthread = 8 )
{r eval=F} xgb.save(train_xgb_trans_mod, file.path("required_data","train_xgb_trans_mod.model"))
{r eval=F} train_xgb_trans_mod <- xgb.load(file.path("required_data","train_xgb_trans_mod.model"))
{r eval=F} dtest <- xgb.DMatrix(data = data.matrix(test %>% select(-Id)))
{r eval=F} sm_ljx_180527 <- tibble( Id = test$Id, SalePrice = predict(train_xgb_trans_mod, dtest) ) %>% # mutate(SalePrice = (SalePrice/(SalePrice-1))^(1/boxcox_lambda_best)) %>% mutate(SalePrice = exp(SalePrice)) %>% write_csv(get_path("sm_ljx_180527.csv")) %>% select(everything())
left_curve
{r} bind_rows( tibble( y = train_xgb_trans$train$SalePrice, yhat = predict(train_xgb_trans_mod, train_xgb_trans$dtrain) ) %>% mutate(tag = 'train'), tibble( y = train_xgb_trans$test$SalePrice, yhat = predict(train_xgb_trans_mod, train_xgb_trans$dtest) ) %>% mutate(tag = 'test') ) %>% mutate(yhat_bin = ntile(yhat,20)) %>% group_by(tag,yhat_bin) %>% summarise(y = mean(y), yhat = mean(yhat)) %>% gather(key,value,y:yhat) %>% ggplot(aes(x = yhat_bin,y = value, col = key)) + geom_line() + facet_wrap(~ tag) + theme_ilo() + labs( x = "predicted binned value", y = "actual value", title = "lift curve on train and validation set", subtitle = "binning by ntile function in ggplot2 package", caption = "Jiaxiang Li - jiaxiangli.netlify.com )
theme_ilo函数参考 ggplot2使用技巧。- 开发集过拟合了,参考训练模型 training model 使用技巧,不然效果很好,但是kaggle上的分数不高。
Importances plot
{r} xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(), model = train_xgb_trans_mod) %>% xgb.plot.importance()
相关口径解释见 xgboost的理解。
结果
Your submission scored 0.15594, which is not an improvement of your best score. Keep trying!
结果没有提升,因此y变量在树模型上不需要做变换。
Pool Quality 和 Pool Area 相关
{r} eda_data %>% select(PoolQC,PoolArea) %>% group_by(PoolQC,PoolArea) %>% count() %>% spread(PoolQC,n) %>% filter(!is.na(`<NA>`))
PoolArea = 0表示没有游泳池,因此无法给出PoolQC,因此2906个缺失值作为一个level没有问题。 但是有PoolQC有三个缺失值是拥有PoolArea值的,因此应该是漏记了。 [@Owen2017]
因此这里需要将这三个缺失值修改为Ex、Fa、Gd的其中一种。
{r} eda_data %>% group_by(PoolQC) %>% summarise(PoolArea = mean(PoolArea) %>% accounting())
我们发现三种类别的PoolArea平均值如上,按照就近原则, 依次附上
Ex、Fa、Fa或者Ex、Ex、Fa
Ex、Fa、Fa
{r} eda_data_ipt_PQC <- eda_data %>% # filter(PoolArea %in% c(368,444,561)) %>% mutate(PoolQC = case_when( PoolArea == 368 ~ 'Ex', PoolArea == 444 ~ 'Fa', PoolArea == 561 ~ 'Fa', TRUE ~ PoolQC ))
{r eval=F} eda_data_ipt_PQC_xgb <- get_watchlist( eda_data_ipt_PQC %>% semi_join(train,by = "Id") )
{r eval=F} eda_data_ipt_PQC_xgb_mod <- xgb.train( data = eda_data_ipt_PQC_xgb$dtrain, # 1 eta = 0.1, nround = 2000, # 2 max_depth = 7, min_child_weight = 17, gamma = 0.72, # 3 subsample = 0.8, colsample_bytree = 0.95, # 评价标准 # eval.metric = "error", eval.metric = "rmse", # eval.metric = ks_value, # eval.metric = "auc", # eval.metric = "logloss", # objective objective = "reg:linear", # 这是一个回归问题 # 其他 seed = 596, watchlist = eda_data_ipt_PQC_xgb$watchlist, # 300万数据一起用! nfold = 2, early.stop = 50, nthread = 8 )
{r eval=F} xgb.save(eda_data_ipt_PQC_xgb_mod, file.path("required_data","eda_data_ipt_PQC_xgb_mod.model"))
{r eval=F} eda_data_ipt_PQC_xgb_mod <- xgb.load(file.path("required_data","eda_data_ipt_PQC_xgb_mod.model"))
{r eval=F} dtest <- xgb.DMatrix(data = data.matrix( eda_data_ipt_PQC %>% semi_join(test,by = "Id") %>% select(-Id)))
{r eval=F} sm_ljx_180528 <- tibble( Id = test$Id, SalePrice = predict(eda_data_ipt_PQC_xgb_mod, dtest) ) %>% write_csv(get_path("sm_ljx_180528.csv")) sm_ljx_180528
left_curve
{r} bind_rows( tibble( y = eda_data_ipt_PQC_xgb$train$SalePrice, yhat = predict(eda_data_ipt_PQC_xgb_mod, eda_data_ipt_PQC_xgb$dtrain) ) %>% mutate(tag = 'train'), tibble( y = eda_data_ipt_PQC_xgb$test$SalePrice, yhat = predict(eda_data_ipt_PQC_xgb_mod, eda_data_ipt_PQC_xgb$dtest) ) %>% mutate(tag = 'test') ) %>% mutate(yhat_bin = ntile(yhat,20)) %>% group_by(tag,yhat_bin) %>% summarise(y = mean(y), yhat = mean(yhat)) %>% gather(key,value,y:yhat) %>% ggplot(aes(x = yhat_bin,y = value, col = key)) + geom_line() + facet_wrap(~ tag) + theme_ilo() + labs( x = "predicted binned value", y = "actual value", title = "lift curve on train and validation set", subtitle = "binning by ntile function in ggplot2 package", caption = "Jiaxiang Li - jiaxiangli.netlify.com )
theme_ilo函数参考 ggplot2使用技巧。- 开发集过拟合了,参考训练模型 training model 使用技巧,不然效果很好,但是kaggle上的分数不高。
Importances plot
{r} xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(), model = eda_data_ipt_PQC_xgb_mod) %>% xgb.plot.importance()
相关口径解释见 xgboost的理解。
结果
Your submission scored 0.14340, which is not an improvement of your best score. Keep trying!
结果没有提升,因此y变量在树模型上不需要做变换。
Ex、Ex、Fa
{r} eda_data_ipt_PQC2 <- eda_data %>% # filter(PoolArea %in% c(368,444,561)) %>% mutate(PoolQC = case_when( PoolArea == 368 ~ 'Ex', PoolArea == 444 ~ 'Ex', PoolArea == 561 ~ 'Fa', TRUE ~ PoolQC ))
{r eval=F} eda_data_ipt_PQC2_xgb <- get_watchlist( eda_data_ipt_PQC2 %>% semi_join(train,by = "Id") )
{r eval=F} eda_data_ipt_PQC2_xgb_mod <- xgb.train( data = eda_data_ipt_PQC2_xgb$dtrain, # 1 eta = 0.1, nround = 2000, # 2 max_depth = 7, min_child_weight = 17, gamma = 0.72, # 3 subsample = 0.8, colsample_bytree = 0.95, # 评价标准 # eval.metric = "error", eval.metric = "rmse", # eval.metric = ks_value, # eval.metric = "auc", # eval.metric = "logloss", # objective objective = "reg:linear", # 这是一个回归问题 # 其他 seed = 596, watchlist = eda_data_ipt_PQC2_xgb$watchlist, # 300万数据一起用! nfold = 2, early.stop = 50, nthread = 8 )
{r eval=F} xgb.save(eda_data_ipt_PQC2_xgb_mod, file.path("required_data","eda_data_ipt_PQC2_xgb_mod.model"))
{r eval=F} eda_data_ipt_PQC2_xgb_mod <- xgb.load(file.path("required_data","eda_data_ipt_PQC2_xgb_mod.model"))
{r eval=F} dtest <- xgb.DMatrix(data = data.matrix( eda_data_ipt_PQC2 %>% semi_join(test,by = "Id") %>% select(-Id)))
{r eval=F} sm_ljx_180528_02 <- tibble( Id = test$Id, SalePrice = predict(eda_data_ipt_PQC2_xgb_mod, dtest) ) %>% write_csv(get_path("sm_ljx_180528_02.csv")) sm_ljx_180528_02
left_curve
{r} bind_rows( tibble( y = eda_data_ipt_PQC2_xgb$train$SalePrice, yhat = predict(eda_data_ipt_PQC2_xgb_mod, eda_data_ipt_PQC2_xgb$dtrain) ) %>% mutate(tag = 'train'), tibble( y = eda_data_ipt_PQC2_xgb$test$SalePrice, yhat = predict(eda_data_ipt_PQC2_xgb_mod, eda_data_ipt_PQC2_xgb$dtest) ) %>% mutate(tag = 'test') ) %>% mutate(yhat_bin = ntile(yhat,20)) %>% group_by(tag,yhat_bin) %>% summarise(y = mean(y), yhat = mean(yhat)) %>% gather(key,value,y:yhat) %>% ggplot(aes(x = yhat_bin,y = value, col = key)) + geom_line() + facet_wrap(~ tag) + theme_ilo() + labs( x = "predicted binned value", y = "actual value", title = "lift curve on train and validation set", subtitle = "binning by ntile function in ggplot2 package", caption = "Jiaxiang Li - jiaxiangli.netlify.com )
theme_ilo函数参考 ggplot2使用技巧。- 开发集过拟合了,参考训练模型 training model 使用技巧,不然效果很好,但是kaggle上的分数不高。
Importances plot
{r} xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(), model = eda_data_ipt_PQC2_xgb_mod) %>% xgb.plot.importance()
相关口径解释见 xgboost的理解。
结果
Your submission scored 0.14340, which is not an improvement of your best score. Keep trying!
结果没有提升,因此y变量在树模型上不需要做变换。
xgboost arguement changed
```{r include=FALSE} get_watchlist <- function(data){
set.seed(456) index_train <- sample(1:nrow(data),0.80*nrow(data)) train <- data[index_train,] %>% select(-Id) test <- data[-index_train,] %>% select(-Id)
get_dmatrix <- function(data){ x <- data %>% select(-SalePrice) x_mtx <- data.matrix(x) y <- data$SalePrice ddata <- xgb.DMatrix(data = x_mtx,label=y) return(ddata) }
dtrain <- get_dmatrix(train) dtest <- get_dmatrix(test) watchlist <- list(train = dtrain, test = dtest)
return(list( train = train, test = test, dtrain = dtrain, dtest = dtest, watchlist = watchlist)) }
```
train_xgb_more_regular <- get_watchlist(train)
{r eval=F} train_xgb_more_regular_mod <- xgb.train( data = train_xgb_more_regular$dtrain, # 1 eta = 0.002, nround = 20000, # 2 max_depth = 4, min_child_weight = 1, gamma = 0.5, # 3 subsample = 0.5, colsample_bytree = 0.5, # 评价标准 # eval.metric = "error", eval.metric = "rmse", # eval.metric = ks_value, # eval.metric = "auc", # eval.metric = "logloss", # objective objective = "reg:gamma", # 这是一个回归问题 # 其他 seed = 596, watchlist = train_xgb_more_regular$watchlist, # 300万数据一起用! nfold = 2, early.stop = 50, nthread = 8 )
{r eval=F} xgb.save(train_xgb_more_regular_mod, file.path("required_data","train_xgb_more_regular_mod.model"))
{r eval=F} train_xgb_more_regular_mod <- xgb.load(file.path("required_data","train_xgb_more_regular_mod.model"))
{r eval=F} dtest <- xgb.DMatrix(data = data.matrix(test %>% select(-Id)))
{r eval=F} sm_ljx_180529 <- tibble( Id = test$Id, SalePrice = predict(train_xgb_more_regular_mod, dtest) ) %>% write_csv(get_path("sm_ljx_180529.csv")) %>% select(everything())
left_curve
{r} bind_rows( tibble( y = train_xgb_more_regular$train$SalePrice, yhat = predict(train_xgb_more_regular_mod, train_xgb_more_regular$dtrain) ) %>% mutate(tag = 'train'), tibble( y = train_xgb_more_regular$test$SalePrice, yhat = predict(train_xgb_more_regular_mod, train_xgb_more_regular$dtest) ) %>% mutate(tag = 'test') ) %>% mutate(yhat_bin = ntile(yhat,20)) %>% group_by(tag,yhat_bin) %>% summarise(y = mean(y), yhat = mean(yhat)) %>% gather(key,value,y:yhat) %>% ggplot(aes(x = yhat_bin,y = value, col = key)) + geom_line() + facet_wrap(~ tag) + theme_ilo() + labs( x = "predicted binned value", y = "actual value", title = "lift curve on train and validation set", subtitle = "binning by ntile function in ggplot2 package", caption = "Jiaxiang Li - jiaxiangli.netlify.com )
theme_ilo函数参考 ggplot2使用技巧。- 开发集过拟合了,参考训练模型 training model 使用技巧,不然效果很好,但是kaggle上的分数不高。
Importances plot
{r} xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(), model = train_xgb_more_regular_mod) %>% xgb.plot.importance()
相关口径解释见 xgboost的理解。
结果
best_iteration: 11555
best_ntreelimit: 11555
best_score: 30228.03
Your submission scored 0.13871, which is not an improvement of your best score. Keep trying!
- 3447次迭代
- 正则化也处理了,但是还是过拟合。
reg:linear
这些都不是问题,关键是Python前面的特征工程,好好研究。
reg:gamma in Python
<!-- http://localhost:8888/notebooks/HousingPrices/model_in_python.ipynb -->
思路参考CSDN博客。 @DMLCXgboostPython 提供了模型保存和录入等函数。 这次建模特点是
You advanced 851 places on the leaderboard! Your submission scored 0.12124, which is an improvement of your previous score of 0.13513. Great job!
- 迭代了20000次。
- 使用的
reg:gamma
You advanced 9 places on the leaderboard! Your submission scored 0.12112, which is an improvement of your previous score of 0.12124. Great job!
- 迭代了40000次。
- 使用的
reg:gamma
Your submission scored 0.12178, which is not an improvement of your best score. Keep trying!
-
迭代了100000次,
1h 29m 56s -
使用的
reg:gamma[99999] train-error:0.020631 test-error:0.020631
看来增加迭代次数的效果已经体现不出来了。 分析的原因是最后的round不够,学习率比较慢,这次扩大到,200000次,保持$\eta = 0.0004$。
Your submission scored 0.12247, which is not an improvement of your best score. Keep trying!
-
迭代了200000次,
5h 2m 15s -
使用的
reg:gamma[199999] train-error:0.004858 test-error:0.004858
增加迭代次数的红利消失了。
- 损失函数使用RMSLE口径,这是比赛的要求,详见 15 Root Mean Squared Log Error RMSLE
- 构建方程是基于指数回归完成的,详见Generalized linear model GLM 模型、指数回归 和 Gamma 回归方程
集成
naive
{r} file.path(getwd(),"required_data") %>% list.files(full.names = T) %>% str_subset(".csv") %>% str_subset("^(?!.*train.csv|.*test.csv)") %>% tibble(path = .) %>% mutate(shortpath = str_remove(path, "/Users/JiaxiangLi/Downloads/me/trans/housingPrices/required_data/")) %>% mutate(data = map(.x = path, .f = read_csv)) %>% mutate(size = map_dbl(.x = data, .f = object_size)) %>% select(-path) %>% filter(size <= 19832) %>% select(-size) %>% unnest() %>% group_by(Id) %>% summarise(SalePrice = median(unique(SalePrice))) %>% write_csv(get_path("sm_ljx_180530.csv")) %>% select(everything())
Your submission scored 0.12545, which is not an improvement of your best score. Keep trying!
naive OLS 回归
training
{r} ensemble_data_train_nest <- file.path(getwd(),"required_data") %>% list.files(full.names = T) %>% str_subset("train_pred.csv") %>% tibble(path = .) %>% mutate(shortpath = str_remove(path, "/Users/JiaxiangLi/Downloads/me/trans/housingPrices/required_data/")) %>% mutate(data = map(.x = path, .f = read_csv)) %>% mutate(size = map_dbl(.x = data, .f = object_size)) %>% select(-path) %>% select(-size)
{r eval=F} expand.grid( a = ensemble_data_train_nest$shortpath, b = ensemble_data_train_nest$shortpath ) %>% left_join( ensemble_data_train_nest, by = c("a" = "shortpath") ) %>% rename(data_a = data) %>% left_join( ensemble_data_train_nest, by = c("b" = "shortpath") ) %>% rename(data_b = data) %>% mutate(equal = map2(.x = data_a, .y = data_b, .f = setequal)) %>% select(-data_a,-data_b) %>% .$equal
{r} ensemble_data_train <- ensemble_data_train_nest %>% unnest() %>% spread(shortpath,SalePrice) %>% add_column(SalePrice = train$SalePrice) %>% select(-Id) %>% select(SalePrice, everything())
```{r} log_gamma_glm <- glm(SalePrice ~ ., data = ensemble_data_train, family=Gamma(link=“log”)) summary(log_gamma_glm)
$\Box$gamma 预测有问题。
```
lm <- lm(SalePrice ~ ., data = ensemble_data_train)
summary(lm)
{r} ensemble_data_train %>% mutate(pred = predict(lm)) %>% select(SalePrice,pred)
prediction
{r} ensemble_data_test_nest <- file.path(getwd(),"required_data") %>% list.files(full.names = T) %>% str_subset("test_pred.csv") %>% tibble(path = .) %>% mutate(shortpath = str_remove(path, "/Users/JiaxiangLi/Downloads/me/trans/housingPrices/required_data/")) %>% mutate(data = map(.x = path, .f = read_csv)) %>% mutate(size = map_dbl(.x = data, .f = object_size)) %>% select(-path) %>% select(-size)
```{r} ensemble_data_test <- ensemble_data_test_nest %>%
unnest() %>% mutate(shortpath = str_replace_all(shortpath, “test”,“train”)) %>% spread(shortpath,SalePrice)
ensemble_data_test %>% cor()
ensemble_data_output <- ensemble_data_test %>% mutate( SalePrice = predict(lm, newdata = ensemble_data_test) ) %>% select(Id,SalePrice,everything()) %>% select(Id,SalePrice) %>% write_csv(get_path(“sm_ljx_180531.csv”))
mutate(shortpath = str_replace(shortpath, "test","train")) %>%
统一训练集和测试集x变量的名称。
> Your submission scored 0.75198, which is not an improvement of your best score. Keep trying!
效果不好。
主要是线性结构需要变量之间的相关性比较低,但是这里的相关性都比较高。
# 参考资料
模型选取、特征工程参考 @Roberts2018 和 @Serigne2017。
相关尝试比较多,这里就不展开了,下面每篇推荐了三个点可以进行尝试,相关代码,直接跳转到参考文献,即可查询。
测试前建议安装好Python环境、Xgboost和Lightgbm。
@Roberts2018 给出了三点新颖的思路:
+ 使用lasso、ridge、ElasticNet类的模型,得到了不错的结果,适合后期stacking。
+ 使用$\mu$定义outliter (详见 [Andrew Ng Machine Learning 学习笔记](https://jiaxiangli.netlify.com/2018/04/andrew-ng-machine-learning/) 2.2)。
+ 因为比赛要求的损失函数是[RMSLE](https://jiaxiangli.netlify.com/2018/04/training-model/#root-mean-squared-log-error-rmsle),因此最好的power transformation就是$\log$变换,即$\lambda=0$。
还有其他一些改进思路,未进行验证:
+ 对其他连续变量进行power transformation。
+ 大部分尝试的模型是过拟合的,根据作者给出每个模型的RMSE[^RMSEandRMSLE]中,Xgboost表现最好,`r round(0.05966808692175978,4)`,然而最优模型为`r round(0.07861888550803024,4)`,因此作者还是通过不断的上传结果,最后得到最优效果的,测试集上的RMSE不是唯一标准。
[^RMSEandRMSLE]:
实际上是RMSLE,因为@Roberts2018 对y进行了log处理。
> Your submission scored 0.11556, which is not an improvement of your best score. Keep trying!
参考 @Roberts2018 的思路,成绩有所提升。
@Serigne2017 给出了三点新颖的思路:
+ 对大多数变量执行了缺失值处理,有用中位数替换、有的设置为新类别
+ 连续变量具备高skew的,进行了power transformation
+ stacking基本上使用的是平均集成,效果不错
> You advanced 684 places on the leaderboard!
Your submission scored 0.11549, which is an improvement of your previous score of 0.12124. Great job!
参考 @Serigne2017 的思路,成绩有所提升。
# 总结
```
setequal(sm_ljx_180525,sm_ljx_180527)
setequal(sm_ljx_180525,sm_ljx_180528)
setequal(sm_ljx_180525,sm_ljx_180528_02)
@ref(PQC)没有改变预测结果。
xgboost过拟合是一个问题。 除了模型 \ref@(model-python)没有过拟合外,其他的结果都是过拟合的。
- base$\to$调参: $0.14340\to0.13870$
- 调参$\to$平方项: $0.13870\to0.12124$
- 平方项$\to$提高round: $to0.12124\to0.12112$
- 提高round$\to$过度提高round: $to0.12112 \downarrow$
reg:gamma过拟合下降,train和test差异不大。
参考文献
-
输入code
names(train) %>% str_subset("^[:digit:]")发现变量
[1] "1stFlrSF" "2ndFlrSF" "3SsnPorch是数字开头的不符合R命名规则,统一更改。 ↩︎