```{r setup, include=FALSE}
# Global chunk option: show code but do not evaluate it when knitting.
knitr::opts_chunk$set(eval = FALSE)
```
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
本文于 `r format(Sys.Date(), "%Y-%m-%d")` 更新。如发现问题或者有建议,欢迎提交 Issue。
```{r message=FALSE, warning=FALSE}
# Packages used throughout the document.
library(visdat)
library(tidyverse)
library(naniar)
library(rmarkdown)
```
缺失值位置
```{r}
# Cell-level overview of airquality, with a caption layer added to the plot.
vis_dat(airquality) +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")

# Missing vs. present cells only, with the same caption.
vis_miss(airquality) +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```
- 缺点是不能进行 `ggplot2` 函数的叠加。( Github )
缺失值分布
One approach to visualising missing data comes from `ggobi` and `manet`, where we replace “NA” values with values 10% lower than the minimum value in that variable. [@Tierney2018]
缺失值的展示方法中,一种是使用比最小值还要小 10% 的值来代替缺失值,这样就可以用散点图等方式进行展示。
```{r}
# Scatter plot of Ozone against Solar.R using naniar's geom_miss_point()
# layer in place of geom_point().
ggplot(airquality, aes(x = Solar.R, y = Ozone)) +
  geom_miss_point() +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```
缺失值数量
```{r}
# Per-variable missingness plot for airquality, with a caption.
gg_miss_var(airquality) +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```
样本缺失值情况
```{r}
# Per-row (case) missingness summary.
miss_case_summary(airquality)
```
`case` 是行的 index,`n_miss` 是某行缺失值的个数,`pct_miss` 是某行缺失值的比例。
```{r}
# Tabulate rows by how many missing values each one has.
miss_case_table(airquality)
```
再按照n_miss_in_case进行了汇总。
样本缺失值预测
根据miss_case_summary可以知道每个样本的缺失情况,可以对这个缺失率或者值,进行模型预测,看哪个变量比较显著。
```{r}
library(rpart)
library(rpart.plot)

# Add each row's proportion of missing values (prop_miss_all), fit a tree
# on it using all other columns, and plot the tree.
airquality %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = .) %>%
  prp(type = 4, extra = 101, prefix = "Prop. Miss =")
```

# 变量缺失值情况

```{r}
# Per-variable missingness summary and its tabulation.
miss_var_summary(airquality)
miss_var_table(airquality)
```
```{r}
# Missingness of `hourly_counts` within each month of the pedestrian data.
pedestrian %>%
  group_by(month) %>%
  miss_var_summary() %>%
  filter(variable == "hourly_counts")
```
interaction 展示
主要为了查看变量之间产生缺失值的共轭现象。 [@Tierney2018Gallery]
```{r}
# Upset plot of co-occurring missingness, default settings.
gg_miss_upset(riskfactors)
```

```{r}
# Same plot with explicit nsets / nintersects values.
gg_miss_upset(riskfactors, nsets = 10, nintersects = 50)
```
填补缺失值
```{r}
library(simputation)

# Impute three variables with linear models on the two wind components,
# then attach a label column before wrapping the result as a paged table.
ocean_imp <- oceanbuoys %>%
  bind_shadow() %>%
  impute_lm(air_temp_c ~ wind_ew + wind_ns) %>%
  impute_lm(humidity ~ wind_ew + wind_ns) %>%
  impute_lm(sea_temp_c ~ wind_ew + wind_ns) %>%
  add_label_shadow() %>%
  paged_table()
```

`add_label_shadow`函数打上标记`any_missing`。
[@Tierney2018Imputed]

```{r}
library(ggplot2)

# Scatter plot of the imputed data, coloured by the any_missing label.
ggplot(
  ocean_imp,
  aes(
    x = air_temp_c,
    y = humidity,
    color = any_missing
  )
) +
  geom_point() +
  scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```
```{r}
# Density of air temperature, filled by the any_missing label.
ggplot(ocean_imp, aes(x = air_temp_c, fill = any_missing)) +
  geom_density(alpha = 0.3) +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")

# Same plot for humidity.
ggplot(ocean_imp, aes(x = humidity, fill = any_missing)) +
  geom_density(alpha = 0.3) +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "bottom") +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```

# Cases

```{r}
library(data.table)
library(tidyverse)
library(visdat)
library(naniar)

# Read the case-study table; the path is resolved through here::here().
# data <- fread('ldfilter_cbind.txt')
data <- fread(here::here('../tutoring2/pansiyu/analysis/NA_inlm/ldfilter_cbind.txt'))
```
```{r}
# Number of rows and columns in the loaded table.
dim(data)
```
缺失值处理参考 naniar 使用技巧 缺失值展示 。
```{r}
# Missingness map of the case-study data; smaller text and rotated x-axis
# labels are set through theme().
vis_miss(data) +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```
联动缺失不高。
<details close>
<summary>
联动缺失不高,点击前面的三角查看更多。
</summary>
```{r}
# Upset plot of co-occurring missingness, default settings.
gg_miss_upset(data)
```

```{r}
# Same plot with explicit nsets / nintersects values.
gg_miss_upset(data, nsets = 10, nintersects = 50)
```
</details>
<details close>
<summary>
样本缺失率不高。
</summary>
```{r}
# Per-row missingness for the case-study data. Both calls use `data`;
# the table call previously pointed at `airquality` by mistake.
miss_case_summary(data)
miss_case_table(data)
```
- 最多一个样本,缺失值也就5个。
</details>
<details close>
<summary>
变量缺失率不高。
</summary>
```{r}
# Per-variable missingness plot for the case-study data, with smaller text.
gg_miss_var(data) +
  theme(text = element_text(size = 8)) +
  labs(caption = "Jiaxiang Li - jiaxiangli.netlify.com")
```
```{r}
# Per-variable missingness summary and its tabulation.
miss_var_summary(data)
miss_var_table(data)
```
```{r}
library(rpart)
library(rpart.plot)

# Add each row's proportion of missing values (prop_miss_all), fit a tree
# on it using all other columns, and plot the tree.
data %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = .) %>%
  prp(type = 4, extra = 101, prefix = "Prop. Miss =")
```

1. 如图是影响缺失的主要变量。

</details>

由于缺失值不严重,因此进行`lm`。

```{r}
library(broom)

# Convert every column except MPB to a factor with NA as the explicit level
# 'No_infos', regress MPB on all of them, and show the tidied coefficients
# in a DT table with export buttons.
# NOTE(review): mutate_at() and fct_explicit_na() are superseded/deprecated
# in recent dplyr/forcats releases; kept as-is to match the versions this
# document was written against.
data %>%
  mutate_at(
    vars(-MPB),
    ~ fct_explicit_na(factor(.), 'No_infos')
  ) %>%
  lm(MPB ~ ., data = .) %>%
  tidy() %>%
  DT::datatable(
    rownames = FALSE,
    extensions = 'Buttons',
    options = list(
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
    )
  )
```
- 点击对应格式可以下载。