本文于2020-10-10更新。 如发现问题或者有建议,欢迎提交 Issue
项目需求是给在一群运动员内选出最佳的几个,并给出相应的数据支持。
- 数据支持方面,需要给出几个指标供参考,最后做(无监督学习),将指标聚合,最后使用一个综合得分进行选择的评价。
- 决策的方面,最后的数据支持是要展示出来,并知道决策的,因此最后会使用
ggplot
包展示选出的最佳运动员,和其他运动员的比较,以此来说服需求方。
本文的数据、Code、思路参考 Perry (2018) 。
思路具体如下,
- 创建四个指标
TotalDistance
总成绩StandardDev
成绩数据的标准差,体现风险Success
成功次数diff
后三次和前三次的成绩差
- 匹配权重,这里可以用PCA等完成,但不是本文重点,因此直接赋值。
- 找出最好的五位选手,用
max
和mean
水平来体现这五个选手的优势,这个idea很棒。
1 清洗数据
# Load the tidyverse package
library(tidyverse)
## Warning: 程辑包'tidyverse'是用R版本3.6.3 来建造的
## -- Attaching packages --------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.3 √ dplyr 1.0.2
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## Warning: 程辑包'ggplot2'是用R版本3.6.3 来建造的
## Warning: 程辑包'tibble'是用R版本3.6.3 来建造的
## Warning: 程辑包'tidyr'是用R版本3.6.3 来建造的
## Warning: 程辑包'readr'是用R版本3.6.3 来建造的
## Warning: 程辑包'purrr'是用R版本3.6.3 来建造的
## Warning: 程辑包'dplyr'是用R版本3.6.3 来建造的
## Warning: 程辑包'stringr'是用R版本3.6.3 来建造的
## Warning: 程辑包'forcats'是用R版本3.6.3 来建造的
## -- Conflicts ------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Import the full dataset
data <- read_csv('../../../picbackup/athletics.csv')
## Parsed with column specification:
## cols(
## Event = col_character(),
## Male_Female = col_character(),
## EventID = col_double(),
## Athlete = col_character(),
## Flight1 = col_double(),
## Flight2 = col_double(),
## Flight3 = col_double(),
## Flight4 = col_double(),
## Flight5 = col_double(),
## Flight6 = col_double()
## )
# Select the results of interest: women's javelin
javelin <- data %>%
filter(Male_Female =='Female',Event=='Javelin') %>%
select(-Male_Female,-Event)
# Give yourself a snapshot of your data
javelin %>% head()
## # A tibble: 6 x 8
## EventID Athlete Flight1 Flight2 Flight3 Flight4 Flight5 Flight6
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 Brittany Borman 54.0 51.2 57.3 52.6 57.0 60.9
## 2 8 Ariana Ince 49.0 54.8 53.6 55.1 55.3 56.7
## 3 8 Kara Patterson 50.1 52.1 0 50.8 55.9 54.6
## 4 8 Kimberley Hamilton 48.0 0 50.9 54.1 55.2 53.3
## 5 8 Laura Loht 44.4 53.8 50.6 54.2 0 49.0
## 6 8 Brianna Bain 49.3 0 51.3 0 48.6 53.0
javelin %>% summary()
## EventID Athlete Flight1 Flight2
## Min. : 8.0 Length:178 Min. : 0.00 Min. : 0.00
## 1st Qu.: 178.0 Class :character 1st Qu.:41.53 1st Qu.:40.23
## Median : 511.0 Mode :character Median :48.85 Median :48.85
## Mean : 796.8 Mean :40.80 Mean :39.87
## 3rd Qu.:1703.0 3rd Qu.:53.20 3rd Qu.:53.07
## Max. :1859.0 Max. :64.94 Max. :61.38
## Flight3 Flight4 Flight5 Flight6
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.:40.57 1st Qu.: 0.00 1st Qu.: 0.00
## Median :47.34 Median :49.30 Median :48.01 Median :46.80
## Mean :34.22 Mean :39.37 Mean :32.97 Mean :34.82
## 3rd Qu.:52.08 3rd Qu.:52.10 3rd Qu.:51.44 3rd Qu.:52.44
## Max. :62.42 Max. :61.56 Max. :60.84 Max. :64.45
2 构建指标
# Assign the tidy data to javelin_long
javelin_long <- javelin %>%
gather(Flight,Distance,Flight1:Flight6)
# Make Flight a numeric
javelin_long <-
javelin_long %>%
mutate(Flight = str_extract(Flight,'[:digit:]{1,}')) %>%
mutate(Flight = as.numeric(Flight))
# Examine the first 6 rows
javelin_long %>% head()
## # A tibble: 6 x 4
## EventID Athlete Flight Distance
## <dbl> <chr> <dbl> <dbl>
## 1 8 Brittany Borman 1 54.0
## 2 8 Ariana Ince 1 49.0
## 3 8 Kara Patterson 1 50.1
## 4 8 Kimberley Hamilton 1 48.0
## 5 8 Laura Loht 1 44.4
## 6 8 Brianna Bain 1 49.3
javelin_totals <- javelin_long %>%
filter(Distance > 0) %>%
group_by(Athlete, EventID) %>%
summarise(
TotalDistance = sum(Distance)
,StandardDev = sd(Distance) %>% round(.,3)
,Success = n()
)
## `summarise()` regrouping output by 'Athlete' (override with `.groups` argument)
# View 10 rows of javelin_totals
javelin_totals[11:20,]
## # A tibble: 10 x 5
## # Groups: Athlete [2]
## Athlete EventID TotalDistance StandardDev Success
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Alyssa Olin 1740 195. 2.08 4
## 2 Alyssa Olin 1859 210. 0.971 4
## 3 Ariana Ince 8 324. 2.69 6
## 4 Ariana Ince 238 276. 3.08 5
## 5 Ariana Ince 498 255. 0.933 5
## 6 Ariana Ince 511 333. 2.78 6
## 7 Ariana Ince 747 230. 0.724 4
## 8 Ariana Ince 815 270. 2.54 5
## 9 Ariana Ince 1566 225. 2.54 4
## 10 Ariana Ince 1575 231. 1.52 4
javelin <- javelin %>%
mutate(early = Flight1+Flight2+Flight3
# ,late = Flight2+Flight3+Flight4
,late = Flight4+Flight5+Flight6
) %>%
mutate(diff = late - early)
javelin %>% tail(10)
## # A tibble: 10 x 11
## EventID Athlete Flight1 Flight2 Flight3 Flight4 Flight5 Flight6 early late
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1773 Meliss~ 47.6 48.7 47.5 0 0 45.6 144. 45.6
## 2 1773 Kaelyn~ 43.4 44.9 40 43.2 40.3 40.6 128. 124.
## 3 1859 Kara W~ 56.9 52.9 55.5 54.4 57.6 62.9 165. 175.
## 4 1859 Avione~ 56.5 0 54.4 51.6 54.3 0 111. 106.
## 5 1859 Ariana~ 51.9 53.5 52.4 56.0 55.2 0 158. 111.
## 6 1859 Bethan~ 49.9 51.0 54.2 0 50.6 0 155. 50.6
## 7 1859 Alyssa~ 0 53.7 52.1 51.5 0 52.8 106. 104.
## 8 1859 Domini~ 49.6 44.2 50.6 51.3 49.2 53.2 144. 154.
## 9 1859 Kriste~ 47.2 50.9 0 48.2 49.3 49.6 98.1 147.
## 10 1859 Rebeka~ 48.8 0 50.4 48.2 0 46.6 99.2 94.9
## # ... with 1 more variable: diff <dbl>
# .... YOUR CODE FOR TASK 4 ....
3 构建综合评分和选出选手
# Examine the last ten rows
# .... YOUR CODE FOR TASK 4 ....
javelin_totals <-
javelin_totals %>%
left_join(javelin ,by=c('Athlete','EventID')) %>%
select(Athlete,TotalDistance,StandardDev,Success,diff)
javelin_totals %>%
head(10)
## # A tibble: 10 x 5
## # Groups: Athlete [4]
## Athlete TotalDistance StandardDev Success diff
## <chr> <dbl> <dbl> <int> <dbl>
## 1 Abigail Gomez 152. 1.23 3 -52.9
## 2 Abigail Gomez 244. 1.63 5 -48
## 3 Abigail Gomez 207. 2.97 4 -110.
## 4 Abigail Gomez 222. 1.30 4 -3.11
## 5 Abigail Gomez 155. 1.03 3 53.4
## 6 Abigail Gomez Hernandez 135. 0.718 3 45.6
## 7 Alicia DeShasier 270. 2.15 5 60.0
## 8 Alicia DeShasier 320. 2.26 6 0.74
## 9 Alicia DeShasier 275. 1.53 5 53.5
## 10 Allison Updike 147. 3.84 3 -46.6
norm <- function(result) {
(result - min(result)) / (max(result) - min(result))
}
aggstats <- c("TotalDistance", "StandardDev", "Success", "diff")
javelin_norm <- javelin_totals %>%
ungroup() %>%
mutate_at(vars(aggstats),norm) %>%
group_by(Athlete) %>%
summarise_all(funs(mean(.)))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(aggstats)` instead of `aggstats` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
head(javelin_norm)
## # A tibble: 6 x 5
## Athlete TotalDistance StandardDev Success diff
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Abigail Gomez 0.446 0.268 0.45 0.383
## 2 Abigail Gomez Hernandez 0.244 0.115 0.25 0.720
## 3 Alicia DeShasier 0.753 0.327 0.833 0.687
## 4 Allison Updike 0.283 0.639 0.25 0.320
## 5 Alyssa Olin 0.469 0.250 0.5 0.309
## 6 Ariana Ince 0.660 0.342 0.692 0.446
weights <- c(1, 2, 3, 4)
javelin_team <- javelin_norm %>%
mutate(TotalScore =
weights[1]*TotalDistance +
weights[2]*StandardDev +
weights[3]*Success +
weights[4]*diff
) %>%
arrange(desc(TotalScore)) %>%
select(Athlete,TotalScore) %>%
head(5)
javelin_team
## # A tibble: 5 x 2
## Athlete TotalScore
## <chr> <dbl>
## 1 Madalaine Stulce 7.18
## 2 Asia Easley 7.08
## 3 Dominique Ouellette 6.89
## 4 Maggie Malone 6.83
## 5 Diana Sammai Martinez 6.75
4 构建可比水平
team_stats <- javelin_totals %>%
# .... YOUR CODE FOR TASK 8 ....
# .... YOUR CODE FOR TASK 8 ....
filter(Athlete %in% javelin_team$Athlete) %>%
summarise_all(funs(mean(.)))
pool_stats <- data.frame(do.call('cbind', sapply(javelin_totals, function(x) if(is.numeric(x)) c(max(x), mean(x)))))
pool_stats$MaxAve <- c("Maximum", "Average")
pool_stats <- pool_stats %>%
gather(key="Statistic", value="Aggregate", -MaxAve)
# Examine team stats
# .... YOUR CODE FOR TASK 8 ....
team_stats
## # A tibble: 5 x 5
## Athlete TotalDistance StandardDev Success diff
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Asia Easley 188. 4.78 4.5 64.1
## 2 Diana Sammai Martinez 262 2.38 6 11.9
## 3 Dominique Ouellette 299. 2.91 6 2.88
## 4 Madalaine Stulce 275. 4.47 6 -6.42
## 5 Maggie Malone 293. 2.00 5 61.1
pool_stats
## MaxAve Statistic Aggregate
## 1 Maximum TotalDistance 362.540000
## 2 Average TotalDistance 222.048483
## 3 Maximum StandardDev 5.994000
## 4 Average StandardDev 2.056079
## 5 Maximum Success 6.000000
## 6 Average Success 4.432584
## 7 Maximum diff 110.340000
## 8 Average diff -7.720056
5 展示
p <- team_stats %>%
gather(Statistic,Aggregate,-Athlete) %>%
ggplot(aes(x=Athlete,y=Aggregate,fill=Athlete))+
geom_bar(stat="identity") +
facet_wrap (~Statistic, nrow = 2, ncol = 2, scales="free_y") +
geom_hline(data=pool_stats, aes(yintercept=Aggregate, group=Statistic, color=MaxAve), size=1) +
labs(title=".... Your Team Name....: Women's Javelin", color="Athlete pool maximum / average") +
scale_fill_hue(l=70) +
scale_color_hue(l=20) +
theme_minimal() +
theme(axis.text.x=element_blank(), axis.title.x=element_blank(), axis.title.y=element_blank())
p
6 模拟比赛结果
home <- c(1,2,3)
away <- sample(1:nrow(javelin_totals), 3, replace=FALSE)
HomeTeam <- round(sum(team_stats$TotalDistance[home]),2)
AwayTeam <- round(sum(javelin_totals$TotalDistance[away]),2)
print(paste0("Javelin match, Final Score: ", HomeTeam, " - ", AwayTeam))
## [1] "Javelin match, Final Score: 748.23 - 701.16"
ifelse(HomeTeam > AwayTeam, print("Win!"), print("Sometimes you just have to take the L."))
## [1] "Win!"
## [1] "Win!"
Perry, George. 2018. “Scout Your Athletics Fantasy Team.” datacamp. 2018. https://www.datacamp.com/projects/177.