library(rvest)
library(tidyverse)
library(rlist)
#' Scrape a news search-result page into a data frame.
#'
#' Reads the page at `url`, takes the text of every `h3>a` node as a headline
#' and the text of every `p.c-author` node as a byline, and splits each byline
#' into publisher, date and time.
#'
#' @param url Location of the page, passed straight to `read_html()`.
#' @return A data frame with one row per result and the columns `标题`
#'   (headline), `发布者` (publisher), `日期` (date) and `时间` (time). A page
#'   with no matching nodes gives a zero-row frame with the same four columns.
news_rvest <- function(url) {
  page <- read_html(url, encoding = "utf-8")

  # Headlines and bylines are collected separately and paired by position.
  titles <- page %>%
    html_nodes("h3>a") %>%
    html_text(trim = TRUE)
  bylines <- page %>%
    html_nodes("p.c-author") %>%
    html_text(trim = TRUE)

  # data.frame() recycles shorter inputs when the lengths divide evenly, which
  # would silently attach bylines to the wrong headlines. Fail loudly instead.
  if (length(titles) != length(bylines)) {
    stop(
      "Found ", length(titles), " headline(s) but ", length(bylines),
      " byline(s); cannot pair them by position.",
      call. = FALSE
    )
  }

  # Keep the full column set even when nothing matched, so callers can still
  # refer to every column.
  if (length(titles) == 0L) {
    return(data.frame(
      标题 = character(0),
      发布者 = character(0),
      日期 = character(0),
      时间 = character(0)
    ))
  }

  # Stand-in date for bylines that split into exactly three pieces.
  today <- format(Sys.Date(), "%Y年%m月%d日")

  # Turn the whitespace-split pieces of one byline into
  # c(publisher, date, time). With exactly three pieces the middle one is
  # replaced by today's date; otherwise the second piece is dropped.
  # NOTE(review): presumably the 3-piece case is a relative timestamp and the
  # dropped piece is the empty string between two adjacent whitespace
  # characters -- confirm against the live markup.
  split_byline <- function(pieces) {
    fields <- if (length(pieces) == 3L) {
      c(pieces[1L], today, pieces[3L])
    } else {
      pieces[-2L]
    }
    # Indexing past the end pads with NA, so the result always has length 3.
    fields[seq_len(3L)]
  }

  # 3 x n character matrix: one column per result, rows = publisher/date/time.
  fields <- vapply(str_split(bylines, "\\s"), split_byline, character(3L))

  data.frame(
    标题 = titles,
    发布者 = fields[1L, ],
    日期 = fields[2L, ],
    时间 = fields[3L, ]
  )
}
# Scrape the hard-coded search query, order the rows by the date column and
# render the result as a table.
knitr::kable(
  arrange(
    news_rvest('http://news.baidu.com/ns?ct=1&rn=20&ie=utf-8&bs=%E6%8B%8D%E6%8B%8D%E8%B4%B7&rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=news&word=%E6%8B%8D%E6%8B%8D%E8%B4%B7'),
    `日期`
  )
)