1 min read

新闻爬虫

library(rvest)
library(tidyverse)
library(rlist)
#' Scrape a news search-result page into a data frame
#'
#' Downloads the page at `url`, extracts every headline (`h3>a`) and the
#' publisher/timestamp line that accompanies it (`p.c-author`), and returns
#' one row per news item.
#'
#' @param url Address of the result page; passed straight to `read_html()`.
#' @return A data frame with the columns 标题 (headline), 发布者 (publisher),
#'   日期 (date) and 时间 (time). When an author line holds only a publisher
#'   and one further token (no separate date), the current date formatted as
#'   `%Y年%m月%d日` is used for 日期 and that token goes into 时间.
news_rvest <- function(url) {
  # Fetch and parse the page.
  page <- read_html(url, encoding = "utf-8")

  # Headlines, and the publisher/timestamp line attached to each of them.
  title <- page %>%
    html_nodes("h3>a") %>%
    html_text(trim = TRUE)
  author <- page %>%
    html_nodes("p.c-author") %>%
    html_text(trim = TRUE)

  # Fail early with a specific message rather than data.frame()'s generic
  # "differing number of rows" error when the two selectors disagree.
  if (length(title) != length(author)) {
    stop(
      "Found ", length(title), " headlines but ", length(author),
      " author lines; the page layout may have changed.",
      call. = FALSE
    )
  }

  today <- format(Sys.Date(), "%Y年%m月%d日")

  # Map the tokens of one author line to c(publisher, date, time).
  split_author <- function(tokens) {
    if (length(tokens) == 2L) {
      # Publisher plus a single time token: no date given, assume today.
      c(tokens[1], today, tokens[2])
    } else {
      # Publisher, date, time. Indexing 1:3 pads short lines with NA.
      tokens[1:3]
    }
  }

  # Split on runs of whitespace so the result does not depend on how many
  # separator characters sit between the publisher and the timestamp.
  parsed <- author %>%
    str_split("\\s+") %>%
    lapply(split_author)

  data.frame(
    标题 = title,
    发布者 = vapply(parsed, `[`, character(1), 1L),
    日期 = vapply(parsed, `[`, character(1), 2L),
    时间 = vapply(parsed, `[`, character(1), 3L)
  )
}
# Demo run: scrape the result page for the URL-encoded search word
# (`word=` decodes to 拍拍贷), order the rows by the 日期 column and render
# them as a table.
"http://news.baidu.com/ns?ct=1&rn=20&ie=utf-8&bs=%E6%8B%8D%E6%8B%8D%E8%B4%B7&rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=news&word=%E6%8B%8D%E6%8B%8D%E8%B4%B7" %>%
  news_rvest() %>%
  arrange(`日期`) %>%
  knitr::kable()