본문 바로가기
R-크롤링

야구 뉴스 수집 크롤링

by 미스터탁 2019. 12. 12.



# Collect Naver Sports KBO news listings (oid, aid, title) for the previous
# five days, two listing pages per day, into the character matrix
# `final_data` (columns a1 = oid, a2 = aid, a3 = title).
library(RJSONIO)  # loaded once instead of on every loop iteration

n_days <- 5
n_pages <- 2

# Pull one field out of every article entry as a character vector. A missing
# field becomes NA so the three columns always have the same length.
get_field <- function(items, field) {
  vapply(
    items,
    function(x) {
      value <- x[[field]]
      if (is.null(value)) NA_character_ else as.character(value)[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# One slot per (day, page); bound together once at the end instead of
# growing the matrix with rbind() on every iteration.
pages <- vector("list", n_days * n_pages)
slot <- 0

for (j in seq_len(n_days)) {
  # The date depends only on the outer loop, so compute it once per day.
  date2 <- format(Sys.Date() - j, "%Y%m%d")

  for (i in seq_len(n_pages)) {
    list_url <- paste0(
      "https://sports.news.naver.com/kbaseball/news/list.nhn?date=",
      date2, "&isphoto=N&page=", i
    )

    # readLines() may split the response over several elements; join them
    # back into a single string before handing it to the JSON parser.
    raw_json <- paste(
      readLines(list_url, encoding = "UTF-8", warn = FALSE),
      collapse = "\n"
    )
    parsed <- fromJSON(raw_json)

    slot <- slot + 1
    pages[[slot]] <- cbind(
      a1 = get_field(parsed$list, "oid"),
      a2 = get_field(parsed$list, "aid"),
      a3 = get_field(parsed$list, "title")
    )

    cat("\n", date2, "-", i, "page 수집중")
  }
}

final_data <- do.call(rbind, pages)


# Switch to the output folder and save the collected listing (one row per
# article: a1 = oid, a2 = aid, a3 = title) as CSV.
# NOTE(review): hard-coded, machine-specific Windows path; the later relative
# write of "baseball.csv" in this script depends on this working directory.
setwd("D:\\인프런\\crawling")

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
write.csv(final_data, "baseball_news.csv", row.names = FALSE)

# Echo the collected ids and the table dimensions as a quick sanity check.
print(final_data[, 1])
print(final_data[, 2])
print(dim(final_data))

# Build one article URL per row from its oid (column 1) and aid (column 2).
con_url <- paste0(
  "https://sports.news.naver.com/news.nhn?oid=", final_data[, 1],
  "&aid=", final_data[, 2]
)

library(stringr)
print(con_url)

# Loop index and result holder used by the article download loop.
k <- 1
con <- NULL

# Extract the article body from the raw HTML lines of one article page.
#
# The body is taken from the first line containing the `newsEndContents`
# opening tag up to the first following line containing `news_end_btn`.
# Tags, tabs, spaces and stray angle brackets are then stripped.
# Returns NA_character_ when either marker is missing, instead of failing
# on an empty or multi-element `from:to` range.
extract_article_text <- function(html_lines) {
  start_idx <- which(str_detect(html_lines, "id=\"newsEndContents\">"))
  if (length(start_idx) == 0) {
    return(NA_character_)
  }
  start_idx <- start_idx[1]

  end_idx <- which(str_detect(html_lines, "news_end_btn"))
  end_idx <- end_idx[end_idx >= start_idx]
  if (length(end_idx) == 0) {
    return(NA_character_)
  }

  body <- paste(html_lines[start_idx:end_idx[1]], collapse = " ")
  body <- gsub("<.*?>", "", body)
  gsub("\t|>| |<", "", body)
}

# Download every article page and store its cleaned body text in `con`.
# seq_along() keeps the loop from running when there are no URLs, and a
# failed download is recorded as NA with a warning rather than aborting the
# whole crawl.
for (k in seq_along(con_url)) {
  page_lines <- tryCatch(
    readLines(con_url[k], encoding = "UTF-8", warn = FALSE),
    error = function(e) {
      warning(
        "Failed to read ", con_url[k], ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  con[k] <- if (is.null(page_lines)) {
    NA_character_
  } else {
    extract_article_text(page_lines)
  }

  cat("\n", k)
}

# Attach the article bodies to the listing and save the combined table.
# Guard against silent recycling: cbind() would repeat `con` if the article
# loop produced fewer entries than there are listing rows.
stopifnot(length(con) == NROW(final_data))

baseball_data <- cbind(final_data, con)
colnames(baseball_data) <- c("oid", "aid", "head", "cont")

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
write.csv(baseball_data, "baseball.csv", row.names = FALSE)

반응형

댓글