본문 바로가기
R-크롤링

오늘의 유머 크롤링 코드

by 미스터탁 2019. 12. 12.



# Scrape the "bestofbest" board listing pages of todayhumor.co.kr and collect,
# for every listing line that matches `row_pattern`, the post title and its
# absolute URL.
#
# Result: `today_data`, a character matrix with the columns `title` and
# `base_url` (one row per post), or NULL when no page yielded any row.

library(stringr)  # attached once up front instead of on every iteration

n_pages <- 3  # number of listing pages to walk (pages 1..n_pages)
list_url <- "http://www.todayhumor.co.kr/board/list.php?table=bestofbest&page="

# NOTE(review): `row_pattern` is an empty string and `title_pattern` ends in an
# empty look-ahead `(?=)`. Both look truncated -- presumably HTML tags inside
# the literals were stripped when this script was published. As written,
# `row_pattern` cannot select the post rows and `title_pattern` runs to the end
# of the line. Restore the original markers before relying on the output.
row_pattern <- ""
title_pattern <- "(?<=target).*(?=)"        # everything after "target"
link_pattern <- "(?<=a href).*(?=target)"   # between "a href" and "target"

# One matrix per page, bound together once after the loop; this avoids
# re-copying the accumulated matrix with rbind() on every iteration.
page_rows <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  url <- paste0(list_url, i)
  b <- readLines(url, encoding = "UTF-8")

  # Keep only the lines that hold a post entry.
  b2 <- b[str_detect(b, row_pattern)]

  if (length(b2) == 0) {
    # Without this guard paste0() below would still return one bare base URL
    # and the page matrix would get the wrong shape.
    warning("No post rows found on listing page ", i, call. = FALSE)
    next
  }

  # Title: text following "target", keeping characters 9 onward (presumably
  # skipping the rest of the target attribute and the closing ">" -- confirm
  # against the live markup).
  b3 <- str_extract(b2, title_pattern)
  title <- str_sub(b3, 9)

  # Relative link: text between "a href" and "target" with the first two and
  # the last two characters removed (presumably `="` and `" `).
  b5 <- str_sub(str_extract(b2, link_pattern), 3, end = -3)
  base_url <- paste0("http://www.todayhumor.co.kr", b5)

  # The argument names become the matrix column names (and the CSV header).
  page_rows[[i]] <- cbind(title, base_url)

  cat("\n", i)  # progress indicator
}

# rbind() skips NULL entries, so pages that were skipped simply drop out.
today_data <- do.call(rbind, page_rows)


# Quick look at what the listing scrape produced.
dim(today_data)

head(today_data)

# Download every collected post and extract its body text.
#
# Result: `final_con`, a character vector parallel to the rows of
# `today_data`; an entry is NA when the body markers are not found in a page.
con_url <- today_data[, 2]  # second column holds the absolute post URLs

# Preallocate instead of growing the vector one element at a time.
final_con <- character(length(con_url))

# seq_along() keeps the loop from running when there are no URLs
# (1:length(x) would iterate over c(1, 0) in that case).
for (j in seq_along(con_url)) {
  b <- readLines(con_url[j], encoding = "UTF-8")

  # Lines mentioning "viewContent"; the first two hits are used as the start
  # and end of the post body.
  con_index <- which(str_detect(b, "viewContent"))

  if (length(con_index) < 2) {
    # Previously this case aborted the whole run with "NA/NaN argument".
    # Record a missing body instead so the remaining posts are still fetched
    # and the result stays aligned with `today_data`.
    warning("Post body markers not found in ", con_url[j], call. = FALSE)
    final_con[j] <- NA_character_
  } else {
    final_con[j] <- paste(b[con_index[1]:con_index[2]], collapse = "")
  }

  cat("\n", j)  # progress indicator
  # Sys.sleep(1)
}

# Remove HTML tags (non-greedy, so each tag is removed separately) and tabs.
final_con <- gsub("<.*?>", "", final_con)
final_con <- gsub("\t", "", final_con)
final_con


# Append the cleaned post bodies as a third column and save everything as CSV.

# There must be exactly one body per listing row; fail loudly rather than
# relying on a visual comparison of the printed sizes below.
stopifnot(nrow(today_data) == length(final_con))

final_today_data <- cbind(today_data, final_con)
dim(today_data)
length(final_con)
final_today_data

# Written to the current working directory. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
write.csv(final_today_data, "today_data.csv", row.names = FALSE)




반응형

댓글