# Today's Humor (todayhumor.co.kr) crawling code
# ---- Step 1: collect post titles and links from the list pages -------------
# Reads list pages 1-3 of the todayhumor.co.kr "bestofbest" board and builds
# `today_data`, a character matrix with the columns `title` and `base_url`
# (one row per post line found).
library(stringr)  # attach once up front instead of on every loop iteration

site_root <- "http://www.todayhumor.co.kr"
list_pages <- 1:3

# NOTE(review): in the previous revision the line filter was the empty
# pattern "" (stringr >= 1.5.0 rejects it with an error) and the title regex
# ended in an empty lookahead "(?=)", which never bounds the match. Both look
# like HTML fragments that were stripped from the script. The row marker and
# the "</a>" delimiter below are assumptions about the list-page markup -
# TODO confirm against the live HTML.
row_marker <- fixed('<td class="subject">')
title_regex <- "(?<=target).*?(?=</a>)"  # text between "target" and "</a>"
link_regex <- "(?<=a href).*(?=target)"  # text between "a href" and "target"

# One matrix per page, bound together once after the loop rather than
# growing `today_data` with rbind() on every iteration.
page_results <- vector("list", length(list_pages))
for (k in seq_along(list_pages)) {
  i <- list_pages[[k]]
  url <- paste0(site_root, "/board/list.php?table=bestofbest&page=", i)
  b <- readLines(url, encoding = "UTF-8")

  # which() also discards NA detections, so no NA rows are selected.
  b2 <- b[which(str_detect(b, row_marker))]

  # Skip the first 8 characters of the match, i.e. whatever sits between
  # "target" and the title text (presumably `="_top">` - verify).
  title <- str_sub(str_extract(b2, title_regex), 9)

  # Trim 2 characters from each end of the match (presumably the leading `="`
  # and the trailing quote + space - verify) to leave the bare path.
  b5 <- str_sub(str_extract(b2, link_regex), 3, end = -3)

  # Lines where either regex failed would otherwise produce NA titles and
  # unusable "...co.krNA" URLs, so they are dropped here.
  keep <- !is.na(title) & !is.na(b5)
  page_results[[k]] <- cbind(
    title = title[keep],
    base_url = paste0(site_root, b5[keep])
  )
  cat("\n", i)  # progress: page number just processed
}
today_data <- do.call(rbind, page_results)

# Quick look at what was collected.
dim(today_data)
head(today_data)
# ---- Step 2: download the body of every collected post ---------------------
# Builds `final_con`, a character vector with one raw HTML fragment per row of
# `today_data`. Entries stay NA when a page cannot be read or contains no
# "viewContent" line, so the vector always lines up with `today_data`.
con_url <- today_data[, 2]  # second column holds the post URLs

# Preallocated to the final length; seq_along() makes the loop a no-op when
# there are no URLs (1:length(x) would iterate over 1, 0 instead).
final_con <- rep(NA_character_, length(con_url))
for (j in seq_along(con_url)) {
  # A single unreachable page should not abort the whole crawl: report it and
  # leave this slot as NA.
  b <- tryCatch(
    readLines(con_url[j], encoding = "UTF-8"),
    error = function(e) {
      warning(
        "Failed to read ", con_url[j], ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  con_index <- if (is.null(b)) {
    integer(0)
  } else {
    which(str_detect(b, "viewContent"))
  }

  if (length(con_index) > 0) {
    # The body is taken from the first to the second line mentioning
    # "viewContent"; when only one such line exists, that line alone is used
    # (indexing con_index[2] there would yield NA and make `:` fail).
    last_line <- con_index[min(2L, length(con_index))]
    final_con[j] <- paste(b[con_index[1]:last_line], collapse = "")
  }

  cat("\n", j)  # progress: index of the post just processed
  # Sys.sleep(1)  # uncomment to pause between requests
}
# ---- Step 3: clean the post bodies and export everything -------------------
# Removes anything that looks like an HTML tag (shortest "<...>" spans) and
# all tab characters from the downloaded bodies.
final_con <- gsub("<.*?>", "", final_con)
final_con <- gsub("\t", "", final_con)
final_con

# cbind() would silently recycle `final_con` if its length did not match the
# number of rows, pairing bodies with the wrong titles; fail loudly instead.
if (NROW(today_data) != length(final_con)) {
  stop(
    "today_data has ", NROW(today_data), " rows but final_con has ",
    length(final_con), " elements",
    call. = FALSE
  )
}
final_today_data <- cbind(today_data, final_con)

# Sanity output: both sizes should agree.
dim(today_data)
length(final_con)
final_today_data

# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
write.csv(final_today_data, "today_data.csv", row.names = FALSE)