-
Notifications
You must be signed in to change notification settings - Fork 0
/
randomPgLoader.R
35 lines (32 loc) · 989 Bytes
/
randomPgLoader.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#----------Load packages
# Packages that must be attached: the code stored in `loadWebPage` and the body
# of `saveWebPage` call their functions without a `pkg::` prefix.
pkg <- c("wrapr", "XML")
# RCurl is only reached through `RCurl::getURL()`, so it has to be installed
# but does not need to be attached.
for (i in c(pkg, "RCurl")) {
  # system.file() returns "" for a package that is not installed; this avoids
  # scanning every installed package with installed.packages().
  if (!nzchar(system.file(package = i))) {
    # Install into the first library path (install.packages' documented
    # default) rather than passing the whole .libPaths() vector.
    install.packages(i, lib = .libPaths()[1L], dependencies = TRUE)
  }
}
# library() stops with an error when a package cannot be attached, whereas
# require() would only return FALSE and let the script fail later.
for (i in pkg) {
  library(i, character.only = TRUE)
}
#----------Load a webpage
# `loadWebPage` is a character string of R source, not a function. It is meant
# to be run with eval(parse(text = loadWebPage)), as saveWebPage does.
# NOTE(review): `let` is presumably wrapr::let; with alias x = 'webpage' the
# expression is expected to act on a variable named `webpage` that must exist
# in the evaluating environment -- confirm against the wrapr documentation.
# Steps performed by the stored expression:
#   1. iconv() converts the text to UTF-8.
#   2. sub() replaces the first occurrence of 'ISO-8859-1' with 'utf-8'.
#   3. htmlTreeParse() parses the text itself (asText = T) rather than a file;
#      `useInternal` presumably partial-matches `useInternalNodes` -- confirm.
#   4. xmlRoot(doc) is the last expression, so it is the value of the eval().
# Anything written inside the quotes becomes part of the evaluated code, so
# comments are kept out here.
loadWebPage<-"let(alias=list(
x='webpage'
),expr={
x <- iconv(x, to='UTF-8')
x <- sub('ISO-8859-1', 'utf-8', x)
doc<-htmlTreeParse(x, useInternal = T, encoding='UTF-8', asText=T)
xmlRoot(doc)
}
)"
#----------Scrape a webpage
#' Download one page after a randomised delay and save it as compressed XML.
#'
#' Sleeps for a random number of seconds, fetches `url` with
#' `RCurl::getURL()`, parses the response by evaluating the code stored in the
#' string `loadWebPage`, and writes the result with `saveXML()` to
#' `paste0(wkDir, fPrefix, id, ".xml")`, where `id` is `i + 1` zero-padded to
#' the number of characters in `totPage + 1`.
#'
#' Relies on two objects defined outside the function: `wkDir` (prefix of the
#' output path) and `loadWebPage` (parsing code held as a string).
#'
#' @param i Page counter; the page is reported and numbered as `i + 1`.
#' @param url Address handed to `RCurl::getURL()`. Required.
#' @param totPage Used only to derive the zero-padding width of the file id.
#' @param fPrefix Text placed between `wkDir` and the id in the file name.
#' @param upperT Upper end of the integer range `20:upperT` from which the
#'   base delay (in seconds) is drawn.
#' @return The value of the `saveXML()` call.
saveWebPage <- function(i = 1, url, totPage = 1, fPrefix = "/trialSearch",
                        upperT = 120) {
  # Fail fast: without these checks a missing argument or global is only
  # discovered after the (20 s or longer) sleep below.
  if (missing(url)) {
    stop("argument \"url\" is missing, with no default", call. = FALSE)
  }
  for (needed in c("wkDir", "loadWebPage")) {
    if (!exists(needed)) {
      stop("'", needed, "' must be defined before calling saveWebPage()",
           call. = FALSE)
    }
  }
  force(url)

  # XML must be attached: the code in `loadWebPage` and saveXML() below are
  # called without a namespace prefix.
  library(XML)

  # Index into the range explicitly: sample(20:upperT, 1) collapses to
  # sample(20, 1), i.e. a draw from 1:20, when upperT == 20.
  delayRange <- 20:upperT
  baseDelay <- delayRange[sample.int(length(delayRange), 1)]
  jitter <- runif(1)
  # Pick either the sum or the product of the two random parts.
  delay <- sample(c(jitter + baseDelay, jitter * baseDelay), 1)
  # The product can fall below 20; shift it so the wait is never under 20 s.
  if (delay < 20) {
    delay <- delay + 20
  }

  print(paste("Streaming page", (i + 1), "in", round(delay, 2), "seconds:)..."))
  flush.console()
  Sys.sleep(delay)

  # The local must be called `webpage`: the code in `loadWebPage` refers to
  # that name when it is evaluated in this function's frame.
  webpage <- RCurl::getURL(url)
  # `loadWebPage` is a script-defined string of R code, not external input.
  doc <- eval(parse(text = loadWebPage))

  id <- sprintf(paste0("%0", nchar(totPage + 1), "d"), i + 1)
  saveXML(doc, file = paste0(wkDir, fPrefix, id, ".xml"), compression = 9)
}