Starting up workers.

library(parallel)
detectCores()
## [1] 8
cl <- makeCluster(detectCores())

Setting up workers.

clusterSetRNGStream(cl, 1234)  # setting random seed for random number generation on workers.
#set.seed(123) # is needed for random number generation on the master node.
#clusterEvalQ(cl, library(MASS)) # you have to load library on workers 
#clusterEvalQ(cl, source("20141115_test.R")) # this is too noisy, try the next line
#invisible(clusterCall(cl, source, "20141115_test.R")) # you have to load your own functions on workers.
#source("20141115_test.R")  # is needed if you use this file on the master node.

Preparing a sample data vector. We try bootstrap on this data.

xx = 1:1000  # data
nboot = 100003  # number of bootstrap replicates

Preparing bootstrap function. Simply resampling the data and taking the mean.

mymean <- function(x) mean(x)  # nothing but taking the mean of vectors
myboot <- function(nb, data, func) {
  xlen = length(data)
  result = numeric(nb)
  for(i in 1:nb) result[i] = func(data[sample(xlen, replace=TRUE)])
  result
}
myboot(10,xx,mymean) # check if it works
##  [1] 510.733 508.618 517.619 488.779 496.186 502.189 495.080 508.061
##  [9] 508.040 490.616

Preparing bootstrap jobs. (See the source code of the pvclust package in CRAN)

ncl = length(cl) # number of workers
nbl = as.list(rep(nboot %/% ncl, times=ncl)) #  =  floor( nboot / ncl )
if((rem <- nboot %% ncl) > 0) nbl[1:rem] <- lapply(nbl[1:rem], "+", 1) # correction of the remainder
nbl # numbers of bootstrap replicates on workers
## [[1]]
## [1] 12501
## 
## [[2]]
## [1] 12501
## 
## [[3]]
## [1] 12501
## 
## [[4]]
## [1] 12500
## 
## [[5]]
## [1] 12500
## 
## [[6]]
## [1] 12500
## 
## [[7]]
## [1] 12500
## 
## [[8]]
## [1] 12500
sum(as.numeric(nbl)) # check if sums up to nboot
## [1] 100003

Run bootstrap on workers. This may take a long time if you increase the number of replicates, but actually very fast for this simple example.

date() # starting time
## [1] "Sat Nov 15 16:47:07 2014"
results = parLapply(cl, nbl, myboot, data=xx, func=mymean)
date() # ending time
## [1] "Sat Nov 15 16:47:08 2014"

Looking at results.

results[[1]][1:10] # results from worker-1
##  [1] 501.412 498.797 504.167 510.600 488.406 499.824 488.674 508.645
##  [9] 496.560 506.116
results[[2]][1:10] # results from worker-2
##  [1] 506.544 491.602 487.103 500.113 495.464 492.601 502.972 508.463
##  [9] 492.818 494.137

Merging the results computed by workers.

length(results) # should be ncl
## [1] 8
replicates = unlist(results) # making a long vector
save(replicates,xx,file="20141115_test.rda") # i want to do this immediately for later use.
length(replicates) # should be nboot
## [1] 100003

See the histogram.

hist(replicates)

Stop clusters

stopCluster(cl)
#quit(save="no") # dont save the big workspace