Run ABC-SMC inference in parallel

Usage

abcsmc(
  model_list = list(),
  model_def = NULL,
  prior_dist = list(),
  ss_obs = NA,
  max_number_of_gen = 15,
  nb_acc_prtcl_per_gen = 1000,
  var_prtrbtn_krnl = NA,
  var_prtrbtn_krnl_min = 0.01,
  scaling_sd_prtrbtn_krnl = 1,
  model_jump_prob = 0.1,
  nb_threshold = 1,
  new_threshold_quantile = 0.8,
  distance_threshold_min = 0.01,
  max_attempts = 1e+05,
  acceptance_rate_min = 0.01,
  use_lhs_for_first_iter = TRUE,
  experiment_folderpath = "./",
  on_cluster = FALSE,
  cluster_type = NULL,
  slurm_script_template =
    "#!/bin/bash\n#SBATCH --ntasks=1\n#SBATCH --ntasks-per-node=1\n#SBATCH --hint=nomultithread\n#SBATCH --time=24:00:00\n# THE FOLLOWING SECTION SHOULD NOT BE MODIFIED\n#SBATCH --job-name=job-array_%%A_%%a   # nom du job\n#SBATCH --array=%s-%s%%%d\noutput_fpath=%s\nerror_fpath=%s\n#SBATCH --output=$output_fpath/output_%%A_%%a.out\n#SBATCH --error=$error_fpath/error_%%A_%%a.out\nmkdir -p $output_fpath\nmkdir -p $error_fpath\nRscript %s $SLURM_ARRAY_TASK_ID\n",
  sge_script_template =
    "#!/bin/bash\n#$ -S /bin/bash\n#$ -N subjob_abcsmc_prlll\n# #$ -q \"short.q|long.q\"\n# THE FOLLOWING SECTION SHOULD NOT BE MODIFIED\n#$ -cwd\n#$ -V\n#$ -t %s-%s\n#$ -tc %d\n#$ -o /dev/null\n#$ -e /dev/null\noutput_fpath=%s\nerror_fpath=%s\nmkdir -p $output_fpath\nmkdir -p $error_fpath\nRscript %s $SGE_TASK_ID >$output_fpath/subjob.${SGE_TASK_ID}.out 2>$error_fpath/subjob.${SGE_TASK_ID}.err\n",
  max_concurrent_jobs = 1,
  previous_gens = NA,
  previous_epsilons = NA,
  verbose = FALSE,
  progressbar = FALSE
)

Arguments

model_list: a list linking model name ( character string) to associated function
model_def: a R file containing only the model(s) function(s)
prior_dist: a list linking model name (character string) to a list describing the prior distribution of each parameter to be estimated
ss_obs: the observed summary statistics
max_number_of_gen: the maximum number of generations to be performed
nb_acc_prtcl_per_gen: the number of particles (per model, the total number corresponding to this number multiplied by the number of models) to be accepted during a generation before moving on to the next one
var_prtrbtn_krnl: the standard deviation to be used for the perturbation kernel, if NA the empirical standard deviation will be used instead
var_prtrbtn_krnl_min: the minimum standard deviation to be used for the perturbation kernel (must be greater than zero)
scaling_sd_prtrbtn_krnl: scaling parameter for increasing or decreasing the value of the standard deviation used for the perturbation kernel
model_jump_prob: probability of changing model when creating a new particle
nb_threshold: number of thresholds used, corresponding to the number of distances returned by the function(s) stipulated in the model list (model_list)
new_threshold_quantile: the quantile to be used to decrease the threshold(s) during iterations
distance_threshold_min: the minimum threshold below which the procedure stops (in order to prevent excessively long computations)
max_attempts: the maximum number of particles to be tested during an iteration, beyond which the procedure stops (in order to prevent excessively long computations)
acceptance_rate_min: the acceptance rate below which the procedure stops (in order to prevent excessively long computations)
use_lhs_for_first_iter: whether or not to use a LHS sampling method for the first iteration
experiment_folderpath: the folder in which to carry out the estimation procedure and save the results
on_cluster: whether or not the procedure is run on a computation cluster
cluster_type: cluster type used (sge and slurm currently supported)
slurm_script_template: script used to launch jobs on a slurm cluster
sge_script_template: script used to launch jobs on a sge cluster
max_concurrent_jobs: maximum number of jobs/tasks run in parallel
previous_gens: an object (dataframe) containing previous results (set of iterations), in order to start from the last iteration performed
previous_epsilons: an object (dataframe) containing previous results (set of thresholds), in order to start from the last iteration performed
verbose: whether or not to display specific information
progressbar: whether or not to display progressbar

Value

a list containing two dataframes corresponding to (1) the particles accepted and (2) the thresholds used, during the successive iterations

Examples

library(BRREWABC)

tmp_dir <- tempdir()

# model definition
compute_dist = function(x, ss_obs){
    ss_sim = c( x[["alpha"]] + x[["beta"]] + rnorm(1,0,0.1),
           x[["alpha"]] * x[["beta"]] + rnorm(1,0,0.1) ) # a simple toy model
    dist = sum((ss_sim-ss_obs)^2)
    return(c(dist))
}

MODEL_LIST <- list("m1" = compute_dist)
PRIOR_DIST <- list("m1" = list(c('alpha', 'unif', 0, 4),
                               c('beta', 'unif', 0, 1)))

# create a reference trajectory
sum_stat_obs = c(2.0,0.75)

# run abc smc procedure
res = abcsmc(model_list = MODEL_LIST, prior_dist = PRIOR_DIST,
ss_obs = sum_stat_obs, max_number_of_gen = 20, nb_acc_prtcl_per_gen = 2000,
new_threshold_quantile = 0.8, experiment_folderpath = tmp_dir,
max_concurrent_jobs = 2, verbose = FALSE)
#> The distance threshold(s) (epsilon(s)) fall(s) below the predetermined min value!
#> [1] 0.009374698

# get results and plots
all_accepted_particles = res$particles
all_thresholds = res$thresholds
plot_abcsmc_res(data = all_accepted_particles, prior = PRIOR_DIST, colorpal = "YlOrBr", filename = file.path(tmp_dir, "abcsmc_results.png"))
#> [1] "Number of generations exceed the threshold (15) allowed by ggpairs, it may cause long processing times. You may (re)define the iter argument to choose which generations to plot."
#> [1] "Plot saved as 'png'."
#> Registered S3 method overwritten by 'GGally':
#>   method from   
#>   +.gg   ggplot2
plot_thresholds(data = all_thresholds, nb_threshold = 1, colorpal = "YlOrBr", filename = file.path(tmp_dir, "thresholds.png"))
#> [1] "Plot saved as 'png'."
plot_ess(data = all_accepted_particles, colorpal = "YlOrBr", filename = file.path(tmp_dir, "ess.png"))
#> [1] "Plot saved as 'png'."
#>    gen      ess
#> 1    1 2000.000
#> 2    2 1889.757
#> 3    3 1909.471
#> 4    4 1913.078
#> 5    5 1927.955
#> 6    6 1928.197
#> 7    7 1924.893
#> 8    8 1916.789
#> 9    9 1911.543
#> 10  10 1901.440
#> 11  11 1885.895
#> 12  12 1900.061
#> 13  13 1855.863
#> 14  14 1852.178
#> 15  15 1839.835
#> 16  16 1831.404
#> 17  17 1812.480
#> 18  18 1770.146
#> 19  19 1754.894
#> 20  20 1726.200
plot_densityridges(data = all_accepted_particles, prior = PRIOR_DIST, colorpal = "YlOrBr", filename = file.path(tmp_dir, "densityridges.png"))
#> [1] "Plot saved as 'png'."