library(mrgsim.ds)
library(DBI)
library(duckdb)
library(parallel)
library(dplyr)1 Introduction
This is a quick blog post showing how you can summarize a very large set of simulations using duckdb.
2 Setup
Set up a large simulation; we’ll check the number of rows in the output once it finishes.
- 3000 subjects
- 2 dose levels
- 28 days of dosing
- Sample every hour
mod <- modlib_ds("popex")Building popex ... done.
mod <- update(mod, outvars = "IPRED", end = 4*168, delta = 1)
data <- expand.ev(
ID = 1:3000,
amt = c(300, 1000),
ii = 24,
addl = 27
)
data <- mutate(data, dose = amt)3 Simulate
Simulate 1000 replicates
unlink("temp-post", recursive = TRUE)
arrow::set_io_thread_count(2)
out <- mclapply(
1:1000,
\(x) {
mrgsim_ds(mod, data, tags = list(rep = x), recover = "dose")
},
mc.cores = 5
)
out <- reduce_ds(out)
save_ds(out, "temp-post/duckdb.rds")There are 4 billion rows in this simulation set across 1000 parquet files totaling 31 Gb of data on disk that is highly compressed.
sims <- read_ds("temp-post/duckdb.rds")
simsModel: popex
Dim : 4.0B x 5
Files: 1000 [31.1 Gb]
Owner: yes (no gc)
ID time IPRED dose rep
1: 1 0 0.000000 300 1
2: 1 0 0.000000 300 1
3: 1 1 5.335456 300 1
4: 1 2 7.536079 300 1
5: 1 3 8.399812 300 1
6: 1 4 8.694028 300 1
7: 1 5 8.746235 300 1
8: 1 6 8.696203 300 1
4 Summarize
Summarize using duckdb; we’ll take median, 5th and 95th percentiles of the Week 4 trough concentration by dose and replicate.
con <- dbConnect(duckdb())
ds <- as_arrow_ds(sims) # or pass a vector of paths
duckdb_register_arrow(con, "sims", ds)
# Query normally
system.time(
summ <- dbGetQuery(con, "
SELECT
dose,
time,
rep,
COUNT(*) AS n,
QUANTILE_CONT(IPRED, 0.025) AS Lower,
QUANTILE_CONT(IPRED, 0.5) AS Median,
QUANTILE_CONT(IPRED, 0.975) AS Upper
FROM sims
WHERE time = 672
GROUP BY time, dose, rep
ORDER BY time, dose, rep
")
) user system elapsed
62.202 9.311 13.471
dbDisconnect(con, shutdown = TRUE)as_tibble(summ)# A tibble: 2,000 × 7
dose time rep n Lower Median Upper
<dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 300 672 1 3000 0.855 7.97 31.4
2 300 672 2 3000 0.756 7.85 30.3
3 300 672 3 3000 0.870 8.10 30.3
4 300 672 4 3000 0.734 8.25 31.9
5 300 672 5 3000 0.781 7.77 31.0
6 300 672 6 3000 0.644 8.16 32.6
7 300 672 7 3000 0.872 8.10 32.5
8 300 672 8 3000 0.874 8.20 32.7
9 300 672 9 3000 0.751 8.24 32.2
10 300 672 10 3000 0.823 8.03 30.9
# ℹ 1,990 more rows