Summarize simulated data with duckdb

duckdb provides a high-performance to summarize massive data sets.

duckdb
Author

Kyle Baron

Published

May 6, 2026

1 Introduction

This is a quick blog post showing how you can summarize a very large set of simulations using duckdb.

2 Setup

library(mrgsim.ds)
library(DBI)
library(duckdb)
library(parallel)
library(dplyr)

Set up a large simulation; we’ll check the number of rows in the output once it finishes.

  • 3000 subjects
  • 2 dose levels
  • 28 days of dosing
  • Sample every hour
mod <- modlib_ds("popex")
Building popex ... done.
mod <- update(mod, outvars = "IPRED", end = 4*168, delta = 1)

data <- expand.ev(
  ID = 1:3000,
  amt = c(300, 1000), 
  ii = 24, 
  addl = 27
)

data <- mutate(data, dose = amt)

3 Simulate

Simulate 1000 replicates

unlink("temp-post", recursive = TRUE)

arrow::set_io_thread_count(2)

out <- mclapply(
  1:1000, 
  \(x) {
    mrgsim_ds(mod, data, tags = list(rep = x), recover = "dose")
  }, 
  mc.cores = 5
)

out <- reduce_ds(out)

save_ds(out, "temp-post/duckdb.rds")

There are 4 billion rows in this simulation set across 1000 parquet files totaling 31 Gb of data on disk that is highly compressed.

sims <- read_ds("temp-post/duckdb.rds")

sims
Model: popex
Dim  : 4.0B x 5
Files: 1000 [31.1 Gb]
Owner: yes (no gc)
    ID time    IPRED dose rep
1:   1    0 0.000000  300   1
2:   1    0 0.000000  300   1
3:   1    1 5.335456  300   1
4:   1    2 7.536079  300   1
5:   1    3 8.399812  300   1
6:   1    4 8.694028  300   1
7:   1    5 8.746235  300   1
8:   1    6 8.696203  300   1

4 Summarize

Summarize using duckdb; we’ll take median, 5th and 95th percentiles of the Week 4 trough concentration by dose and replicate.

con <- dbConnect(duckdb())

ds <- as_arrow_ds(sims)  # or pass a vector of paths

duckdb_register_arrow(con, "sims", ds)

# Query normally
system.time(
  summ <- dbGetQuery(con, "
    SELECT
      dose,
      time,
      rep, 
      COUNT(*)                     AS n,
      QUANTILE_CONT(IPRED, 0.025)  AS Lower,
      QUANTILE_CONT(IPRED, 0.5)    AS Median,
      QUANTILE_CONT(IPRED, 0.975)  AS Upper
    FROM sims
    WHERE time = 672
    GROUP BY time, dose, rep
    ORDER BY time, dose, rep
  ")
)
   user  system elapsed 
 62.202   9.311  13.471 
dbDisconnect(con, shutdown = TRUE)
as_tibble(summ)
# A tibble: 2,000 × 7
    dose  time   rep     n Lower Median Upper
   <dbl> <dbl> <int> <dbl> <dbl>  <dbl> <dbl>
 1   300   672     1  3000 0.855   7.97  31.4
 2   300   672     2  3000 0.756   7.85  30.3
 3   300   672     3  3000 0.870   8.10  30.3
 4   300   672     4  3000 0.734   8.25  31.9
 5   300   672     5  3000 0.781   7.77  31.0
 6   300   672     6  3000 0.644   8.16  32.6
 7   300   672     7  3000 0.872   8.10  32.5
 8   300   672     8  3000 0.874   8.20  32.7
 9   300   672     9  3000 0.751   8.24  32.2
10   300   672    10  3000 0.823   8.03  30.9
# ℹ 1,990 more rows