
The aim of this document is to compare the results from various other R packages with those produced using outstandR. There are already a few widely-used R packages on CRAN to perform model-based standardization, such as {marginaleffects} and {stdReg}/{stdReg2}.

marginaleffects

The {marginaleffects} package provides functions to calculate G-computation estimates, in particular avg_predictions() and avg_comparisons().

##TODO: see unit tests
library(marginaleffects)

# marginaleffects::avg_predictions()
# marginaleffects::avg_comparisons()
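
As a minimal sketch (not run, and not yet part of the comparison), using a deliberately simplified model of the NHEFS data introduced in the {stdReg2} section below, the calls would look something like this:

# simplified outcome model on the NHEFS data (the full model is fitted in the next section)
library(causaldata)

fit <- glm(wt82_71 ~ qsmk + sex + race + age,
           data = causaldata::nhefs_complete)

# average predicted outcome with qsmk set to 0 and to 1 for everyone
avg_predictions(fit, variables = list(qsmk = 0:1))

# G-computation estimate of the average treatment effect (difference scale)
avg_comparisons(fit, variables = list(qsmk = c(0, 1)))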

stdReg2

The package {stdReg} has been rewritten to produce {stdReg2}, which has “nicer output, more available methods, the possibility to include new methods, and mainly to make maintenance and updating easier.” For this reason, we will use {stdReg2}.

We shall follow the article Estimation of causal effects using stdReg2. This example has a continuous outcome (change in weight) and a regression with covariates transformed using the I() function. Two contrasts are calculated, the difference and the ratio. The data also contain categorical variables with more than two levels: education, exercise and active.

library(stdReg2)
library(causaldata)  # contains data set

# load data set
nhefs_dat <- causaldata::nhefs_complete

m <- glm(wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
           smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
         data = nhefs_dat)
summary(m)
#> 
#> Call:
#> glm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + 
#>     as.factor(education) + smokeintensity + I(smokeintensity^2) + 
#>     smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + 
#>     wt71 + I(wt71^2), data = nhefs_dat)
#> 
#> Coefficients:
#>                         Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)           -1.6586176  4.3137734  -0.384 0.700666    
#> qsmk                   3.4626218  0.4384543   7.897 5.36e-15 ***
#> sex1                  -1.4650496  0.4683410  -3.128 0.001792 ** 
#> race1                  0.5864117  0.5816949   1.008 0.313560    
#> age                    0.3626624  0.1633431   2.220 0.026546 *  
#> I(age^2)              -0.0061377  0.0017263  -3.555 0.000389 ***
#> as.factor(education)2  0.8185263  0.6067815   1.349 0.177546    
#> as.factor(education)3  0.5715004  0.5561211   1.028 0.304273    
#> as.factor(education)4  1.5085173  0.8323778   1.812 0.070134 .  
#> as.factor(education)5 -0.1708264  0.7413289  -0.230 0.817786    
#> smokeintensity         0.0651533  0.0503115   1.295 0.195514    
#> I(smokeintensity^2)   -0.0010468  0.0009373  -1.117 0.264261    
#> smokeyrs               0.1333931  0.0917319   1.454 0.146104    
#> I(smokeyrs^2)         -0.0018270  0.0015438  -1.183 0.236818    
#> as.factor(exercise)1   0.3206824  0.5349616   0.599 0.548961    
#> as.factor(exercise)2   0.3628786  0.5589557   0.649 0.516300    
#> as.factor(active)1    -0.9429574  0.4100208  -2.300 0.021593 *  
#> as.factor(active)2    -0.2580374  0.6847219  -0.377 0.706337    
#> wt71                   0.0373642  0.0831658   0.449 0.653297    
#> I(wt71^2)             -0.0009158  0.0005235  -1.749 0.080426 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for gaussian family taken to be 53.59474)
#> 
#>     Null deviance: 97176  on 1565  degrees of freedom
#> Residual deviance: 82857  on 1546  degrees of freedom
#> AIC: 10701
#> 
#> Number of Fisher Scoring iterations: 2

res_stdReg2 <- standardize_glm(
  wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
    smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2),
  data = nhefs_dat, 
  values = list(qsmk = c(0,1)),
  contrasts = c("difference", "ratio"),
  reference = 0)

res_stdReg2
#> Outcome formula: wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + 
#>     smokeintensity + I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
#>     as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2)
#> Outcome family: gaussian 
#> Outcome link function: identity 
#> Exposure:  qsmk 
#> 
#> Tables: 
#>   qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1    0     1.75     0.217       1.32       2.17
#> 2    1     5.21     0.420       4.39       6.03
#> 
#> Reference level:  qsmk = 0 
#> Contrast:  difference 
#>   qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1    0     0.00     0.000       0.00       0.00
#> 2    1     3.46     0.466       2.55       4.38
#> 
#> Reference level:  qsmk = 0 
#> Contrast:  ratio 
#>   qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1    0     1.00     0.000       1.00       1.00
#> 2    1     2.98     0.435       2.13       3.83

Most simply, we can drop the categorical variables with more than two levels (education, exercise and active) and run outstandR().

library(dplyr)

nhefs_ipd <- nhefs_dat |> 
  # drop categorical variables
  dplyr::select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71, wt82_71) |>
  dplyr::rename(trt = qsmk,
                y = wt82_71) |> 
  dplyr::mutate(sex = as.numeric(sex) - 1,
                race = as.numeric(race) - 1,
                trt = factor(trt, labels = c("C", "A")))

# create aggregate data

nhefs.X <- nhefs_ipd %>%
  summarise(across(-c(trt, y),
                   list(mean = mean, sd = sd),
                   .names = "{fn}.{col}"))

nhefs.B <- nhefs_ipd |> 
  dplyr::filter(trt == "A") |>   # trt is now a factor; "A" corresponds to qsmk == 1
  summarise(y.B.sum = sum(y),
            y.B.bar = mean(y),
            y.B.sd = sd(y),
            N.B = n())

nhefs.C <- nhefs_ipd |> 
  dplyr::filter(trt == "C") |>   # "C" corresponds to qsmk == 0
  summarise(y.C.sum = sum(y),
            y.C.bar = mean(y),
            y.C.sd = sd(y),
            N.C = n())

nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)

# simplified formula
lin_form <-
  as.formula(y ~ trt * (sex + race + age + I(age^2) +
                          smokeintensity + I(smokeintensity^2) +
                          smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <- 
  strategy_gcomp_ml(formula = lin_form,
                    family = gaussian(link = "identity"))

res <- outstandR(ald_trial = nhefs_ald,
                 ipd_trial = nhefs_ipd,
                 strategy = nhefs_strat,
                 scale = "log_relative_risk")
res
#> Object of class 'outstandR' 
#> Model: gaussian 
#> Scale: log_relative_risk 
#> Common treatment: C 
#> Individual patient data study: AC 
#> Aggregate level data study: BC 
#> Confidence interval level: 0.95 
#> 
#> Contrasts:
#> 
#> # A tibble: 3 × 5
#>   Treatments Estimate Std.Error lower.0.95 upper.0.95
#>   <chr>         <dbl>     <dbl>      <dbl>      <dbl>
#> 1 AB           NaN     NaN         NaN         NaN   
#> 2 AC             1.07    0.0198      0.790       1.34
#> 3 BC           NaN     NaN         NaN         NaN   
#> 
#> Absolute:
#> 
#> # A tibble: 2 × 5
#>   Treatments Estimate Std.Error lower.0.95 upper.0.95
#>   <chr>         <dbl>     <dbl> <lgl>      <lgl>     
#> 1 A              5.26    0.192  NA         NA        
#> 2 C              1.82    0.0392 NA         NA

Alternatively, rather than dropping factors, we can create dummy variables to replace them using maicplus::dummize_ipd().

nhefs_ipd <- nhefs_dat |> 
  dplyr::select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71,
                wt82_71, education, exercise, active) |> 
  dplyr::rename(trt = qsmk,
                y = wt82_71) |> 
  dplyr::mutate(sex = as.numeric(sex) - 1,       # binary
                race = as.numeric(race) - 1) |> 
  maicplus::dummize_ipd(dummize_cols = c("education", "exercise", "active"),
                        dummize_ref_level = c("1","0","0")) |> 
  dplyr::select(-education, -exercise, -active) |>   # remove originals
  dplyr::mutate(trt = factor(trt, labels = c("C", "A")))
  
# # replace with this?
# model.matrix()

# create aggregate data

nhefs.X <- nhefs_ipd %>%
  summarise(across(-c(trt, y),
                   list(mean = mean, sd = sd),
                   .names = "{fn}.{col}"))

nhefs.B <- dplyr::filter(nhefs_ipd, trt == "A") |>   # qsmk == 1
  summarise(y.B.sum = sum(y),
            y.B.bar = mean(y),
            y.B.sd = sd(y),
            N.B = n())

nhefs.C <- dplyr::filter(nhefs_ipd, trt == "C") |>   # qsmk == 0
  summarise(y.C.sum = sum(y),
            y.C.bar = mean(y),
            y.C.sd = sd(y),
            N.C = n())

nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)

lin_form <-
  as.formula(y ~ trt * (sex + race + age + I(age^2) +
                          smokeintensity + I(smokeintensity^2) +
                          EDUCATION_2 + EDUCATION_3 + EDUCATION_4 + EDUCATION_5 +
                          EXERCISE_1 + EXERCISE_2 + ACTIVE_1 + ACTIVE_2 +
                          smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <- 
  strategy_gcomp_ml(formula = lin_form,
                    family = gaussian(link = "identity"))

res <- outstandR(ald_trial = nhefs_ald,
                 ipd_trial = nhefs_ipd,
                 strategy = nhefs_strat,
                 scale = "risk_difference")
res
#> Object of class 'outstandR' 
#> Model: gaussian 
#> Scale: risk_difference 
#> Common treatment: C 
#> Individual patient data study: AC 
#> Aggregate level data study: BC 
#> Confidence interval level: 0.95 
#> 
#> Contrasts:
#> 
#> # A tibble: 3 × 5
#>   Treatments Estimate Std.Error lower.0.95 upper.0.95
#>   <chr>         <dbl>     <dbl>      <dbl>      <dbl>
#> 1 AB           NaN       NA          NA         NA   
#> 2 AC             3.59     0.235       2.64       4.54
#> 3 BC           NaN       NA          NA         NA   
#> 
#> Absolute:
#> 
#> # A tibble: 2 × 5
#>   Treatments Estimate Std.Error lower.0.95 upper.0.95
#>   <chr>         <dbl>     <dbl> <lgl>      <lgl>     
#> 1 A              5.27    0.192  NA         NA        
#> 2 C              1.68    0.0437 NA         NA
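
For reference, this marginal AC difference of 3.59 (95% CI 2.64 to 4.54) is close to the standardized difference of 3.46 (2.55 to 4.38) obtained from standardize_glm() above.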

The 2024 ISPOR poster by Hatswell presents a comparison of different MAIC R packages, which we shall use as our starting point. Of the packages considered, {MAIC} is not on CRAN, so it is not included in our analysis at the moment.

maicplus

Roche developed the {maic} package, which is now deprecated and has been replaced by {maicplus}. To make our comparison, we will follow the article Anchored Binary Analysis from the {maicplus} package documentation.

library(maicplus)

# Load required data
data(centered_ipd_twt)
data(adrs_twt)  # response IPD
data(adsl_sat)  # covariates IPD
data(adsl_twt)  # covariates IPD for the two-arm trial, used with outstandR below

# Load required package
library(dplyr)

# Prepare matching variables (only using means, ignoring SDs)
adsl_sat <- adsl_sat |> 
  mutate(
    SEX_MALE = as.integer(SEX == "Male"),
    AGE_SQUARED = AGE^2
  )

# Define variable names for matching (excluding SDs)
adsl_colnames <- c("AGE", "AGE_SQUARED", "SEX_MALE", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN")
centered_colnames <- paste0(adsl_colnames, "_CENTERED")

# Estimate weights based only on means
weighted_data <- estimate_weights(
  data = centered_ipd_twt,
  centered_colnames = centered_colnames
)

# Aggregate summary data for AGD (ONLY MEANS, NO SDs)
agd <- data.frame(
  AGE_MEAN = 51,
  SEX_MALE_PROP = 147 / 300,
  ECOG0_PROP = 0.40,
  SMOKE_PROP = 58 / (300 - 5),
  N_PR_THER_MEDIAN = 2
)

# Binary AGD summary data
binary_agd <- data.frame(
  ARM = rep(c("B", "C"), each = 2),
  RESPONSE = rep(c("YES", "NO"), 2),
  COUNT = c(280, 200, 120, 200)
)

# Generate pseudo-IPD from binary AGD
pseudo_adrs <- get_pseudo_ipd_binary(
  binary_agd = binary_agd,
  format = "stacked"
)

# Perform MAIC analysis (matching only on means)
res_maicplus <- maic_anchored(
  weights_object = weighted_data,
  ipd = adrs_twt,
  pseudo_ipd = pseudo_adrs,
  trt_ipd = "A",
  trt_agd = "B",
  trt_common = "C",
  normalize_weight = FALSE,
  endpoint_type = "binary",
  endpoint_name = "Binary Endpoint",
  eff_measure = "OR",
  binary_robust_cov_type = "HC3"
)


res_maicplus
#> $descriptive
#> $descriptive$summary
#>   trt_ind treatment                 type        n   events events_pct
#> 1       C         C IPD, before matching 500.0000 338.0000   67.60000
#> 2       A         A IPD, before matching 500.0000 390.0000   78.00000
#> 3       C         C  IPD, after matching 199.4265 134.0364   67.21094
#> 4       A         A  IPD, after matching 199.4265 142.8968   71.65386
#> 5       C         C        AgD, external 320.0000 120.0000   37.50000
#> 6       B         B        AgD, external 480.0000 280.0000   58.33333
#> 
#> 
#> $inferential
#> $inferential$fit
#> $inferential$fit$model_before_ipd
#> 
#> Call:  glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link), 
#>     data = ipd)
#> 
#> Coefficients:
#> (Intercept)         ARMA  
#>      0.7354       0.5302  
#> 
#> Degrees of Freedom: 999 Total (i.e. Null);  998 Residual
#> Null Deviance:       1170 
#> Residual Deviance: 1157  AIC: 1161
#> 
#> $inferential$fit$model_after_ipd
#> 
#> Call:  glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link), 
#>     data = ipd, weights = weights)
#> 
#> Coefficients:
#> (Intercept)         ARMA  
#>      0.7177       0.2096  
#> 
#> Degrees of Freedom: 999 Total (i.e. Null);  998 Residual
#> Null Deviance:       491.1 
#> Residual Deviance: 490.1     AIC: 458.4
#> 
#> $inferential$fit$model_agd
#> 
#> Call:  glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link), 
#>     data = pseudo_ipd)
#> 
#> Coefficients:
#> (Intercept)         ARMB  
#>     -0.5108       0.8473  
#> 
#> Degrees of Freedom: 799 Total (i.e. Null);  798 Residual
#> Null Deviance:       1109 
#> Residual Deviance: 1075  AIC: 1079
#> 
#> $inferential$fit$res_AC
#> $inferential$fit$res_AC$est
#> [1] 1.233204
#> 
#> $inferential$fit$res_AC$se
#> [1] 0.3085377
#> 
#> $inferential$fit$res_AC$ci_l
#> [1] 0.7710134
#> 
#> $inferential$fit$res_AC$ci_u
#> [1] 1.972458
#> 
#> $inferential$fit$res_AC$pval
#> [1] 0.3817109
#> 
#> 
#> $inferential$fit$res_AC_unadj
#> $inferential$fit$res_AC_unadj$est
#> [1] 1.699301
#> 
#> $inferential$fit$res_AC_unadj$se
#> [1] 0.2488482
#> 
#> $inferential$fit$res_AC_unadj$ci_l
#> [1] 1.280998
#> 
#> $inferential$fit$res_AC_unadj$ci_u
#> [1] 2.254199
#> 
#> $inferential$fit$res_AC_unadj$pval
#> [1] 0.0002354448
#> 
#> 
#> $inferential$fit$res_BC
#> $inferential$fit$res_BC$est
#> [1] 2.333333
#> 
#> $inferential$fit$res_BC$se
#> [1] 0.3510631
#> 
#> $inferential$fit$res_BC$ci_l
#> [1] 1.745809
#> 
#> $inferential$fit$res_BC$ci_u
#> [1] 3.118579
#> 
#> $inferential$fit$res_BC$pval
#> [1] 1.035032e-08
#> 
#> 
#> $inferential$fit$res_AB
#>             result             pvalue 
#> "0.53[0.30; 0.92]"            "0.024" 
#> 
#> $inferential$fit$res_AB_unadj
#>             result             pvalue 
#> "0.73[0.49; 1.09]"            "0.125" 
#> 
#> $inferential$fit$boot_res
#> NULL
#> 
#> $inferential$fit$boot_res_AC
#> NULL
#> 
#> $inferential$fit$boot_res_AB_mc
#> NULL
#> 
#> $inferential$fit$boot_res_AB
#> NULL
#> 
#> 
#> $inferential$summary
#>          case        OR       LCL       UCL         pval
#> 1          AC 1.6993007 1.2809976 2.2541985 2.354448e-04
#> 2 adjusted_AC 1.2332036 0.7710134 1.9724576 3.817109e-01
#> 3          BC 2.3333333 1.7458092 3.1185794 1.035032e-08
#> 4          AB 0.7282717 0.4857575 1.0918611 1.248769e-01
#> 5 adjusted_AB 0.5285158 0.3043103 0.9179084 2.356848e-02
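
We now prepare the equivalent inputs for outstandR(). The quantity of interest for the comparison is the MAIC-adjusted AB odds ratio above, 0.53 (95% CI 0.30 to 0.92).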
library(dplyr, verbose = FALSE, quietly = TRUE)
library(glue, verbose = FALSE, quietly = TRUE)

##TODO: why has this got median in name here?
adsl_colnames[adsl_colnames == "N_PR_THER_MEDIAN"] <- "N_PR_THER"

lin_form <- as.formula(glue("y ~ trt * ({paste(adsl_colnames, collapse = ' + ')})"))

ipd_trial <- adsl_twt |>
  merge(adrs_twt) |> 
  rename(trt = ARM,
         y = RESPONSE) |> 
  mutate(AGE_SQUARED = AGE^2)

# sd from IPD
agd_sd <- ipd_trial |> 
  summarise(across(c(AGE, AGE_SQUARED, SMOKE, ECOG0, N_PR_THER, SEX_MALE),
                   .fns = sd, .names = "sd.{.col}"))

# transform names to the format needed by outstandR:
# mean.* and y.*
# note that we're not going to use the SDs in matching
ald_trial <- agd |> 
  rename(mean.AGE = AGE_MEAN,
         # sd.AGE = AGE_SD,
         mean.SEX_MALE = SEX_MALE_PROP,
         mean.ECOG0 = ECOG0_PROP,
         mean.SMOKE = SMOKE_PROP,
         mean.N_PR_THER = N_PR_THER_MEDIAN) |> 
  mutate(
    # mean.AGE_SQUARED = sd.AGE^2 + mean.AGE^2,
    # mean.AGE_SQUARED = mean.AGE^2,                 ##TODO: this is a fudge
    # sd.SEX_MALE = sqrt(mean.SEX_MALE * (1 - mean.SEX_MALE)), 
    # sd.ECOG0 = sqrt(mean.ECOG0 * (1 - mean.ECOG0)), 
    # sd.SMOKE = sqrt(mean.SMOKE * (1 - mean.SMOKE)), 
    # outcomes
    N.B = 480, 
    y.B.sum = binary_agd[binary_agd$ARM == "B" & binary_agd$RESPONSE == "YES", "COUNT"],
    y.B.bar = y.B.sum/N.B,
    N.C = 320, 
    y.C.sum = binary_agd[binary_agd$ARM == "C" & binary_agd$RESPONSE == "YES", "COUNT"],
    y.C.bar = y.C.sum/N.C) |> 
    cbind(agd_sd) |> 
  relocate(N.B, y.B.sum, y.B.bar, N.C, y.C.sum, y.C.bar, .after = sd.SEX_MALE)

ald_trial
#>   mean.AGE mean.SEX_MALE mean.ECOG0 mean.SMOKE mean.N_PR_THER   sd.AGE
#> 1       51          0.49        0.4  0.1966102              2 9.006409
#>   sd.AGE_SQUARED  sd.SMOKE  sd.ECOG0 sd.N_PR_THER sd.SEX_MALE N.B y.B.sum
#> 1       1085.096 0.4667096 0.4913302     1.100505   0.4866013 480     280
#>     y.B.bar N.C y.C.sum y.C.bar
#> 1 0.5833333 320     120   0.375

Now that we’ve prepared the input data, we’re ready to run the model.

##TODO: fix bug of large value in optim()
outstandR_maic <-
  outstandR(ipd_trial = ipd_trial,
            ald_trial = ald_trial,
            strategy = strategy_maic(formula = lin_form,
                                     family = binomial(link = "logit")))

maicChecks

We now follow the examples from the {maicChecks} package.

##TODO:
library(maicChecks)

For the G-formula, we will compare with the output from the packages {gFormulaMI} and {gfoRmula}.

gfoRmula

##TODO:
library(gfoRmula)

gFormulaMI