Skip to contents

There are already a few widely-used R packages in CRAN to perform model-based standardization, such as {marginaleffects} and {stdReg}/{StdReg2}.

marginaleffects

##TODO: see unit tests
library(marginaleffects)

stdReg[2]

{StdReg} has been rewritten to produce {StdReg2} which has “nicer output, more available methods, the possibility to include new methods, and mainly to make maintenance and updating easier.” So, for this reason, we will use {StdReg2}.

We shall follow the article Estimation of causal effects using stdReg2. This is for continuous outcomes (change in weight) and includes transformed covariates. Note that the regression has transformed covariates using the I() function. Two outcome scales are calculated, risk difference and risk ratio. The data also contains categorical variable with more than 2 levels, education, smokeintensity, exercise and active.

library(stdReg2)
library(causaldata)  # contains data set

# load data set
nhefs_dat <- causaldata::nhefs_complete

m <- glm(wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
           smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
         data = nhefs_dat)
summary(m)
#> 
#> Call:
#> glm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) + 
#>     as.factor(education) + smokeintensity + I(smokeintensity^2) + 
#>     smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + 
#>     wt71 + I(wt71^2), data = nhefs_dat)
#> 
#> Coefficients:
#>                         Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)           -1.6586176  4.3137734  -0.384 0.700666    
#> qsmk                   3.4626218  0.4384543   7.897 5.36e-15 ***
#> sex1                  -1.4650496  0.4683410  -3.128 0.001792 ** 
#> race1                  0.5864117  0.5816949   1.008 0.313560    
#> age                    0.3626624  0.1633431   2.220 0.026546 *  
#> I(age^2)              -0.0061377  0.0017263  -3.555 0.000389 ***
#> as.factor(education)2  0.8185263  0.6067815   1.349 0.177546    
#> as.factor(education)3  0.5715004  0.5561211   1.028 0.304273    
#> as.factor(education)4  1.5085173  0.8323778   1.812 0.070134 .  
#> as.factor(education)5 -0.1708264  0.7413289  -0.230 0.817786    
#> smokeintensity         0.0651533  0.0503115   1.295 0.195514    
#> I(smokeintensity^2)   -0.0010468  0.0009373  -1.117 0.264261    
#> smokeyrs               0.1333931  0.0917319   1.454 0.146104    
#> I(smokeyrs^2)         -0.0018270  0.0015438  -1.183 0.236818    
#> as.factor(exercise)1   0.3206824  0.5349616   0.599 0.548961    
#> as.factor(exercise)2   0.3628786  0.5589557   0.649 0.516300    
#> as.factor(active)1    -0.9429574  0.4100208  -2.300 0.021593 *  
#> as.factor(active)2    -0.2580374  0.6847219  -0.377 0.706337    
#> wt71                   0.0373642  0.0831658   0.449 0.653297    
#> I(wt71^2)             -0.0009158  0.0005235  -1.749 0.080426 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for gaussian family taken to be 53.59474)
#> 
#>     Null deviance: 97176  on 1565  degrees of freedom
#> Residual deviance: 82857  on 1546  degrees of freedom
#> AIC: 10701
#> 
#> Number of Fisher Scoring iterations: 2

res_stdReg2 <- standardize_glm(
  wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
    smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2),
  data = nhefs_dat, 
  values = list(qsmk = c(0,1)),
  contrasts = c("difference", "ratio"),
  reference = 0)

res_stdReg2
#> Outcome formula: wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + 
#>     smokeintensity + I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
#>     as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2)
#> Outcome family: gaussian 
#> Outcome link function: identity 
#> Exposure:  qsmk 
#> 
#> Tables: 
#>   qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1    0     1.75     0.217       1.32       2.17
#> 2    1     5.21     0.420       4.39       6.03
#> 
#> Reference level:  qsmk = 0 
#> Contrast:  difference 
#>   qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1    0     0.00     0.000       0.00       0.00
#> 2    1     3.46     0.466       2.55       4.38
#> 
#> Reference level:  qsmk = 0 
#> Contrast:  ratio 
#>   qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1    0     1.00     0.000       1.00       1.00
#> 2    1     2.98     0.435       2.13       3.83

Most simply, we can drop the variables with more than 2 levels and run outstandR().

library(dplyr)

nhefs_ipd <- nhefs_dat |> 
  # drop categorical variables
  select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71, wt82_71) |>
  rename(trt = qsmk,
         y = wt82_71) |> 
  mutate(sex = as.numeric(sex) - 1,
         race = as.numeric(race) - 1)

# create aggregate data

nhefs.X <- nhefs_ipd %>%
  summarise(across(-c(trt, y),
                   list(mean = mean, sd = sd),
                   .names = "{fn}.{col}"))

nhefs.B <- dplyr::filter(nhefs_ipd, trt == 1) |> 
  summarise(y.B.sum = sum(y),
            y.B.bar = mean(y),
            y.B.sd = sd(y),
            N.B = n())

nhefs.C <- dplyr::filter(nhefs_ipd, trt == 0) |> 
  summarise(y.C.sum = sum(y),
            y.C.bar = mean(y),
            y.C.sd = sd(y),
            N.C = n())

nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)

# simplified formula
lin_form <-
  as.formula(y ~ trt * (sex + race + age + I(age^2) +
                          smokeintensity + I(smokeintensity^2) +
                          smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <- 
  strategy_gcomp_ml(formula = lin_form,
                    family = gaussian(link = "identity"))

res <- outstandR(BC.ALD = nhefs_ald,
                 AC.IPD = nhefs_ipd,
                 strategy = nhefs_strat,
                 scale = "log_relative_risk")
res
#> Object of class 'outstandR' 
#> Scale: log_relative_risk 
#> Common treatment: C 
#> Individual patient data study: AC 
#> Aggregate level data study: BC 
#> Confidence interval level: 0.95 
#> 
#> # A tibble: 3 × 5
#>   Treatments Estimate Std.Error lower.0.95 upper.0.95
#>   <chr>         <dbl>     <dbl>      <dbl>      <dbl>
#> 1 AB            0.241    2.21       -2.68        3.16
#> 2 AC            1.07     0.0198      0.790       1.34
#> 3 BC            0.824    2.20       -2.08        3.73

Alternatively, rather than dropping factors, we can create dummy variables to replace them using maicplus::dummize_ipd().


nhefs_ipd <- nhefs_dat |> 
  select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71,
         wt82_71, education, exercise, active) |> 
  rename(trt = qsmk,
         y = wt82_71) |> 
  mutate(sex = as.numeric(sex) - 1,       # binary
         race = as.numeric(race) - 1) |> 
  maicplus::dummize_ipd(dummize_cols = c("education", "exercise", "active"),
                        dummize_ref_level = c("1","0","0")) |> 
  select(-education, -exercise, -active)  # remove originals

# # replace with this?
# model.matrix()

# create aggregate data

nhefs.X <- nhefs_ipd %>%
  summarise(across(-c(trt, y),
                   list(mean = mean, sd = sd),
                   .names = "{fn}.{col}"))

nhefs.B <- dplyr::filter(nhefs_ipd, trt == 1) |> 
  summarise(y.B.sum = sum(y),
            y.B.bar = mean(y),
            y.B.sd = sd(y),
            N.B = n())

nhefs.C <- dplyr::filter(nhefs_ipd, trt == 0) |> 
  summarise(y.C.sum = sum(y),
            y.C.bar = mean(y),
            y.C.sd = sd(y),
            N.C = n())

nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)

lin_form <-
  as.formula(y ~ trt * (sex + race + age + I(age^2) +
                          smokeintensity + I(smokeintensity^2) +
                          EDUCATION_2 + EDUCATION_3 + EDUCATION_4 + EDUCATION_5 +
                          EXERCISE_1 + EXERCISE_2 + ACTIVE_1 + ACTIVE_2 +
                          smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <- 
  strategy_gcomp_ml(formula = lin_form,
                    family = gaussian(link = "identity"))

res <- outstandR(BC.ALD = nhefs_ald,
                 AC.IPD = nhefs_ipd,
                 strategy = nhefs_strat,
                 scale = "risk_difference")
res
#> Object of class 'outstandR' 
#> Scale: risk_difference 
#> Common treatment: C 
#> Individual patient data study: AC 
#> Aggregate level data study: BC 
#> Confidence interval level: 0.95 
#> 
#> # A tibble: 3 × 5
#>   Treatments Estimate Std.Error lower.0.95 upper.0.95
#>   <chr>         <dbl>     <dbl>      <dbl>      <dbl>
#> 1 AB             1.05     0.473     -0.295       2.40
#> 2 AC             3.59     0.235      2.64        4.54
#> 3 BC             2.54     0.238      1.59        3.50

The 2024 ISPOR poster by Hatswell presents a comparison of different MAIC R packages. We shall use this as our starting point. Of the packages considered, {MAIC} is not on CRAN so not included in our analysis at the moment.

maicplus

Roche developed the maic package which is now deprecated and replaced by maicplus. We will follow the article titled Anchored Binary Analysis to make our comparison.

library(maicplus)

# Load required data
data(centered_ipd_twt)
data(adrs_twt)
data("adsl_sat")  # Original IPD

# Load required package
library(dplyr)

# Prepare matching variables (only using means, ignoring SDs)
adsl_sat <- adsl_sat |> 
  mutate(
    SEX_MALE = as.integer(SEX == "Male"),
    AGE_SQUARED = AGE^2
  )

# Define variable names for matching (excluding SDs)
adsl_colnames <- c("AGE", "AGE_SQUARED", "SEX_MALE", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN")
centered_colnames <- paste0(adsl_colnames, "_CENTERED")

# Estimate weights based only on means
weighted_data <- estimate_weights(
  data = centered_ipd_twt,
  centered_colnames = centered_colnames
)

# Aggregate summary data for AGD (ONLY MEANS, NO SDs)
agd <- data.frame(
  AGE_MEAN = 51,
  SEX_MALE_PROP = 147 / 300,
  ECOG0_PROP = 0.40,
  SMOKE_PROP = 58 / (300 - 5),
  N_PR_THER_MEDIAN = 2
)

# Binary AGD summary data
binary_agd <- data.frame(
  ARM = rep(c("B", "C"), each = 2),
  RESPONSE = rep(c("YES", "NO"), 2),
  COUNT = c(280, 200, 120, 200)
)

# Generate pseudo-IPD from binary AGD
pseudo_adrs <- get_pseudo_ipd_binary(
  binary_agd = binary_agd,
  format = "stacked"
)

# Perform MAIC analysis (matching only on means)
res_maicplus <- maic_anchored(
  weights_object = weighted_data,
  ipd = adrs_twt,
  pseudo_ipd = pseudo_adrs,
  trt_ipd = "A",
  trt_agd = "B",
  trt_common = "C",
  normalize_weight = FALSE,
  endpoint_type = "binary",
  endpoint_name = "Binary Endpoint",
  eff_measure = "OR",
  binary_robust_cov_type = "HC3"
)


res_maicplus
#> $descriptive
#> $descriptive$summary
#>   trt_ind treatment                 type        n   events events_pct
#> 1       C         C IPD, before matching 500.0000 338.0000   67.60000
#> 2       A         A IPD, before matching 500.0000 390.0000   78.00000
#> 3       C         C  IPD, after matching 199.4265 134.0364   67.21094
#> 4       A         A  IPD, after matching 199.4265 142.8968   71.65386
#> 5       C         C        AgD, external 320.0000 120.0000   37.50000
#> 6       B         B        AgD, external 480.0000 280.0000   58.33333
#> 
#> 
#> $inferential
#> $inferential$fit
#> $inferential$fit$model_before_ipd
#> 
#> Call:  glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link), 
#>     data = ipd)
#> 
#> Coefficients:
#> (Intercept)         ARMA  
#>      0.7354       0.5302  
#> 
#> Degrees of Freedom: 999 Total (i.e. Null);  998 Residual
#> Null Deviance:       1170 
#> Residual Deviance: 1157  AIC: 1161
#> 
#> $inferential$fit$model_after_ipd
#> 
#> Call:  glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link), 
#>     data = ipd, weights = weights)
#> 
#> Coefficients:
#> (Intercept)         ARMA  
#>      0.7177       0.2096  
#> 
#> Degrees of Freedom: 999 Total (i.e. Null);  998 Residual
#> Null Deviance:       491.1 
#> Residual Deviance: 490.1     AIC: 458.4
#> 
#> $inferential$fit$model_agd
#> 
#> Call:  glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link), 
#>     data = pseudo_ipd)
#> 
#> Coefficients:
#> (Intercept)         ARMB  
#>     -0.5108       0.8473  
#> 
#> Degrees of Freedom: 799 Total (i.e. Null);  798 Residual
#> Null Deviance:       1109 
#> Residual Deviance: 1075  AIC: 1079
#> 
#> $inferential$fit$res_AC
#> $inferential$fit$res_AC$est
#> [1] 1.233204
#> 
#> $inferential$fit$res_AC$se
#> [1] 0.3085377
#> 
#> $inferential$fit$res_AC$ci_l
#> [1] 0.7710134
#> 
#> $inferential$fit$res_AC$ci_u
#> [1] 1.972458
#> 
#> $inferential$fit$res_AC$pval
#> [1] 0.3817109
#> 
#> 
#> $inferential$fit$res_AC_unadj
#> $inferential$fit$res_AC_unadj$est
#> [1] 1.699301
#> 
#> $inferential$fit$res_AC_unadj$se
#> [1] 0.2488482
#> 
#> $inferential$fit$res_AC_unadj$ci_l
#> [1] 1.280998
#> 
#> $inferential$fit$res_AC_unadj$ci_u
#> [1] 2.254199
#> 
#> $inferential$fit$res_AC_unadj$pval
#> [1] 0.0002354448
#> 
#> 
#> $inferential$fit$res_BC
#> $inferential$fit$res_BC$est
#> [1] 2.333333
#> 
#> $inferential$fit$res_BC$se
#> [1] 0.3510631
#> 
#> $inferential$fit$res_BC$ci_l
#> [1] 1.745809
#> 
#> $inferential$fit$res_BC$ci_u
#> [1] 3.118579
#> 
#> $inferential$fit$res_BC$pval
#> [1] 1.035032e-08
#> 
#> 
#> $inferential$fit$res_AB
#>             result             pvalue 
#> "0.53[0.30; 0.92]"            "0.024" 
#> 
#> $inferential$fit$res_AB_unadj
#>             result             pvalue 
#> "0.73[0.49; 1.09]"            "0.125" 
#> 
#> $inferential$fit$boot_res
#> NULL
#> 
#> $inferential$fit$boot_res_AC
#> NULL
#> 
#> $inferential$fit$boot_res_AB_mc
#> NULL
#> 
#> $inferential$fit$boot_res_AB
#> NULL
#> 
#> 
#> $inferential$summary
#>          case        OR       LCL       UCL         pval
#> 1          AC 1.6993007 1.2809976 2.2541985 2.354448e-04
#> 2 adjusted_AC 1.2332036 0.7710134 1.9724576 3.817109e-01
#> 3          BC 2.3333333 1.7458092 3.1185794 1.035032e-08
#> 4          AB 0.7282717 0.4857575 1.0918611 1.248769e-01
#> 5 adjusted_AB 0.5285158 0.3043103 0.9179084 2.356848e-02
library(dplyr, verbose = FALSE, quietly = TRUE)
library(glue, verbose = FALSE, quietly = TRUE)

##TODO: why has this got median in name here?
adsl_colnames[adsl_colnames == "N_PR_THER_MEDIAN"] <- "N_PR_THER"

lin_form <- as.formula(glue("y ~ trt * ({paste(adsl_colnames, collapse = ' + ')})"))

AC.IPD <- adsl_twt |>
  merge(adrs_twt) |> 
  rename(trt = ARM,
         y = RESPONSE) |> 
  mutate(AGE_SQUARED = AGE^2)

# transform names to format needed in outstandR
# mean.* and y.*
# note that we're nto going to use sd in matching
BC.ALD <- agd |> 
  rename(mean.AGE = AGE_MEAN,
         # sd.AGE = AGE_SD,
         mean.SEX_MALE = SEX_MALE_PROP,
         mean.ECOG0 = ECOG0_PROP,
         mean.SMOKE = SMOKE_PROP,
         mean.N_PR_THER = N_PR_THER_MEDIAN) |> 
  mutate(
    # mean.AGE_SQUARED = sd.AGE^2 + mean.AGE^2,
    mean.AGE_SQUARED = mean.AGE^2,                 ##TODO: this is a fudge
    # sd.SEX_MALE = sqrt(mean.SEX_MALE * (1 - mean.SEX_MALE)), 
    # sd.ECOG0 = sqrt(mean.ECOG0 * (1 - mean.ECOG0)), 
    # sd.SMOKE = sqrt(mean.SMOKE * (1 - mean.SMOKE)), 
    # outcomes
    N.B = 480, 
    y.B.sum = binary_agd[binary_agd$ARM == "B" & binary_agd$RESPONSE == "YES", "COUNT"],
    y.B.bar = y.B.sum/N.B,
    N.C = 320, 
    y.C.sum = binary_agd[binary_agd$ARM == "C" & binary_agd$RESPONSE == "YES", "COUNT"],
    y.C.bar = y.C.sum/N.C)

BC.ALD
#>   mean.AGE mean.SEX_MALE mean.ECOG0 mean.SMOKE mean.N_PR_THER mean.AGE_SQUARED
#> 1       51          0.49        0.4  0.1966102              2             2601
#>   N.B y.B.sum   y.B.bar N.C y.C.sum y.C.bar
#> 1 480     280 0.5833333 320     120   0.375

Now that we’ve prepared the input data we’re ready to run the model.

##TODO: fix bug
outstandR_maic <-
  outstandR(AC.IPD, BC.ALD,
            strategy = strategy_maic(formula = lin_form,
                                     family = binomial(link = "logit")))

maicChecks

We now follow the examples for the package maicChecks.

##TODO:
library(maicChecks)

For G-formula we will compare with the output from the packages gFormulaMI and gfoRmula.

gfoRmula

##TODO:
library(gfoRmula)

gFormulaMI