Comparison with other packages
Source: vignettes/comparison-with-other-packages.Rmd
There are already a few widely-used R packages on CRAN to perform model-based standardization, such as {marginaleffects} and {stdReg}/{stdReg2}.
marginaleffects
##TODO: see unit tests
library(marginaleffects)
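A full comparison is still to be written. In the meantime, below is a minimal sketch of how a standardized (G-computation) treatment effect could be obtained with avg_comparisons(); the reduced covariate set is our own choice, and the NHEFS data are the same as those used in the {stdReg2} section below.
library(causaldata) # contains the NHEFS data set
nhefs_dat <- causaldata::nhefs_complete
# outcome model; a reduced covariate set is used to keep the sketch short
fit <- glm(wt82_71 ~ qsmk + sex + race + age + I(age^2) + wt71 + I(wt71^2),
           data = nhefs_dat)
# model-based standardization: average difference in predicted weight change,
# contrasting qsmk = 1 vs qsmk = 0 over the observed covariate distribution
avg_comparisons(fit, variables = list(qsmk = c(0, 1)))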
stdReg2
{stdReg} has been rewritten to produce {stdReg2}, which has “nicer output, more available methods, the possibility to include new methods, and mainly to make maintenance and updating easier.” For this reason, we will use {stdReg2}.
We shall follow the article Estimation of causal effects using stdReg2. This example has a continuous outcome (change in weight) and the regression includes covariates transformed using the I() function. Two contrasts are calculated: difference and ratio. The data also contain categorical variables with more than two levels: education, exercise and active.
library(stdReg2)
library(causaldata) # contains data set
# load data set
nhefs_dat <- causaldata::nhefs_complete
m <- glm(wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2),
data = nhefs_dat)
summary(m)
#>
#> Call:
#> glm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) +
#> as.factor(education) + smokeintensity + I(smokeintensity^2) +
#> smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) +
#> wt71 + I(wt71^2), data = nhefs_dat)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.6586176 4.3137734 -0.384 0.700666
#> qsmk 3.4626218 0.4384543 7.897 5.36e-15 ***
#> sex1 -1.4650496 0.4683410 -3.128 0.001792 **
#> race1 0.5864117 0.5816949 1.008 0.313560
#> age 0.3626624 0.1633431 2.220 0.026546 *
#> I(age^2) -0.0061377 0.0017263 -3.555 0.000389 ***
#> as.factor(education)2 0.8185263 0.6067815 1.349 0.177546
#> as.factor(education)3 0.5715004 0.5561211 1.028 0.304273
#> as.factor(education)4 1.5085173 0.8323778 1.812 0.070134 .
#> as.factor(education)5 -0.1708264 0.7413289 -0.230 0.817786
#> smokeintensity 0.0651533 0.0503115 1.295 0.195514
#> I(smokeintensity^2) -0.0010468 0.0009373 -1.117 0.264261
#> smokeyrs 0.1333931 0.0917319 1.454 0.146104
#> I(smokeyrs^2) -0.0018270 0.0015438 -1.183 0.236818
#> as.factor(exercise)1 0.3206824 0.5349616 0.599 0.548961
#> as.factor(exercise)2 0.3628786 0.5589557 0.649 0.516300
#> as.factor(active)1 -0.9429574 0.4100208 -2.300 0.021593 *
#> as.factor(active)2 -0.2580374 0.6847219 -0.377 0.706337
#> wt71 0.0373642 0.0831658 0.449 0.653297
#> I(wt71^2) -0.0009158 0.0005235 -1.749 0.080426 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 53.59474)
#>
#> Null deviance: 97176 on 1565 degrees of freedom
#> Residual deviance: 82857 on 1546 degrees of freedom
#> AIC: 10701
#>
#> Number of Fisher Scoring iterations: 2
res_stdReg2 <- standardize_glm(
wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2),
data = nhefs_dat,
values = list(qsmk = c(0,1)),
contrasts = c("difference", "ratio"),
reference = 0)
res_stdReg2
#> Outcome formula: wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) +
#> smokeintensity + I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) +
#> as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2)
#> Outcome family: gaussian
#> Outcome link function: identity
#> Exposure: qsmk
#>
#> Tables:
#> qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1 0 1.75 0.217 1.32 2.17
#> 2 1 5.21 0.420 4.39 6.03
#>
#> Reference level: qsmk = 0
#> Contrast: difference
#> qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1 0 0.00 0.000 0.00 0.00
#> 2 1 3.46 0.466 2.55 4.38
#>
#> Reference level: qsmk = 0
#> Contrast: ratio
#> qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1 0 1.00 0.000 1.00 1.00
#> 2 1 2.98 0.435 2.13 3.83
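As a check on what standardize_glm() is doing, the standardized means and contrasts can be reproduced by hand from the fitted model m: set the exposure to each level for every individual, predict, and average. This sketch is our own addition rather than part of the stdReg2 article; the point estimates should match the tables above.
nd0 <- nd1 <- nhefs_dat
nd0$qsmk <- 0
nd1$qsmk <- 1
mu0 <- mean(predict(m, newdata = nd0, type = "response")) # standardized mean, qsmk = 0
mu1 <- mean(predict(m, newdata = nd1, type = "response")) # standardized mean, qsmk = 1
c(qsmk0 = mu0, qsmk1 = mu1, difference = mu1 - mu0, ratio = mu1 / mu0)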
Most simply, we can drop the variables with more than two levels and run outstandR().
library(outstandR)
library(dplyr)
nhefs_ipd <- nhefs_dat |>
# drop categorical variables
select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71, wt82_71) |>
rename(trt = qsmk,
y = wt82_71) |>
mutate(sex = as.numeric(sex) - 1,
race = as.numeric(race) - 1)
# create aggregate data
nhefs.X <- nhefs_ipd %>%
summarise(across(-c(trt, y),
list(mean = mean, sd = sd),
.names = "{fn}.{col}"))
nhefs.B <- dplyr::filter(nhefs_ipd, trt == 1) |>
summarise(y.B.sum = sum(y),
y.B.bar = mean(y),
y.B.sd = sd(y),
N.B = n())
nhefs.C <- dplyr::filter(nhefs_ipd, trt == 0) |>
summarise(y.C.sum = sum(y),
y.C.bar = mean(y),
y.C.sd = sd(y),
N.C = n())
nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)
# simplified formula
lin_form <-
as.formula(y ~ trt * (sex + race + age + I(age^2) +
smokeintensity + I(smokeintensity^2) +
smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <-
strategy_gcomp_ml(formula = lin_form,
family = gaussian(link = "identity"))
res <- outstandR(BC.ALD = nhefs_ald,
AC.IPD = nhefs_ipd,
strategy = nhefs_strat,
scale = "log_relative_risk")
res
#> Object of class 'outstandR'
#> Scale: log_relative_risk
#> Common treatment: C
#> Individual patient data study: AC
#> Aggregate level data study: BC
#> Confidence interval level: 0.95
#>
#> # A tibble: 3 × 5
#> Treatments Estimate Std.Error lower.0.95 upper.0.95
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 AB 0.241 2.21 -2.68 3.16
#> 2 AC 1.07 0.0198 0.790 1.34
#> 3 BC 0.824 2.20 -2.08 3.73
Alternatively, rather than dropping factors, we can create dummy variables to replace them using maicplus::dummize_ipd().
nhefs_ipd <- nhefs_dat |>
select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71,
wt82_71, education, exercise, active) |>
rename(trt = qsmk,
y = wt82_71) |>
mutate(sex = as.numeric(sex) - 1, # binary
race = as.numeric(race) - 1) |>
maicplus::dummize_ipd(dummize_cols = c("education", "exercise", "active"),
dummize_ref_level = c("1","0","0")) |>
select(-education, -exercise, -active) # remove originals
# # replace with this?
# model.matrix()
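# For instance (an untested sketch): build the dummy columns with
# stats::model.matrix(), drop the intercept column, and bind them on; the
# resulting column names would then need renaming to match the formula below.
# dummies <- model.matrix(~ as.factor(education) + as.factor(exercise) + as.factor(active),
#                         data = nhefs_dat)[, -1]
# nhefs_ipd <- cbind(nhefs_ipd, dummies)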
# create aggregate data
nhefs.X <- nhefs_ipd %>%
summarise(across(-c(trt, y),
list(mean = mean, sd = sd),
.names = "{fn}.{col}"))
nhefs.B <- dplyr::filter(nhefs_ipd, trt == 1) |>
summarise(y.B.sum = sum(y),
y.B.bar = mean(y),
y.B.sd = sd(y),
N.B = n())
nhefs.C <- dplyr::filter(nhefs_ipd, trt == 0) |>
summarise(y.C.sum = sum(y),
y.C.bar = mean(y),
y.C.sd = sd(y),
N.C = n())
nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)
lin_form <-
as.formula(y ~ trt * (sex + race + age + I(age^2) +
smokeintensity + I(smokeintensity^2) +
EDUCATION_2 + EDUCATION_3 + EDUCATION_4 + EDUCATION_5 +
EXERCISE_1 + EXERCISE_2 + ACTIVE_1 + ACTIVE_2 +
smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <-
strategy_gcomp_ml(formula = lin_form,
family = gaussian(link = "identity"))
res <- outstandR(BC.ALD = nhefs_ald,
AC.IPD = nhefs_ipd,
strategy = nhefs_strat,
scale = "risk_difference")
res
#> Object of class 'outstandR'
#> Scale: risk_difference
#> Common treatment: C
#> Individual patient data study: AC
#> Aggregate level data study: BC
#> Confidence interval level: 0.95
#>
#> # A tibble: 3 × 5
#> Treatments Estimate Std.Error lower.0.95 upper.0.95
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 AB 1.05 0.473 -0.295 2.40
#> 2 AC 3.59 0.235 2.64 4.54
#> 3 BC 2.54 0.238 1.59 3.50
The 2024 ISPOR poster by Hatswell presents a comparison of different MAIC R packages, which we shall use as our starting point. Of the packages considered, {MAIC} is not on CRAN, so it is not included in our analysis at the moment.
maicplus
Roche developed the {maic} package, which is now deprecated and replaced by {maicplus}. We will follow the article titled Anchored Binary Analysis to make our comparison.
library(maicplus)
# Load required data
data(centered_ipd_twt)
data(adsl_twt) # two-arm trial IPD, used in the outstandR comparison below
data(adrs_twt)
data("adsl_sat") # Original IPD
# Load required package
library(dplyr)
# Prepare matching variables (only using means, ignoring SDs)
adsl_sat <- adsl_sat |>
mutate(
SEX_MALE = as.integer(SEX == "Male"),
AGE_SQUARED = AGE^2
)
# Define variable names for matching (excluding SDs)
adsl_colnames <- c("AGE", "AGE_SQUARED", "SEX_MALE", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN")
centered_colnames <- paste0(adsl_colnames, "_CENTERED")
# Estimate weights based only on means
weighted_data <- estimate_weights(
data = centered_ipd_twt,
centered_colnames = centered_colnames
)
# Aggregate summary data for AGD (ONLY MEANS, NO SDs)
agd <- data.frame(
AGE_MEAN = 51,
SEX_MALE_PROP = 147 / 300,
ECOG0_PROP = 0.40,
SMOKE_PROP = 58 / (300 - 5),
N_PR_THER_MEDIAN = 2
)
# Binary AGD summary data
binary_agd <- data.frame(
ARM = rep(c("B", "C"), each = 2),
RESPONSE = rep(c("YES", "NO"), 2),
COUNT = c(280, 200, 120, 200)
)
# Generate pseudo-IPD from binary AGD
pseudo_adrs <- get_pseudo_ipd_binary(
binary_agd = binary_agd,
format = "stacked"
)
# Perform MAIC analysis (matching only on means)
res_maicplus <- maic_anchored(
weights_object = weighted_data,
ipd = adrs_twt,
pseudo_ipd = pseudo_adrs,
trt_ipd = "A",
trt_agd = "B",
trt_common = "C",
normalize_weight = FALSE,
endpoint_type = "binary",
endpoint_name = "Binary Endpoint",
eff_measure = "OR",
binary_robust_cov_type = "HC3"
)
res_maicplus
#> $descriptive
#> $descriptive$summary
#> trt_ind treatment type n events events_pct
#> 1 C C IPD, before matching 500.0000 338.0000 67.60000
#> 2 A A IPD, before matching 500.0000 390.0000 78.00000
#> 3 C C IPD, after matching 199.4265 134.0364 67.21094
#> 4 A A IPD, after matching 199.4265 142.8968 71.65386
#> 5 C C AgD, external 320.0000 120.0000 37.50000
#> 6 B B AgD, external 480.0000 280.0000 58.33333
#>
#>
#> $inferential
#> $inferential$fit
#> $inferential$fit$model_before_ipd
#>
#> Call: glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link),
#> data = ipd)
#>
#> Coefficients:
#> (Intercept) ARMA
#> 0.7354 0.5302
#>
#> Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
#> Null Deviance: 1170
#> Residual Deviance: 1157 AIC: 1161
#>
#> $inferential$fit$model_after_ipd
#>
#> Call: glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link),
#> data = ipd, weights = weights)
#>
#> Coefficients:
#> (Intercept) ARMA
#> 0.7177 0.2096
#>
#> Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
#> Null Deviance: 491.1
#> Residual Deviance: 490.1 AIC: 458.4
#>
#> $inferential$fit$model_agd
#>
#> Call: glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link),
#> data = pseudo_ipd)
#>
#> Coefficients:
#> (Intercept) ARMB
#> -0.5108 0.8473
#>
#> Degrees of Freedom: 799 Total (i.e. Null); 798 Residual
#> Null Deviance: 1109
#> Residual Deviance: 1075 AIC: 1079
#>
#> $inferential$fit$res_AC
#> $inferential$fit$res_AC$est
#> [1] 1.233204
#>
#> $inferential$fit$res_AC$se
#> [1] 0.3085377
#>
#> $inferential$fit$res_AC$ci_l
#> [1] 0.7710134
#>
#> $inferential$fit$res_AC$ci_u
#> [1] 1.972458
#>
#> $inferential$fit$res_AC$pval
#> [1] 0.3817109
#>
#>
#> $inferential$fit$res_AC_unadj
#> $inferential$fit$res_AC_unadj$est
#> [1] 1.699301
#>
#> $inferential$fit$res_AC_unadj$se
#> [1] 0.2488482
#>
#> $inferential$fit$res_AC_unadj$ci_l
#> [1] 1.280998
#>
#> $inferential$fit$res_AC_unadj$ci_u
#> [1] 2.254199
#>
#> $inferential$fit$res_AC_unadj$pval
#> [1] 0.0002354448
#>
#>
#> $inferential$fit$res_BC
#> $inferential$fit$res_BC$est
#> [1] 2.333333
#>
#> $inferential$fit$res_BC$se
#> [1] 0.3510631
#>
#> $inferential$fit$res_BC$ci_l
#> [1] 1.745809
#>
#> $inferential$fit$res_BC$ci_u
#> [1] 3.118579
#>
#> $inferential$fit$res_BC$pval
#> [1] 1.035032e-08
#>
#>
#> $inferential$fit$res_AB
#> result pvalue
#> "0.53[0.30; 0.92]" "0.024"
#>
#> $inferential$fit$res_AB_unadj
#> result pvalue
#> "0.73[0.49; 1.09]" "0.125"
#>
#> $inferential$fit$boot_res
#> NULL
#>
#> $inferential$fit$boot_res_AC
#> NULL
#>
#> $inferential$fit$boot_res_AB_mc
#> NULL
#>
#> $inferential$fit$boot_res_AB
#> NULL
#>
#>
#> $inferential$summary
#> case OR LCL UCL pval
#> 1 AC 1.6993007 1.2809976 2.2541985 2.354448e-04
#> 2 adjusted_AC 1.2332036 0.7710134 1.9724576 3.817109e-01
#> 3 BC 2.3333333 1.7458092 3.1185794 1.035032e-08
#> 4 AB 0.7282717 0.4857575 1.0918611 1.248769e-01
#> 5 adjusted_AB 0.5285158 0.3043103 0.9179084 2.356848e-02
library(dplyr, verbose = FALSE, quietly = TRUE)
library(glue, verbose = FALSE, quietly = TRUE)
##TODO: why has this got median in name here?
adsl_colnames[adsl_colnames == "N_PR_THER_MEDIAN"] <- "N_PR_THER"
lin_form <- as.formula(glue("y ~ trt * ({paste(adsl_colnames, collapse = ' + ')})"))
AC.IPD <- adsl_twt |>
  merge(adrs_twt) |>
  rename(trt = ARM,
         y = RESPONSE) |>
  mutate(AGE_SQUARED = AGE^2,
         SEX_MALE = as.integer(SEX == "Male")) # needed by lin_form, as for adsl_sat above
# transform names to format needed in outstandR
# mean.* and y.*
# note that we're not going to use sd in matching
BC.ALD <- agd |>
rename(mean.AGE = AGE_MEAN,
# sd.AGE = AGE_SD,
mean.SEX_MALE = SEX_MALE_PROP,
mean.ECOG0 = ECOG0_PROP,
mean.SMOKE = SMOKE_PROP,
mean.N_PR_THER = N_PR_THER_MEDIAN) |>
mutate(
# mean.AGE_SQUARED = sd.AGE^2 + mean.AGE^2,
mean.AGE_SQUARED = mean.AGE^2, ##TODO: this is a fudge
# sd.SEX_MALE = sqrt(mean.SEX_MALE * (1 - mean.SEX_MALE)),
# sd.ECOG0 = sqrt(mean.ECOG0 * (1 - mean.ECOG0)),
# sd.SMOKE = sqrt(mean.SMOKE * (1 - mean.SMOKE)),
# outcomes
N.B = 480,
y.B.sum = binary_agd[binary_agd$ARM == "B" & binary_agd$RESPONSE == "YES", "COUNT"],
y.B.bar = y.B.sum/N.B,
N.C = 320,
y.C.sum = binary_agd[binary_agd$ARM == "C" & binary_agd$RESPONSE == "YES", "COUNT"],
y.C.bar = y.C.sum/N.C)
BC.ALD
#> mean.AGE mean.SEX_MALE mean.ECOG0 mean.SMOKE mean.N_PR_THER mean.AGE_SQUARED
#> 1 51 0.49 0.4 0.1966102 2 2601
#> N.B y.B.sum y.B.bar N.C y.C.sum y.C.bar
#> 1 480 280 0.5833333 320 120 0.375
Now that we’ve prepared the input data we’re ready to run the model.
##TODO: fix bug
outstandR_maic <-
outstandR(AC.IPD, BC.ALD,
strategy = strategy_maic(formula = lin_form,
family = binomial(link = "logit")))
maicChecks
We now follow the examples for the {maicChecks} package.
##TODO:
library(maicChecks)
For the G-formula, we will compare against the output of the {gFormulaMI} and {gfoRmula} packages.
gFormulaMI
##TODO:
library(gFormulaMI)