Comparison with other packages
Source:vignettes/comparison-with-other-packages.Rmd
comparison-with-other-packages.Rmd
The aim of this document is to compare the results from various other
R packages with those produced using outstandR. There are
already a few widely-used R packages on CRAN to perform model-based
standardization, such as {marginaleffects}
and {stdReg}/{stdReg2}
.
marginaleffects
The marginaleffects package has functions to calculate G-computation estimates.
##TODO: see unit tests
library(marginaleffects)
# marginaleffects::avg_predictions()
# marginaleffects::avg_comparisons()
stdReg[2]
The package {stdReg}
has been rewritten to produce
stdReg2 which has “nicer output, more available methods,
the possibility to include new methods, and mainly to make maintenance
and updating easier.” So, for this reason, we will use
stdReg2.
We shall follow the article Estimation
of causal effects using stdReg2. This is for continuous
outcomes (change in weight) and includes transformed covariates. Note
that the regression has transformed covariates using the
I()
function. Two outcome scales are calculated, risk
difference and risk ratio. The data also contains categorical variable
with more than 2 levels, education
,
smokeintensity
, exercise
and
active
.
library(stdReg2)
library(causaldata) # contains data set
# load data set
nhefs_dat <- causaldata::nhefs_complete
m <- glm(wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2),
data = nhefs_dat)
summary(m)
#>
#> Call:
#> glm(formula = wt82_71 ~ qsmk + sex + race + age + I(age^2) +
#> as.factor(education) + smokeintensity + I(smokeintensity^2) +
#> smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) +
#> wt71 + I(wt71^2), data = nhefs_dat)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.6586176 4.3137734 -0.384 0.700666
#> qsmk 3.4626218 0.4384543 7.897 5.36e-15 ***
#> sex1 -1.4650496 0.4683410 -3.128 0.001792 **
#> race1 0.5864117 0.5816949 1.008 0.313560
#> age 0.3626624 0.1633431 2.220 0.026546 *
#> I(age^2) -0.0061377 0.0017263 -3.555 0.000389 ***
#> as.factor(education)2 0.8185263 0.6067815 1.349 0.177546
#> as.factor(education)3 0.5715004 0.5561211 1.028 0.304273
#> as.factor(education)4 1.5085173 0.8323778 1.812 0.070134 .
#> as.factor(education)5 -0.1708264 0.7413289 -0.230 0.817786
#> smokeintensity 0.0651533 0.0503115 1.295 0.195514
#> I(smokeintensity^2) -0.0010468 0.0009373 -1.117 0.264261
#> smokeyrs 0.1333931 0.0917319 1.454 0.146104
#> I(smokeyrs^2) -0.0018270 0.0015438 -1.183 0.236818
#> as.factor(exercise)1 0.3206824 0.5349616 0.599 0.548961
#> as.factor(exercise)2 0.3628786 0.5589557 0.649 0.516300
#> as.factor(active)1 -0.9429574 0.4100208 -2.300 0.021593 *
#> as.factor(active)2 -0.2580374 0.6847219 -0.377 0.706337
#> wt71 0.0373642 0.0831658 0.449 0.653297
#> I(wt71^2) -0.0009158 0.0005235 -1.749 0.080426 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 53.59474)
#>
#> Null deviance: 97176 on 1565 degrees of freedom
#> Residual deviance: 82857 on 1546 degrees of freedom
#> AIC: 10701
#>
#> Number of Fisher Scoring iterations: 2
res_stdReg2 <- standardize_glm(
wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) +
smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2),
data = nhefs_dat,
values = list(qsmk = c(0,1)),
contrasts = c("difference", "ratio"),
reference = 0)
res_stdReg2
#> Outcome formula: wt82_71 ~ qsmk + sex + race + age + I(age^2) + as.factor(education) +
#> smokeintensity + I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) +
#> as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2)
#> Outcome family: gaussian
#> Outcome link function: identity
#> Exposure: qsmk
#>
#> Tables:
#> qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1 0 1.75 0.217 1.32 2.17
#> 2 1 5.21 0.420 4.39 6.03
#>
#> Reference level: qsmk = 0
#> Contrast: difference
#> qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1 0 0.00 0.000 0.00 0.00
#> 2 1 3.46 0.466 2.55 4.38
#>
#> Reference level: qsmk = 0
#> Contrast: ratio
#> qsmk Estimate Std.Error lower.0.95 upper.0.95
#> 1 0 1.00 0.000 1.00 1.00
#> 2 1 2.98 0.435 2.13 3.83
Most simply, we can drop the variables with more than 2 levels and
run outstandR()
.
library(dplyr)
nhefs_ipd <- nhefs_dat |>
# drop categorical variables
dplyr::select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71, wt82_71) |>
dplyr::rename(trt = qsmk,
y = wt82_71) |>
dplyr::mutate(sex = as.numeric(sex) - 1,
race = as.numeric(race) - 1,
trt = factor(trt, labels = c("C", "A")))
# create aggregate data
nhefs.X <- nhefs_ipd %>%
summarise(across(-c(trt, y),
list(mean = mean, sd = sd),
.names = "{fn}.{col}"))
nhefs.B <- nhefs_ipd |>
dplyr::filter(trt == 1) |>
summarise(y.B.sum = sum(y),
y.B.bar = mean(y),
y.B.sd = sd(y),
N.B = n())
nhefs.C <- nhefs_ipd |>
dplyr::filter(trt == 0) |>
summarise(y.C.sum = sum(y),
y.C.bar = mean(y),
y.C.sd = sd(y),
N.C = n())
nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)
# simplified formula
lin_form <-
as.formula(y ~ trt * (sex + race + age + I(age^2) +
smokeintensity + I(smokeintensity^2) +
smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <-
strategy_gcomp_ml(formula = lin_form,
family = gaussian(link = "identity"))
res <- outstandR(ald_trial = nhefs_ald,
ipd_trial = nhefs_ipd,
strategy = nhefs_strat,
scale = "log_relative_risk")
res
#> Object of class 'outstandR'
#> Model: gaussian
#> Scale: log_relative_risk
#> Common treatment: C
#> Individual patient data study: AC
#> Aggregate level data study: BC
#> Confidence interval level: 0.95
#>
#> Contrasts:
#>
#> # A tibble: 3 × 5
#> Treatments Estimate Std.Error lower.0.95 upper.0.95
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 AB NaN NaN NaN NaN
#> 2 AC 1.07 0.0198 0.790 1.34
#> 3 BC NaN NaN NaN NaN
#>
#> Absolute:
#>
#> # A tibble: 2 × 5
#> Treatments Estimate Std.Error lower.0.95 upper.0.95
#> <chr> <dbl> <dbl> <lgl> <lgl>
#> 1 A 5.26 0.192 NA NA
#> 2 C 1.82 0.0392 NA NA
Alternatively, rather than dropping factors, we can create dummy
variables to replace them using
maicplus::dummize_ipd()
.
nhefs_ipd <- nhefs_dat |>
dplyr::select(qsmk, sex, race, age, smokeintensity, smokeyrs, wt71,
wt82_71, education, exercise, active) |>
dplyr::rename(trt = qsmk,
y = wt82_71) |>
dplyr::mutate(sex = as.numeric(sex) - 1, # binary
race = as.numeric(race) - 1) |>
maicplus::dummize_ipd(dummize_cols = c("education", "exercise", "active"),
dummize_ref_level = c("1","0","0")) |>
dplyr::select(-education, -exercise, -active) |> # remove originals
dplyr::mutate(trt = factor(trt, labels = c("C", "A")))
# # replace with this?
# model.matrix()
# create aggregate data
nhefs.X <- nhefs_ipd %>%
summarise(across(-c(trt, y),
list(mean = mean, sd = sd),
.names = "{fn}.{col}"))
nhefs.B <- dplyr::filter(nhefs_ipd, trt == 1) |>
summarise(y.B.sum = sum(y),
y.B.bar = mean(y),
y.B.sd = sd(y),
N.B = n())
nhefs.C <- dplyr::filter(nhefs_ipd, trt == 0) |>
summarise(y.C.sum = sum(y),
y.C.bar = mean(y),
y.C.sd = sd(y),
N.C = n())
nhefs_ald <- cbind.data.frame(nhefs.X, nhefs.C, nhefs.B)
lin_form <-
as.formula(y ~ trt * (sex + race + age + I(age^2) +
smokeintensity + I(smokeintensity^2) +
EDUCATION_2 + EDUCATION_3 + EDUCATION_4 + EDUCATION_5 +
EXERCISE_1 + EXERCISE_2 + ACTIVE_1 + ACTIVE_2 +
smokeyrs + I(smokeyrs^2) + wt71 + I(wt71^2)))
nhefs_strat <-
strategy_gcomp_ml(formula = lin_form,
family = gaussian(link = "identity"))
res <- outstandR(ald_trial = nhefs_ald,
ipd_trial = nhefs_ipd,
strategy = nhefs_strat,
scale = "risk_difference")
res
#> Object of class 'outstandR'
#> Model: gaussian
#> Scale: risk_difference
#> Common treatment: C
#> Individual patient data study: AC
#> Aggregate level data study: BC
#> Confidence interval level: 0.95
#>
#> Contrasts:
#>
#> # A tibble: 3 × 5
#> Treatments Estimate Std.Error lower.0.95 upper.0.95
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 AB NaN NA NA NA
#> 2 AC 3.59 0.235 2.64 4.54
#> 3 BC NaN NA NA NA
#>
#> Absolute:
#>
#> # A tibble: 2 × 5
#> Treatments Estimate Std.Error lower.0.95 upper.0.95
#> <chr> <dbl> <dbl> <lgl> <lgl>
#> 1 A 5.27 0.192 NA NA
#> 2 C 1.68 0.0437 NA NA
The 2024 ISPOR
poster by Hatswell presents a comparison of different MAIC R
packages. We shall use this as our starting point. Of the packages
considered, {MAIC}
is not on CRAN so not included in our
analysis at the moment.
maicplus
Roche developed the maic
package which is now deprecated
and replaced by maicplus
. We will follow the article titled
Anchored
Binary Analysis which is part of their package to make our
comparison.
library(maicplus)
# Load required data
data(centered_ipd_twt)
data(adrs_twt) # response IPD
data(adsl_sat) # covariates IPD
# Load required package
library(dplyr)
# Prepare matching variables (only using means, ignoring SDs)
adsl_sat <- adsl_sat |>
mutate(
SEX_MALE = as.integer(SEX == "Male"),
AGE_SQUARED = AGE^2
)
# Define variable names for matching (excluding SDs)
adsl_colnames <- c("AGE", "AGE_SQUARED", "SEX_MALE", "ECOG0", "SMOKE", "N_PR_THER_MEDIAN")
centered_colnames <- paste0(adsl_colnames, "_CENTERED")
# Estimate weights based only on means
weighted_data <- estimate_weights(
data = centered_ipd_twt,
centered_colnames = centered_colnames
)
# Aggregate summary data for AGD (ONLY MEANS, NO SDs)
agd <- data.frame(
AGE_MEAN = 51,
SEX_MALE_PROP = 147 / 300,
ECOG0_PROP = 0.40,
SMOKE_PROP = 58 / (300 - 5),
N_PR_THER_MEDIAN = 2
)
# Binary AGD summary data
binary_agd <- data.frame(
ARM = rep(c("B", "C"), each = 2),
RESPONSE = rep(c("YES", "NO"), 2),
COUNT = c(280, 200, 120, 200)
)
# Generate pseudo-IPD from binary AGD
pseudo_adrs <- get_pseudo_ipd_binary(
binary_agd = binary_agd,
format = "stacked"
)
# Perform MAIC analysis (matching only on means)
res_maicplus <- maic_anchored(
weights_object = weighted_data,
ipd = adrs_twt,
pseudo_ipd = pseudo_adrs,
trt_ipd = "A",
trt_agd = "B",
trt_common = "C",
normalize_weight = FALSE,
endpoint_type = "binary",
endpoint_name = "Binary Endpoint",
eff_measure = "OR",
binary_robust_cov_type = "HC3"
)
res_maicplus
#> $descriptive
#> $descriptive$summary
#> trt_ind treatment type n events events_pct
#> 1 C C IPD, before matching 500.0000 338.0000 67.60000
#> 2 A A IPD, before matching 500.0000 390.0000 78.00000
#> 3 C C IPD, after matching 199.4265 134.0364 67.21094
#> 4 A A IPD, after matching 199.4265 142.8968 71.65386
#> 5 C C AgD, external 320.0000 120.0000 37.50000
#> 6 B B AgD, external 480.0000 280.0000 58.33333
#>
#>
#> $inferential
#> $inferential$fit
#> $inferential$fit$model_before_ipd
#>
#> Call: glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link),
#> data = ipd)
#>
#> Coefficients:
#> (Intercept) ARMA
#> 0.7354 0.5302
#>
#> Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
#> Null Deviance: 1170
#> Residual Deviance: 1157 AIC: 1161
#>
#> $inferential$fit$model_after_ipd
#>
#> Call: glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link),
#> data = ipd, weights = weights)
#>
#> Coefficients:
#> (Intercept) ARMA
#> 0.7177 0.2096
#>
#> Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
#> Null Deviance: 491.1
#> Residual Deviance: 490.1 AIC: 458.4
#>
#> $inferential$fit$model_agd
#>
#> Call: glm(formula = RESPONSE ~ ARM, family = binomial(link = glm_link),
#> data = pseudo_ipd)
#>
#> Coefficients:
#> (Intercept) ARMB
#> -0.5108 0.8473
#>
#> Degrees of Freedom: 799 Total (i.e. Null); 798 Residual
#> Null Deviance: 1109
#> Residual Deviance: 1075 AIC: 1079
#>
#> $inferential$fit$res_AC
#> $inferential$fit$res_AC$est
#> [1] 1.233204
#>
#> $inferential$fit$res_AC$se
#> [1] 0.3085377
#>
#> $inferential$fit$res_AC$ci_l
#> [1] 0.7710134
#>
#> $inferential$fit$res_AC$ci_u
#> [1] 1.972458
#>
#> $inferential$fit$res_AC$pval
#> [1] 0.3817109
#>
#>
#> $inferential$fit$res_AC_unadj
#> $inferential$fit$res_AC_unadj$est
#> [1] 1.699301
#>
#> $inferential$fit$res_AC_unadj$se
#> [1] 0.2488482
#>
#> $inferential$fit$res_AC_unadj$ci_l
#> [1] 1.280998
#>
#> $inferential$fit$res_AC_unadj$ci_u
#> [1] 2.254199
#>
#> $inferential$fit$res_AC_unadj$pval
#> [1] 0.0002354448
#>
#>
#> $inferential$fit$res_BC
#> $inferential$fit$res_BC$est
#> [1] 2.333333
#>
#> $inferential$fit$res_BC$se
#> [1] 0.3510631
#>
#> $inferential$fit$res_BC$ci_l
#> [1] 1.745809
#>
#> $inferential$fit$res_BC$ci_u
#> [1] 3.118579
#>
#> $inferential$fit$res_BC$pval
#> [1] 1.035032e-08
#>
#>
#> $inferential$fit$res_AB
#> result pvalue
#> "0.53[0.30; 0.92]" "0.024"
#>
#> $inferential$fit$res_AB_unadj
#> result pvalue
#> "0.73[0.49; 1.09]" "0.125"
#>
#> $inferential$fit$boot_res
#> NULL
#>
#> $inferential$fit$boot_res_AC
#> NULL
#>
#> $inferential$fit$boot_res_AB_mc
#> NULL
#>
#> $inferential$fit$boot_res_AB
#> NULL
#>
#>
#> $inferential$summary
#> case OR LCL UCL pval
#> 1 AC 1.6993007 1.2809976 2.2541985 2.354448e-04
#> 2 adjusted_AC 1.2332036 0.7710134 1.9724576 3.817109e-01
#> 3 BC 2.3333333 1.7458092 3.1185794 1.035032e-08
#> 4 AB 0.7282717 0.4857575 1.0918611 1.248769e-01
#> 5 adjusted_AB 0.5285158 0.3043103 0.9179084 2.356848e-02
library(dplyr, verbose = FALSE, quietly = TRUE)
library(glue, verbose = FALSE, quietly = TRUE)
##TODO: why has this got median in name here?
adsl_colnames[adsl_colnames == "N_PR_THER_MEDIAN"] <- "N_PR_THER"
lin_form <- as.formula(glue("y ~ trt * ({paste(adsl_colnames, collapse = ' + ')})"))
ipd_trial <- adsl_twt |>
merge(adrs_twt) |>
rename(trt = ARM,
y = RESPONSE) |>
mutate(AGE_SQUARED = AGE^2)
# sd from IPD
agd_sd <- ipd_trial |>
summarise(across(c(AGE, AGE_SQUARED, SMOKE, ECOG0, N_PR_THER, SEX_MALE),
.fns = sd, .names = "sd.{.col}"))
# transform names to format needed in outstandR
# mean.* and y.*
# note that we're nto going to use sd in matching
ald_trial <- agd |>
rename(mean.AGE = AGE_MEAN,
# sd.AGE = AGE_SD,
mean.SEX_MALE = SEX_MALE_PROP,
mean.ECOG0 = ECOG0_PROP,
mean.SMOKE = SMOKE_PROP,
mean.N_PR_THER = N_PR_THER_MEDIAN) |>
mutate(
# mean.AGE_SQUARED = sd.AGE^2 + mean.AGE^2,
# mean.AGE_SQUARED = mean.AGE^2, ##TODO: this is a fudge
# sd.SEX_MALE = sqrt(mean.SEX_MALE * (1 - mean.SEX_MALE)),
# sd.ECOG0 = sqrt(mean.ECOG0 * (1 - mean.ECOG0)),
# sd.SMOKE = sqrt(mean.SMOKE * (1 - mean.SMOKE)),
# outcomes
N.B = 480,
y.B.sum = binary_agd[binary_agd$ARM == "B" & binary_agd$RESPONSE == "YES", "COUNT"],
y.B.bar = y.B.sum/N.B,
N.C = 320,
y.C.sum = binary_agd[binary_agd$ARM == "C" & binary_agd$RESPONSE == "YES", "COUNT"],
y.C.bar = y.C.sum/N.C) |>
cbind(agd_sd) |>
relocate(N.B, y.B.sum, y.B.bar, N.C, y.C.sum, y.C.bar, .after = sd.SEX_MALE)
ald_trial
#> mean.AGE mean.SEX_MALE mean.ECOG0 mean.SMOKE mean.N_PR_THER sd.AGE
#> 1 51 0.49 0.4 0.1966102 2 9.006409
#> sd.AGE_SQUARED sd.SMOKE sd.ECOG0 sd.N_PR_THER sd.SEX_MALE N.B y.B.sum
#> 1 1085.096 0.4667096 0.4913302 1.100505 0.4866013 480 280
#> y.B.bar N.C y.C.sum y.C.bar
#> 1 0.5833333 320 120 0.375
Now that we’ve prepared the input data we’re ready to run the model.
##TODO: fix bug of large value in optim()
outstandR_maic <-
outstandR(ipd_trial, ald_trial,
strategy = strategy_maic(formula = lin_form,
family = binomial(link = "logit")))
maicChecks
We now follow the examples for the package maicChecks
.
##TODO:
library(maicChecks)
For G-formula we will compare with the output from the packages
gFormulaMI
and gfoRmula
.
gFormulaMI
##TODO:
library(gFormulaMI)