Releases: robertthiesmeier/unmeasured_confounding
Code for the simulation of low driving mileage bias
Code for the simulation of low driving mileage bias
Summary: Probabilistic sensitivity analysis using Monte Carlo simulations (MCS) has become an increasingly valuable tool to assess unmeasured confounding. This study applies MCSs to quantify suspected unmeasured confounding, i.e. low driving mileage, on the effect of polypharmacy and road traffic crashes (RTCs) in older adults.
Aim: To determine the impact of low driving mileage bias on the effect of polypharmacy on RTCs in older adults.
This study applies an MCS approach to the case of unmeasured confounding, namely low annual driving mileage, on the effect of polypharmacy on road traffic crashes (RTCs) in older adults. The term unmeasured confounding is used to refer to confounding due to a lack of data for adjustment and consequent omission from the statistical model. In addition, we define concurrent consumption of five or more prescribed medications in 90 days as polypharmacy. The code for the simulation can be used and edited based on different assumptions and parameters.
Stata Code
Loop for simulation
capture program drop sim_logit_driving
program define sim_logit_driving, rclass
version 16.1
syntax [, obs(integer 100000) ]
drop _all
set obs `obs'
*Parameters for K
gen age = rbinomial(1, .3) // >65yo
gen sex = rbinomial(1, .52) // female
gen area = rbinomial(1, .6) // urban
*Baseline b0 characteristics
scalar a0 = logit(.3) // low driving milage: 30%
scalar c0 = logit(0.17) // Polypharmacy:17%
scalar b0 = logit(.002) // RTC 0.2%
*Parameters for the ZK relationship
scalar a1 = log(1.2) // Age
scalar a2 = log(1.8) // Sex
scalar a3 = log(1.2) // Area
*Parameters for the XKZ relationship
scalar c1 = log(2) // Age
scalar c2 = log(0.9) // Sex
scalar c3 = log(1.2) // Area
scalar c4 = log(1.5) // XZ
*Parameters for the YXZK relationship
scalar b1 = rnormal(ln(1.35), (ln(1.56)-ln(1.17))/(2*1.96)) // X
scalar b2 = log(3) // Z
scalar b3 = log(0.6) // Age
scalar b4 = log(0.3) // Sex
scalar b5 = log(0.9) // Area
gen z = rbinomial(1, invlogit(a0 + a1*age + a2*sex + a3*area))
gen x = rbinomial(1, invlogit(c0 + c1*age + c2*sex + c3*area + c4*z))
gen y = rbinomial(1, invlogit(b0 + b1*x + b2*z + b3*age + b4*sex +
b5*area))
* DGM under an effect of X not adjusted for Z
di _n "Ignoring confounding by Z"
logit y x age sex area, or
return scalar b1_1_x = _b[x]
return scalar se_b1_1_x = _se[x]
* DGM under an effect of X adjusted for Z
di _n "Adjusting for confounding by Z"
logit y x z age sex area, or
test _b[x] = 0
return scalar p = r(p)
return scalar b1_1_z = _b[x]
return scalar se_b1_1_z = _se[x]
end
Simulate one large prospective study of size n = 100000
sim_logit_driving
Simulate a 1000 studies, each n=100000
simulate p = r(p) b1_1_x = r(b1_1_x) se_b1_1_x = r(se_b1_1_x) ///
b1_1_z = r(b1_1_z) se_b1_1_z = r(se_b1_1_z), reps(1000):
sim_logit_driving
count if p < 0.05
display "Simulated Power = " r(N)/c(N)*100
OR1 (effect of X not adjusting for Z) (95%CI)
gen orx = exp(b1_1_x)
su b1_1_x
di exp(r(mean)-1.96*r(sd))
di exp(r(mean)+1.96*r(sd))
OR0 (effect of X adjusting for Z) (95%CI)
gen orz = exp(b1_1_z)
su b1_1_z
di exp(r(mean)-1.96*r(sd))
di exp(r(mean)+1.96*r(sd))
Graphical presentation
twoway ///
(histogram orx, bin(60) fc(black%10) lc(black%30)) ///
(histogram orz, bin(60) fc(black%25) lc(black%25)) ///
, xlabel(#10) ylabel(#10, angle(horiz) format(%3.2f)) ///
xline(1.37 1.53, lc(black%80) lp(-)) ///
xtitle("Odds Ratio (ln)") xscale(log) ytitle("Simulated
Distribution") ///
scheme(s1mono) plotregion(style(none)) ///
ylabel(#10) plotregion(style(none)) ///
legend(order(1 2) label(1 "Crude OR{sub:{it:YX}}=1.53") label(2
"Adjusted OR{sub:{it:YX|Z}}=1.37") ring(0) col(1) pos(1)
size(small) region(style(none))) name(fig1, replace)
* Kernel Density: Comparison Effect of X vs no Effect of X after Adjustment of Z
twoway ///
(kdensity orx, color(black%10) cmissing(y) recast(area)) ///
(kdensity orz, color(black%25) cmissing(y) recast(area)), ///
xline(1.37 1.53, lc(black%50) lp(-)) ///
ytitle("Simulated Distribution") ///
xtitle("Odds Ratio (ln)") xscale(log) ///
ylabel(#10, angle(horiz) format(%3.2f)) ///
xlabel(#10) scheme(s1mono) plotregion(style(none)) ///
legend(order(1 2) label(1 "Crude OR{sub:{it:YX}}=1.53") ///
label(2 "Adjusted OR{sub:{it:YX|Z}}=1.37") ring(0) col(1)
pos(1) size(small) region(style(none))) name(fig2, replace)
* Combined Graph
graph combine fig1 fig2 ///
, ycommon cols(1) scheme(s1mono) name(fig_combine, replace)
Python Code
Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
Define the mathematical function invlogit
def invlogit(x):
return np.exp(x)/(1+np.exp(x))
Define the mathematical function logit
def logit(x):
return np.log(x/(1-x))
def sim_logit_driving():
n = 100000
Parameters
# Parameters for K
age = np.random.binomial(1, .3, n) # >65yo
sex = np.random.binomial(1, .52, n) # female
area = np.random.binomial(1, .6, n) # urban
# Baseline b0 characteristics
a0 = logit(.3) # low driving mileage
c0 = logit(0.17) # Polypharmacy
b0 = logit(.002) # RTC
# Parameters for the ZK relationship
a1 = np.log(1.2)
a2 = np.log(1.8)
a3 = np.log(1.2)
# Parameters for the XKZ relationship
c1 = np.log(2)
c2 = np.log(0.9)
c3 = np.log(1.2)
c4 = np.log(1.5)
# Parameters for the YXZK relationship
b1 = np.random.normal(np.log(1.35), (np.log(1.56)-
np.log(1.17))/(2*1.96), 1)
b2 = np.log(3)
b3 = np.log(0.6)
b4 = np.log(0.3)
b5 = np.log(0.9)
z = np.random.binomial(1, invlogit(a0 + a1*age + a2*sex +
a3*area), n)
x = np.random.binomial(1, invlogit(c0 + c1*age + c2*sex +
c3*area + c4*z), n)
y = np.random.binomial(1, invlogit(b0 + b1*x + b2*z + b3*age +
b4*sex + b5*area), n)
df = pd.DataFrame(data = np.column_stack((y, x, z,age, sex,
area) ))
DGM under an effect of X
model1 = smf.logit('y~x+z+age+sex+area', data=df).fit()
se = np.sqrt(np.diag(model1.cov_params()))
b1_1 = model1.params[1]
se_b1_1 = se[1]
# print(model1.summary())
DGM under no effect of X
b1 = 0
y = np.random.binomial(1, invlogit(b0 + b1*x + b2*z + b3*age +
b4*sex + b5*area), n)
model0 = smf.logit('y~x+z+age+sex+area', data=df).fit()
b1_0 = model0.params[1]
se = np.sqrt(np.diag(model0.cov_params()))
b1_0 = model0.params[1]
se_b1_0 = se[1]
# print(model0.summary())
b = (b1_1, se_b1_1, b1_0, se_b1_0)
return(b)
nsim = 1000
b = []
for i in range(nsim):
b = np.append(b, np.array(sim_logit_driving()), axis=0)
sim_results = np.reshape(b, (nsim, 4))
Extract point estimates and std errors
b1 = sim_results[:,0]
se_b1 = sim_results[:,1]
or1 = np.exp(sim_results[:,0])
or0 = np.exp(sim_results[:,2])
fig, ax = plt.subplots(figsize=(5, 3))
ax.set(xscale="log")
ax.hist(or0, color='red', alpha=.2, density=True, bins=30, rwidth=.9)
ax.set_xticks((0.6, 0.8, 1.0, 1.3, 1.6, 2))
ax.xaxis.set_major_formatter(FormatStrFormatter('%3.2f'))
plt.axvline(np.exp(np.mean(np.log(or0))), color='green', alpha=.5,
linestyle='dashed', linewidth=3)
plt.hist(or1, color='blue', alpha=.2, density=True, bins=30, rwidth=.9)
plt.axvline(np.exp(np.mean(np.log(or1))), color='green', alpha=.5,
linestyle='dashed', linewidth=3)
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
plt.ylabel("Sampling distribution")
plt.xlabel(r"$\hat \beta_1$")
plt.show()
average adjusted odds ratio under OR_XY|Z=1.35
np.exp(np.mean(np.log(or1)))
average adjusted odds ratio under OR_XY|Z=0
np.exp(np.mean(np.log(or0)))
Simulated statistical power
z = (b1-0)/se_b1
p = stats.norm.cdf(-abs(z))*2
power = np.sum(p<.05)/nsim
print("Simulated power (%) = " , power*100)
R code
Define the mathematical function invlogit
inv_logit <- function(x){1 / (1 + exp(-x))}
Loop for simulation
sim_logit_driving <- function() {
n = 100000
# Parameters for K
age <- rbinom(n, 1, .3) # >65yo
sex <- rbinom(n, 1, .52) # female
area <- rbinom(n, 1, .6) # urban
# Baseline b0 characteristics
a0 <- logit(.3) # low driving mileage
c0 <- logit(0.17) # Polypharmacy
b0 <- logit(.002) # RTC
# Parameters for the ZK relationship
a1 <- log(1.2)
a2 <- log(1.8)
a3 <- log(1.2)
# Parameters for the XKZ relationship
c1 <- log(2)
c2 <- log(0.9)
c3 <- log(1.2)
c4 <- log(1.5)
# Parameters for the YXZK relationship
b1 <- rnorm(1, log(1.35), (log(1.56)-log(1.17))/(2*1.96))
b2 <- log(3)
b3 <- log(0.6)
b4 <- log(0.3)
b5 <- log(0.9)
z <- rbinom(n, 1, inv_logit(a0 + a1*age + a2*sex + a3*area))
x <- rbinom(n, 1, inv_logit(c0 + c1*age + c2*sex + c3*area + c4*z))
y <- rbinom(n, 1, inv_logit(b0 + b1*x + b2*z + b3*age + b4*sex + b5*area))
# df = data.frame(data = cbind(y, x, z,age, sex, area) )
# DGM under an effect of X
...