Hello everyone,
I'm planning a between-subjects, within-items experiment with a binary
variable (accuracy). I know that accuracy varies depending on a number
of subject- and item-related variables that I'm not particularly
interested in for the purposes of this experiment but that I can't
neutralise either. In linear regression models, including important but
uninteresting covariates improves statistical power and that benefit
seems to carry over to (simple) logistic regression, too
(http://pidgin.ucsd.edu/pipermail/r-lang/2015-July/000816.html). I'd
like to find out whether it'd be worth the effort (in terms of keeping
the participants around) to collect these subject-related variables in
order to include them as covariates in a logistic mixed-effect model.
I ran some simulations (with, unfortunately, only few runs, as these
simulations take forever) to explore the effect of including subject-
and item-related covariates in a logistic GLMM when analysing a
randomised between-subjects experiment (details below). Regardless of
how I included the covariates (fixed effect only, fixed effect + random
slope, fixed-effect + random slope + correlation), it didn't seem to
affect the model's power, and I don't readily understand why, which is
why I've got two questions for you:
1. Does the simulation below make sense?
2. If it does, could anyone please explain to me in plain language why
including these covariates doesn't affect the model's power?
Thanks,
Jan
### The simulation
# I took the dataset from a between-subjects experiment similar to the
one I hope to run as a starting point.
dat <-
read.csv("http://homeweb.unifr.ch/VanhoveJ/Pub/Data/correspondences_shortened.csv")
# Summarise data per participant and per item in order to later generate
new covariate values:
# Summarise by participant
library(plyr)
perPart <- ddply(dat, .(Subject), summarise,
c.EnglishScore = mean(c.EnglishScore)) # the
subject-related covariate
# Summarise by item
perItem <- ddply(dat, .(Item), summarise,
c.LevDist = mean(c.LevDist)) # the item-related covariate
# LearningCondition is the experimental condition; recode to -0.5 and 0.5:
dat$LearningCondition <- as.numeric(dat$LearningCondition) - 1.5
# Fit a GLMM on the original data
library(lme4)
mod <- glmer(CorrectVowel ~ LearningCondition + c.LevDist + c.EnglishScore +
(1 + c.LevDist | Subject) + (1 + c.EnglishScore +
LearningCondition | Item),
data = dat, family = binomial, control =
glmerControl(optimizer="bobyqa"))
# Extract random effects
thetas <- getME(mod,"theta")
# Extract fixed effects:
betas <- fixef(mod)
# Set fixed effects for the simulation:
betas[2] <- 0.5 # effect of experimental condition
betas[3] <- -3 # effect of item-related covariate
betas[4] <- 3 # effect of subject-related covariate
# Function for simulating new data
power.simulation2 <- function(k = 80, m = 24) {
### Generate new data set with k participants and m items
# First generate k new participants,
# randomly assign them to the experimental/control conditions,
# and assign an EnglishScore to them:
parts <- data.frame(Subject = factor(1:k),
LearningCondition = sample(c(rep(-0.5, k/2),
rep(0.5, k/2))),
c.EnglishScore = sample(perPart$c.EnglishScore,
k, replace = TRUE))
# Then generate m new items,
# and assign a LevDist to them.
items <- data.frame(Item = factor(1:m),
c.LevDist = sample(perItem$c.LevDist, m, replace
= TRUE))
# Fully cross participants and items,
newdat <- expand.grid(Subject = factor(1:k),
Item = factor(1:m))
# add the item- and participant-related variables to new dataset:
newdat <- merge(newdat, parts, by = "Subject")
newdat <- merge(newdat, items, by = "Item")
# Generate new dependent variable
newdat$New <- factor(unlist(simulate(mod,
newdata = newdat,
allow.new.levels = TRUE,
newparams = list(theta = thetas,
beta = betas))))
### Run new model comparison with both within-subject (LevDist) and
within-item (EnglishScore) covariates
# mod1: random intercepts + random slope for between-subject
condition (no-covariate null model)
mod1 <- glmer(New ~ (1 | Subject) + (1 + LearningCondition | Item),
data = newdat, family = binomial,
control = glmerControl(optimizer="bobyqa"))
# mod2: add experimental condition FE
mod2 <- update(mod1, . ~ . + LearningCondition)
# mod3: random intercepts + random slopes for between- and
within-subject variables as well as FE (covariate null model)
mod3 <- glmer(New ~ c.LevDist + c.EnglishScore + (1 + c.LevDist |
Subject) + (1 + LearningCondition + c.EnglishScore | Item),
data = newdat, family = binomial,
control = glmerControl(optimizer="bobyqa"))
# mod4: add experimental condition FE
mod4 <- update(mod3, . ~ . + LearningCondition)
# Compare mod1 and mod2 and return p-value (p-value for no-covariate
model)
pvaluewithout <- anova(mod1, mod2)[2, 8]
# Compare mod3 and mod4 and return p-value (p-value for covariate model)
pvaluewith <- anova(mod3, mod4)[2, 8]
# Save estimate and se of mod2 (no-covariate model)
est.without <- summary(mod2)$coef[2,1]
se.without <- summary(mod2)$coef[2,2]
# Save estimate and se of mod4 (covariate model)
est.with <- summary(mod4)$coef[4,1]
se.with <- summary(mod4)$coef[4,2]
# Return p-values, estimates and standard errors
return(list(pvaluewith,
pvaluewithout,
est.with,
est.without,
se.with,
se.without))
}
# Run power.simulation2() 50 times with 80 participants and 50 items per
participant.
# This takes a while, hence the low number of runs.
covar.adjust <- replicate(50, power.simulation2(k = 80, m = 50))
# Simulation results in a number of warning messages:
# 1: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
# unable to evaluate scaled gradient
# 2: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
# Model failed to converge: degenerate Hessian with 1 negative eigenvalues
# In terms of power, the two models don't differ from each other:
table(unlist(covar.adjust[1,]) <= 0.05, unlist(covar.adjust[2,]) <= 0.05)
# FALSE TRUE
# FALSE 14 1
# TRUE 2 33
# Here are the summaries per measure:
# 1. p-values for model with covariates:
summary(unlist(covar.adjust[1,]))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000179 0.0036060 0.0141600 0.0743900 0.0704900 0.7896000
# 2. p-values for model without covariates:
summary(unlist(covar.adjust[2,]))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000225 0.0030270 0.0168300 0.0720500 0.0643600 0.5697000
# 3. estimate of exp. condition with covariates:
summary(unlist(covar.adjust[3,]))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.06243 0.43710 0.56460 0.55000 0.65240 1.02900
# 4. estimate of exp. condition without covariates:
summary(unlist(covar.adjust[4,]))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.1385 0.4279 0.5399 0.5507 0.6809 1.0250
# 5. standard error of exp. condition with covariates:
summary(unlist(covar.adjust[5,]))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.1870 0.2130 0.2249 0.2271 0.2327 0.2894
# 6. standard error of exp. condition without covariates
summary(unlist(covar.adjust[6,]))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.1913 0.2199 0.2260 0.2313 0.2406 0.2911