Hello everyone,

I'm planning a between-subjects, within-items experiment with a binary variable (accuracy). I know that accuracy varies depending on a number of subject- and item-related variables that I'm not particularly interested in for the purposes of this experiment but that I can't neutralise either. In linear regression models, including important but uninteresting covariates improves statistical power and that benefit seems to carry over to (simple) logistic regression, too (http://pidgin.ucsd.edu/pipermail/r-lang/2015-July/000816.html). I'd like to find out whether it'd be worth the effort (in terms of keeping the participants around) to collect these subject-related variables in order to include them as covariates in a logistic mixed-effect model.

I ran some simulations (with, unfortunately, only few runs, as these simulations take forever) to explore the effect of including subject- and item-related covariates in a logistic GLMM when analysing a randomised between-subjects experiment (details below). Regardless of how I included the covariates (fixed effect only, fixed effect + random slope, fixed-effect + random slope + correlation), it didn't seem to affect the model's power, and I don't readily understand why, which is why I've got two questions for you:

1. Does the simulation below make sense?

2. If it does, could anyone please explain to me in plain language why including these covariates doesn't affect the model's power?

Thanks,
Jan

### The simulation
# I took the dataset from a between-subjects experiment similar to the one I hope to run as a starting point.

dat <- read.csv("http://homeweb.unifr.ch/VanhoveJ/Pub/Data/correspondences_shortened.csv";)

# Summarise data per participant and per item in order to later generate new covariate values:
# Summarise by participant
library(plyr)
perPart <- ddply(dat, .(Subject), summarise,
c.EnglishScore = mean(c.EnglishScore)) # the subject-related covariate
# Summarise by item
perItem <- ddply(dat, .(Item), summarise,
                 c.LevDist = mean(c.LevDist)) # the item-related covariate

# LearningCondition is the experimental condition; recode to -0.5 and 0.5:
dat$LearningCondition <- as.numeric(dat$LearningCondition) - 1.5

# Fit a GLMM on the original data
library(lme4)
mod <- glmer(CorrectVowel ~ LearningCondition + c.LevDist + c.EnglishScore +
(1 + c.LevDist | Subject) + (1 + c.EnglishScore + LearningCondition | Item), data = dat, family = binomial, control = glmerControl(optimizer="bobyqa"))

# Extract random effects
thetas <- getME(mod,"theta")

# Extract fixed effects:
betas <- fixef(mod)

# Set fixed effects for the simulation:
betas[2] <- 0.5 # effect of experimental condition
betas[3] <- -3 # effect of item-related covariate
betas[4] <- 3 # effect of subject-related covariate

# Function for simulating new data
power.simulation2 <- function(k = 80, m = 24) {

  ### Generate new data set with k participants and m items

  # First generate k new participants,
  # randomly assign them to the experimental/control conditions,
  # and assign an EnglishScore to them:
  parts <- data.frame(Subject = factor(1:k),
                      LearningCondition = sample(c(rep(-0.5, k/2),
                                                   rep(0.5, k/2))),
c.EnglishScore = sample(perPart$c.EnglishScore, k, replace = TRUE))

  # Then generate m new items,
  # and assign a LevDist to them.
  items <- data.frame(Item = factor(1:m),
c.LevDist = sample(perItem$c.LevDist, m, replace = TRUE))

  # Fully cross participants and items,
  newdat <- expand.grid(Subject = factor(1:k),
                        Item = factor(1:m))
  # add the item- and participant-related variables to new dataset:
  newdat <- merge(newdat, parts, by = "Subject")
  newdat <- merge(newdat, items, by = "Item")

  # Generate new dependent variable
  newdat$New <- factor(unlist(simulate(mod,
                                    newdata = newdat,
                                    allow.new.levels = TRUE,
newparams = list(theta = thetas, beta = betas))))

### Run new model comparison with both within-subject (LevDist) and within-item (EnglishScore) covariates

# mod1: random intercepts + random slope for between-subject condition (no-covariate null model)
  mod1 <- glmer(New ~  (1 | Subject) + (1 + LearningCondition | Item),
                data = newdat, family = binomial,
                control = glmerControl(optimizer="bobyqa"))

  # mod2: add experimental condition FE
  mod2 <- update(mod1, . ~ . + LearningCondition)

# mod3: random intercepts + random slopes for between- and within-subject variables as well as FE (covariate null model) mod3 <- glmer(New ~ c.LevDist + c.EnglishScore + (1 + c.LevDist | Subject) + (1 + LearningCondition + c.EnglishScore | Item),
                data = newdat, family = binomial,
                control = glmerControl(optimizer="bobyqa"))

  # mod4: add experimental condition FE
  mod4 <- update(mod3, . ~ . + LearningCondition)


# Compare mod1 and mod2 and return p-value (p-value for no-covariate model)
  pvaluewithout <- anova(mod1, mod2)[2, 8]

  # Compare mod3 and mod4 and return p-value (p-value for covariate model)
  pvaluewith <- anova(mod3, mod4)[2, 8]

  # Save estimate and se of mod2 (no-covariate model)
  est.without <- summary(mod2)$coef[2,1]
  se.without <- summary(mod2)$coef[2,2]

  # Save estimate and se of mod4 (covariate model)
  est.with <- summary(mod4)$coef[4,1]
  se.with <- summary(mod4)$coef[4,2]

  # Return p-values, estimates and standard errors
  return(list(pvaluewith,
              pvaluewithout,
              est.with,
              est.without,
              se.with,
              se.without))
}

# Run power.simulation2() 50 times with 80 participants and 50 items per participant.
# This takes a while, hence the low number of runs.
covar.adjust <- replicate(50, power.simulation2(k = 80, m = 50))

# Simulation results in a number of warning messages:
# 1: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv,  :
#  unable to evaluate scaled gradient
# 2: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv,  :
#  Model failed to converge: degenerate  Hessian with 1 negative eigenvalues

# In terms of power, the two models don't differ from each other:
table(unlist(covar.adjust[1,]) <= 0.05, unlist(covar.adjust[2,]) <= 0.05)

#        FALSE TRUE
#  FALSE    14    1
#  TRUE      2   33

# Here are the summaries per measure:

# 1. p-values for model with covariates:
summary(unlist(covar.adjust[1,]))
#     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
# 0.0000179 0.0036060 0.0141600 0.0743900 0.0704900 0.7896000

# 2. p-values for model without covariates:
summary(unlist(covar.adjust[2,]))
#     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
# 0.0000225 0.0030270 0.0168300 0.0720500 0.0643600 0.5697000

# 3. estimate of exp. condition with covariates:
summary(unlist(covar.adjust[3,]))
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.06243 0.43710 0.56460 0.55000 0.65240 1.02900

# 4. estimate of exp. condition without covariates:
summary(unlist(covar.adjust[4,]))
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.1385  0.4279  0.5399  0.5507  0.6809  1.0250

# 5. standard error of exp. condition with covariates:
summary(unlist(covar.adjust[5,]))
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.1870  0.2130  0.2249  0.2271  0.2327  0.2894

# 6. standard error of exp. condition without covariates
summary(unlist(covar.adjust[6,]))
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.1913  0.2199  0.2260  0.2313  0.2406  0.2911

Reply via email to