exampleLogistic.Rmd
This is a simulation study in which adaptive FAB \(p\)-values are constructed for logistic regression coefficients. This document serves as the replication code for the example in Section 4.3 of the article “Smaller \(p\)-values via indirect information” (Hoff 2019).
The linking model we consider here is that the logistic regression coefficients \(\theta_1,\ldots, \theta_p\) are i.i.d. from a mixture of a point-mass distribution at zero and a normal distribution with some mean and variance.
Here is a function that estimates the parameters in the linking model. More generally, it obtains MLEs of \(\mu\) and \(\tau^2\) under the model \[ \begin{align*} y & \sim N_p( \theta , \sigma^2 I) \\ \theta_1,\ldots, \theta_p &\sim i.i.d\ (1-\pi) \delta_0(\theta) + \text{dnorm}(\theta, \mu,\tau) \end{align*} \]
fitMM<-function(y,s2){
obj<-function(gam){
pi<-gam[1] ; mu<-gam[2] ; t2<-gam[3]
-sum( log( pi*dnorm(y,mu,sqrt(t2+s2)) + (1-pi)*dnorm(y,0,sqrt(s2))) )
}
mut2<-optim( c(.9,mean(y),var(y)),obj,method="L-BFGS-B",
lower=c(.001,-Inf,.001),upper=c(.999,Inf,Inf))$par[2:3]
names(mut2)<-c("mu","t2")
mut2
}
For this example, for each \(\theta_j\) we will plug-in an estimate \(\hat\theta_{-j}\) of the vector \(\theta_{-j}\) using data that is independent (asymptotically) of \(\hat\theta_j\).
Here is the simulation:
set.seed(1)
p<-30
theta0<-rep(c(3,0),times=c(p/2,p/2))
nvals<-c(200,400,800,1600)
nsim<-5000
histW0<-histF0<-histW1<-histF1<-list()
sigTN<-sigNN<-NULL
par(mfrow=c(4,3),mar=c(3,3,1,1),mgp=c(1.75,.75,0))
for(k in 1:length(nvals)){
n<-nvals[k]
PF<-PW<-NULL
for(s in 1:nsim){
X<-matrix(rnorm(n*p),n,p)
lp<-X%*%theta0/sqrt(n)
y<-rbinom(n,1,1/(1+exp(-lp)))
fit<-glm(y~-1+X,family="binomial")
theta<-fit$coef
Sigma<-summary(fit)$cov.scaled
s2<-diag(Sigma)
MUT2<-NULL
for(j in 1:p){
G<-MASS::Null(Sigma[,j])
MUT2<-rbind(MUT2,fitMM( (G%*%(t(G)%*%theta))[-j],s2[-j]) )
}
z<-theta/sqrt(s2)
pW<-1-abs( pnorm(z)-pnorm(-z))
pF<-1-abs( pnorm(z+2*sqrt(s2)*MUT2[,1]/MUT2[,2]) - pnorm(-z) )
PF<-rbind(PF,pF) ; PW<-rbind(PW,pW)
}
histW0[[k]]<-hist(PW[,theta0==0],plot=FALSE)
histW1[[k]]<-hist(PW[,theta0!=0],plot=FALSE)
histF0[[k]]<-hist(PF[,theta0==0],plot=FALSE)
histF1[[k]]<-hist(PF[,theta0!=0],plot=FALSE)
hist(PF[,theta0!=0],col=rgb(.3,.3,.3,.5),prob=TRUE)
hist(PW[,theta0!=0],col=rgb(.8,.2,.5,.5),add=TRUE,prob=TRUE)
hist(PF[,theta0==0])
hist(PW[,theta0==0])
pW<-mean(PW[,theta0==0]<.05) ; pF<-mean(PF[,theta0==0]<.05)
sigTN<-cbind(sigTN,c(pW,pF))
pW<-mean(PW[,theta0!=0]<.05) ; pF<-mean(PF[,theta0!=0]<.05)
sigNN<-cbind(sigNN,c(pW,pF))
}
Here we save the results:
colnames(sigTN)<-colnames(sigNN)<-nvals
rownames(sigTN)<-rownames(sigNN)<-c("pW","pF")
names(histW0)<-names(histW1)<-nvals
names(histF0)<-names(histF1)<-nvals
save(sigTN,sigNN,histW0,histW1,histF0,histF1,file="resultsLogistic.RData")
Here are the fractions of true-null \(p\)-values below 0.05:
sigTN
## 200 400 800 1600
## pW 0.06944000 0.05845333 0.05421333 0.05048
## pF 0.06394667 0.05730667 0.05309333 0.05004
Here are the fractions of non-null \(p\)-values below 0.05:
sigNN
## 200 400 800 1600
## pW 0.2923733 0.3062933 0.3154933 0.3189067
## pF 0.4010933 0.4189067 0.4282800 0.4353733
Here are the plots from the article:
par(mfrow=c(4,3),mar=c(2.75,3.2,0,1),mgp=c(1.75,.75,0))
for(k in 1:4){
xlab<-1*(k==4) + 1
plot(histF1[[k]],col=rgb(.1,.4,.6,.5),main="",freq=FALSE,
xlab=c("","non-null p-values")[xlab],ylab="",xaxt=c("n","s")[xlab] )
plot(histW1[[k]],col=rgb(.9,.3,.3,.5),add=TRUE,freq=FALSE)
mtext(paste0("n=",names(histF1)[k]),2,line=2,cex=.85)
if(k==1){
legend(.5,7,legend=c("Wald","FAB"),pch=15,col=c(rgb(.9,.3,.3,.5),rgb(.1,.4,.6,.5)),bty="n")
}
plot(histW0[[k]],col="gray",main="",freq=FALSE,
xlab=c("","null Wald p-values")[xlab],ylab="",xaxt=c("n","s")[xlab])
plot(histF0[[k]],col="gray",main="",freq=FALSE,
xlab=c("","null FAB p-values")[xlab],ylab="",xaxt=c("n","s")[xlab])
}