## Summary

This is a simulation study in which adaptive FAB $$p$$-values are constructed for logistic regression coefficients. This document serves as the replication code for the example in Section 4.3 of the article “Smaller $$p$$-values via indirect information” (Hoff 2019).

The linking model we consider here is that the logistic regression coefficients $$\theta_1,\ldots, \theta_p$$ are i.i.d. from a mixture of a point-mass distribution at zero and a normal distribution with some mean and variance.

Here is a function that estimates the parameters in the linking model. More generally, it obtains MLEs of $$\mu$$ and $$\tau^2$$ under the model \begin{align*} y & \sim N_p( \theta , \sigma^2 I) \\ \theta_1,\ldots, \theta_p &\overset{\text{i.i.d.}}{\sim} (1-\pi)\, \delta_0(\theta) + \pi\, \text{dnorm}(\theta, \mu,\tau) \end{align*}

## Marginal MLE for the linking model
##   y_j ~ N(theta_j, s2_j),  theta_j ~ iid (1-pi) delta_0 + pi N(mu, tau^2).
## Arguments:
##   y  : vector of (approximately normal) coefficient estimates
##   s2 : vector of their sampling variances (same length as y)
## Returns the MLEs of mu and tau^2 as a named vector c(mu, t2); the mixing
## weight pi is estimated jointly but not returned.
fitMM <- function(y, s2) {

  ## Negative log-likelihood of the two-component mixture in
  ## gam = (w, mu, t2), where w is the weight on the normal component.
  ## (Named `w` rather than `pi` to avoid shadowing base::pi.)
  obj <- function(gam) {
    w <- gam[1] ; mu <- gam[2] ; t2 <- gam[3]
    -sum(log(w * dnorm(y, mu, sqrt(t2 + s2)) + (1 - w) * dnorm(y, 0, sqrt(s2))))
  }

  ## Box constraints keep the weight inside (0, 1) and tau^2 positive.
  mut2 <- optim(c(.9, mean(y), var(y)), obj, method = "L-BFGS-B",
                lower = c(.001, -Inf, .001),
                upper = c(.999, Inf, Inf))$par[2:3]
  names(mut2) <- c("mu", "t2")
  mut2
}

For this example, for each $$\theta_j$$ we will plug-in an estimate $$\hat\theta_{-j}$$ of the vector $$\theta_{-j}$$ using data that is independent (asymptotically) of $$\hat\theta_j$$. Here is the simulation:

set.seed(1)
p      <- 30
theta0 <- rep(c(3, 0), times = c(p/2, p/2))  # half non-null, half null
nvals  <- c(200, 400, 800, 1600)             # sample sizes
nsim   <- 5000                               # simulations per sample size

histW0 <- histF0 <- histW1 <- histF1 <- list()
sigTN <- sigNN <- NULL
par(mfrow = c(4, 3), mar = c(3, 3, 1, 1), mgp = c(1.75, .75, 0))

for (k in seq_along(nvals)) {
  n <- nvals[k]
  PF <- PW <- NULL
  for (s in 1:nsim) {

    ## simulate data and fit the logistic regression
    X  <- matrix(rnorm(n * p), n, p)
    lp <- X %*% theta0 / sqrt(n)
    y  <- rbinom(n, 1, 1 / (1 + exp(-lp)))
    fit   <- glm(y ~ -1 + X, family = "binomial")
    theta <- fit$coef
    Sigma <- summary(fit)$cov.scaled  # fixed: stray backslash before $ removed
    s2    <- diag(Sigma)

    ## For each j, estimate (mu, t2) from a projection of theta that is
    ## asymptotically independent of theta_j (columns of G span the null
    ## space of Sigma[, j]).
    MUT2 <- NULL
    for (j in 1:p) {
      G <- MASS::Null(Sigma[, j])
      MUT2 <- rbind(MUT2, fitMM((G %*% (t(G) %*% theta))[-j], s2[-j]))
    }

    ## Wald (pW) and adaptive FAB (pF) two-sided p-values
    z  <- theta / sqrt(s2)
    pW <- 1 - abs(pnorm(z) - pnorm(-z))
    pF <- 1 - abs(pnorm(z + 2 * sqrt(s2) * MUT2[, 1] / MUT2[, 2]) - pnorm(-z))
    PF <- rbind(PF, pF) ; PW <- rbind(PW, pW)
  }

  ## store p-value histograms, split by null (theta0 == 0) / non-null
  histW0[[k]] <- hist(PW[, theta0 == 0], plot = FALSE)
  histW1[[k]] <- hist(PW[, theta0 != 0], plot = FALSE)
  histF0[[k]] <- hist(PF[, theta0 == 0], plot = FALSE)
  histF1[[k]] <- hist(PF[, theta0 != 0], plot = FALSE)

  hist(PF[, theta0 != 0], col = rgb(.3, .3, .3, .5), prob = TRUE)

  hist(PF[, theta0 == 0])
  hist(PW[, theta0 == 0])

  ## empirical rejection rates at level 0.05: true nulls ...
  pW <- mean(PW[, theta0 == 0] < .05) ; pF <- mean(PF[, theta0 == 0] < .05)
  sigTN <- cbind(sigTN, c(pW, pF))

  ## ... and non-nulls (power)
  pW <- mean(PW[, theta0 != 0] < .05) ; pF <- mean(PF[, theta0 != 0] < .05)
  sigNN <- cbind(sigNN, c(pW, pF))

}

Here we save the results:

## Label the rejection-rate tables (rows = p-value type, columns = sample
## size) and the histogram lists, then write everything to disk.
dimnames(sigTN) <- dimnames(sigNN) <- list(c("pW", "pF"), nvals)
names(histW0) <- names(histW1) <- nvals
names(histF0) <- names(histF1) <- nvals
save(sigTN, sigNN, histW0, histW1, histF0, histF1, file = "resultsLogistic.RData")

Here are the fractions of true-null $$p$$-values below 0.05:

sigTN
##           200        400        800    1600
## pW 0.06944000 0.05845333 0.05421333 0.05048
## pF 0.06394667 0.05730667 0.05309333 0.05004

Here are the fractions of non-null $$p$$-values below 0.05:

sigNN
##          200       400       800      1600
## pW 0.2923733 0.3062933 0.3154933 0.3189067
## pF 0.4010933 0.4189067 0.4282800 0.4353733

Here are the plots from the article:

## Plot the non-null FAB p-value histograms, one row per sample size.
## Only the bottom row (k == 4) gets an x-axis and an x-label.
par(mfrow = c(4, 3), mar = c(2.75, 3.2, 0, 1), mgp = c(1.75, .75, 0))
for (k in 1:4) {

  bottom <- (k == 4)

  plot(histF1[[k]], col = rgb(.1, .4, .6, .5), main = "", freq = FALSE,
       xlab = if (bottom) "non-null p-values" else "",
       ylab = "",
       xaxt = if (bottom) "s" else "n")
}