##################### # general functions # ##################### haldane<- function(c1){ #calculate distance (M) #c1: recombination rate -0.5*log(1-2*c1) } haldane.inv<- function(d){ #calculate recombination rate #d: distance (M) (1-exp(-2*d))/2 } recomb.rate.ac<- function(r.ab,r.bc){ #calculate recombination rate between loci a and c (no interference) #r.ab: recombination rate between loci a and b #r.bc: recombination rate between loci b and c r.ab+r.bc-2*r.ab*r.bc } recomb.rate.bc<- function(r.ab,r.ac){ #calculate recombination rate between loci b and c (no interference) (r.ac-r.ab)/(1-2*r.ab) } getprob<- function(m1,m2,r1,r){ #conditional probability of QTL(QQ) given flanking markers #r: recombination rate between flanking markers m1 and m2 #r1: recombination rate between marker m1 and QTL r2<- recomb.rate.bc(r1,r) (m1-r1)*(m2-r2)/((m1-1)*m2+m1*(m2-1)+1-r) } rr<- function(d){ #recombination rate between two loci #d: distance between two loci (cM) haldane.inv(d/100) } div<- function(mpos,by=2){ #divide a genetic map mpos into small intervals #by: walk speed (cM) mid<- NULL chr<- NULL dist<- NULL N<- dim(mpos)[1] for(n in 1:(N-1)){ tmp1<- mpos$dist[n] tmp2<- mpos$dist[n+1] dff<- tmp2-tmp1 tmp<- tmp1 if(dff>0){ len0<- max(1,ceiling(dff/by)) len0<- dff/len0 while(tmp+len0/10 tol * max(pd) if (all(pd)){ inv<- xsvd$v %*% ((1/xsvd$d) * t(xsvd$u)) } else if (!any(pd)){ inv<- array(0, dim(xx)[2:1]) } else { inv<- xsvd$v[, pd] %*% ((1/xsvd$d[pd]) * t(xsvd$u[, pd])) } list(inv=inv,rank=sum(pd)) } ################################ # (linear) regression of y on x ################################ lr<- function(y,x){ #y: independent variable #x: a matrix, each column represents a predictor n<- length(y) xx<- as.matrix(x) xx<- cbind(1,xx) if(length(y)!=n) stop("wrong data...") xinv<- gv(t(xx)%*%xx) b<- xinv$inv %*% t(xx) %*% y fitted<- xx%*%b resid<- y-fitted rss<- sum(resid^2) r2<- 1-rss/(var(y)*(n-1)) loglik<- -(log(2*pi)+log(rss/n)+1)*n/2 list(coefficients=b, rank=xinv$rank, RSS=rss, R2=r2, loglik=loglik, fitted.values=fitted, residuals=resid, x=as.matrix(xx[,-1])) } ############################ # add one term: main effect ############################ madd1<- function(object,x){ #object: initial model #x: matrix, each column represents a predictor out<- object y<- out$fitted+out$residuals add<- c(0,0) x<- as.matrix(x) nx<- ncol(x) if(nx>0){ lik<- -Inf for(i in 1:nx){ xx<- cbind(object$x,x[,i]) o1<- lr(y,xx) if(o1$loglik>lik+1e-8){ out<- o1 lik<- o1$loglik add<- c(i,0) } } } if(add[1]){ xx<- cbind(object$x,x[,add[1]]) }else xx<- object$x list(coefficients=out$coefficients, rank=out$rank, RSS=out$RSS, R2=out$R2, loglik=out$loglik, add=add, fitted.values=out$fitted.values, residuals=out$residuals, x=xx) } ########################## # add one term: epistasis ########################## #object: initial model #x: matrix, each column represents a predictor madd2<- function(object,x){ out<- object y<- out$fitted+out$residuals add<- c(0,0) x<- as.matrix(x) nx<- ncol(x) if(nx>1){ lik<- -Inf for(i in 1:(nx-1)){ for(j in (i+1):nx){ xx<- cbind(object$x,x[,i]*x[,j]) o1<- lr(y,xx) if(o1$loglik>lik+1e-8){ out<- o1 lik<- o1$loglik add<- c(i,j) } } } } if(add[1]){ xx<- cbind(object$x,x[,add[1]]*x[,add[2]]) }else xx<- object$x list(coefficients=out$coefficients, rank=out$rank, RSS=out$RSS, R2=out$R2, loglik=out$loglik, add=add, fitted.values=out$fitted.values, residuals=out$residuals, x=xx) } ######################### # drop one term (if any) ######################### #object: initial model mdrop<- function(object){ out<- object y<- out$fitted+out$residuals x<- as.matrix(out$x) nx<- ncol(x) drop<- 0 if(nx>0){ lik<- -Inf for(i in 1:nx){ o1<- lr(y,x[,-i]) if(o1$loglik>lik+1e-8){ out<- o1 lik<- o1$loglik drop<- i } } } if(drop){ xx<- as.matrix(x[,-drop]) }else xx<- as.matrix(out$x) list(coefficients=out$coefficients, rank=out$rank, RSS=out$RSS, R2=out$R2, loglik=out$loglik, drop=drop, fitted.values=out$fitted.values, residuals=out$residuals, x=xx) } ############################ # model selection: stepwise ############################ fwsm<- function(mpos){ #calculate weights for single-marker regression #mpos: genetic map dd<- diff(mpos$dist) dd<- dd[dd>0] dd<- sum(dd)/length(dd) wa<- 1-0.9*exp(-10*dd/100+10*(dd/100)^2) we<- exp(-10.7*dd/100+8.7*(dd/100)^2) w<- c(wa,we); names(w)<- c("add","epi") w } fwim<- function(mpos){ #calculate weights for interval mapping #mpos: genetic map dd<- diff(mpos$dist) dd<- dd[dd>0] dd<- sum(dd)/length(dd) wa<- -0.15+3.1*sqrt(dd/100)-1.3*dd/100 we<- -0.53+5.4*sqrt(dd/100)-2.7*dd/100 w<- c(wa,we); names(w)<- c("add","epi") w } fmbic<- function(ob,neff,Nm,Ne,w=c(1,1),const=c(2.2,2.2)){ #calculate mBIC #ob: object #neff: neff[1]--number of main effects selected in ob, neff[2]--number of epistatic effects selected in ob #Nm: number of potential main effects #Ne: number of potential epistatic effects n<- length(ob$residuals) mbic<- n*log(ob$RSS) + (neff[1]+neff[2])*log(n)+ 2*neff[1]*log(w[1]*Nm/const[1]-1) + 2*neff[2]*log(w[2]*Ne/const[2]-1) mbic } # model selection using mBIC mstep<- function(trait,mdat,mpos,qtl.main=NULL,qtl.eps=NULL, direction = c("both", "backward","forward"),by=2,steps=1000,w=c(1,1),const=c(2.2,2.2)){ # mdat: marker data (AA-1, Aa-0) # mpos: genetic map (data.frame(id=,chr=,dist)) # qtl.main: QTL of main effects (data.frame(chr=,position=)) # qtl.eps: QTL of epistasis (data.frame(chr1=,position1=,chr2=,position2=)) # by: walk speed # steps: maximum steps # w: weights associated with main effects (w[1]) and epistasis (w[2]) respectively cat("\n") cat("***********************************************************\n") cat(" \aQTL mapping by modified BIC \n") cat("***********************************************************\n") cat("\n") if(sum(mdat!=0 & mdat!=1)>0) stop("marker data: 0 or 1 only...") if(!is.numeric(w) || !is.vector(w)) stop("w: wrong...") if(length(w)<2) w<- w[c(1,1)] yy<- as.numeric(trait); ny<- length(yy) dv<- div(mpos,by=by) xxx<- matrix(0,nrow=ny,ncol=nrow(dv)) cat("\aimputing data. please wait...") for(i in 1:ny){ for(j in 1:nrow(dv)){ id<- dv$mid[j] d1<- dv$dist[j]-mpos$dist[id] if(d1>0){ d2<- mpos$dist[id+1]-mpos$dist[id] r1<- rr(d1) r2<- rr(d2) r<- getprob(mdat[i,id],mdat[i,id+1],r1,r2) xxx[i,j]<- r-1/2 }else xxx[i,j]<- mdat[i,id]-1/2 } } cat("\adone\n") ins<- matrix(0,nrow=0,ncol=2) if(!is.null(qtl.main) && nrow(qtl.main)>0){ for(i in 1:nrow(qtl.main)){ ch<- dv$chr==qtl.main$chr[i] dst<- min(abs(dv$dist[ch]-qtl.main$position[i])) ch<- ch & abs(dv$dist-qtl.main$position[i])0){ for(i in 1:nrow(qtl.eps)){ ch1<- dv$chr==qtl.eps$chr1[i] dst1<- min(abs(dv$dist[ch1]-qtl.eps$position1[i])) ch1<- ch1 & abs(dv$dist-qtl.eps$position1[i])0) for(i in 1:nrow(ins)){ if(!ins[i,2]){ xx<- cbind(xx,xxx[,ins[i,1]]) }else xx<- cbind(xx,xxx[,ins[i,1]]*xxx[,ins[i,2]]) } Nm<- max(mpos$id); if(nrow(mpos)!=Nm) stop("mpos: may be wrong...") Ne<- Nm*(Nm-1)/2 nn<- ins[,2]==0 nn<- c(sum(nn),sum(!nn)) ob<- lr(yy,xx) mbic<- fmbic(ob,nn,Nm,Ne,w,const) direction <- match.arg(direction) backward <- direction == "both" || direction == "backward" forward <- direction == "both" || direction == "forward" cat("\astarting model selection...",date(),"\n") while(steps){ go<- TRUE if(backward && go){ ob1<- mdrop(ob) if(ob1$drop){ nn1<- nn if(!ins[ob1$drop,2]){ nn1[1]<- nn1[1]-1 }else nn1[2]<- nn1[2]-1 mbic1<- fmbic(ob1,nn1,Nm,Ne,w,const) if(mbic1 < mbic-1e-8){ ob<- ob1 mbic<- mbic1 nn<- nn1 ins<- ins[-ob$drop,] go<- FALSE cat("-") } } } if(forward && go){ ob2<- madd1(ob,xxx) if(ob2$add[1]){ nn2<- nn nn2[1]<- nn2[1]+1 mbic2<- fmbic(ob2,nn2,Nm,Ne,w,const) if(mbic2 < mbic-1e-8){ ob<- ob2 mbic<- mbic2 nn<- nn2 ins<- rbind(ins,ob2$add) go<- FALSE cat("+") } } if(go){ ob3<- madd2(ob,xxx) if(ob3$add[1]){ nn3<- nn nn3[2]<- nn3[2]+1 mbic3<- fmbic(ob3,nn3,Nm,Ne,w,const) if(mbic3 < mbic-1e-8){ ob<- ob3 mbic<- mbic3 nn<- nn3 ins<- rbind(ins,ob3$add) go<- FALSE cat("+") } } } } if(go) break steps<- steps-1 } cat("\a\n") cat("done...",date(),"\n") if(nrow(ins)>0){ ch<- ins[,2]==0 if(sum(ch)!=0 && sum(!ch)==0){ cat(sum(ch)," main effects\n\n") cat("***********************************************************\n") main<- list() main$effects<- ob$coefficients[-1][ch] main$QTL_locations<- data.frame(chr=dv$chr[ins[ch,1]],position=dv$dist[ins[ch,1]]) list(main=main,Rsquare=ob$R2) }else if(sum(ch)==0 && sum(!ch)!=0){ cat(sum(!ch), " epistatic effects\n\n") cat("***********************************************************\n") epistasis<- list() epistasis$effects<- ob$coefficients[-1][!ch] epistasis$QTL_locations<- data.frame(chr1=dv$chr[ins[!ch,1]],position1=dv$dist[ins[!ch,1]], chr2=dv$chr[ins[!ch,2]],position2=dv$dist[ins[!ch,2]]) list(epistasis=epistasis,Rsquare=ob$R2) }else if(sum(ch)!=0 && sum(!ch)!=0){ cat("\n",sum(ch)," main effects,", sum(!ch), " epistatic effects\n\n") cat("***********************************************************\n") main<- list() main$effects<- ob$coefficients[-1][ch] main$QTL_locations<- data.frame(chr=dv$chr[ins[ch,1]],position=dv$dist[ins[ch,1]]) epistasis<- list() epistasis$effects<- ob$coefficients[-1][!ch] epistasis$QTL_locations<- data.frame(chr1=dv$chr[ins[!ch,1]],position1=dv$dist[ins[!ch,1]], chr2=dv$chr[ins[!ch,2]],position2=dv$dist[ins[!ch,2]]) list(main=main,epistasis=epistasis,Rsquare=ob$R2) } }else{ cat("\nno QTL found\n\n") cat("***********************************************************\n") } } save.image("~/gosia/mbic.RData"); q("no") ########### # example # ##################################################################################### # genetic map associated with the Drosophila data of Zeng et al. (2000) mpos<- data.frame(id=1:45,chr=c(rep(1,6),rep(2,16),rep(3,23)), m=c(1:6,1:16,1:23), dist=c(cumsum(c(0, 3.60, 10.60, 9.20, 17.20, 18.70)), cumsum(c(0,6.98, 10.10, 4.94, 6.51, 6.19, 20.46, 12.78, 3.90, 4.55, 7.49, 30.02, 16.85, 4.34, 3.71, 7.03)), cumsum(c(0,4.99, 9.34, 6.97, 7.44, 14.46, 6.79, 3.55, 6.32, 11.86, 4.58, 6.85, 6.35, 11.79, 12.88, 9.15, 3.30, 7.98, 13.09, 10.04, 3.70, 9.79, 3.43)))) # backcross marker data simulated from the above map mdat<- read.table("http://www.stat.purdue.edu/~rcheng/gosia/markerData",header=F) # trait simulated (4 main effects, 2 epistatic effects) trait<- read.table("http://www.stat.purdue.edu/~rcheng/gosia/trait",header=F) trait<- as.vector(trait[,1]) ########################### # two step model selection ########################### # step 1: resulting in 4 main effects out<- mstep(trait,mdat,mpos,qtl.main=NULL,qtl.eps=NULL,direction = "both", by=2,steps=1000,w=fwim(mpos),const=c(2.2,2.2)) out # step 2: const=c(4,4), resulting in 4 main effects qtl.main<- out$main$QTL qtl.eps<- out$epistasis$QTL out1<- mstep(trait,mdat,mpos,qtl.main,qtl.eps,direction = "both", by=2,steps=1000,w=fwim(mpos),const=c(4,4)) out1 ##################################################################################### # the end # ###########