###############################################################
#
# This R code provides the analysis of the multilevel approach
# for one and two factors presented in the paper :
# "A novel approach for biomarker selection and the integration of 
#   repeated measures experiments from two assays" by
# B. liquet, K.A. Lecao, H. Hocini and R. Thiebaut
#
# The main funvtions of this code have been integrated
#    in the R package mixOmics
#
###############################################################


##############################################################
#### VAC 18 STUDY
###############################################################
install.packages("mixOmics") ## you have to choose a mirror to download this R package
require("mixOmics") ##load the package mixOmics

###############################################################
#
#### Function extract.clust
# Function enables to extract cluster of probes/genes 
# This function is used for extract the cluster of genes over expressed found by the heatmap
#
###############################################################
extract.clust <- function(mat,k,fig=FALSE,method="complete"){
m <- hclust(dist(mat),method=method) 
cluster <- cutree(m,k)
if(fig==TRUE) plot(m)
res <- vector("list", k)
for (i in 1:k){
nom <- names(cluster[which(cluster==i)])
res[[i]] <- nom
}
nom1 <- paste("cluster",1:k,sep="")
names(res) <- nom1
return(res)
}
#############################################################

load("VAC18.RData") # load the data for the VAC18 study


#############################################################
###First Analysis: one factor after Vaccination
#############################################################

data.1level <- data.1level.After


res.1level <- multilevel(data.1level$X, cond = data.1level$Y,sample=data.1level$sample,
ncomp=3,keepX=c(30,137,123),tab.prob.gene=data.1level$tab.prob.gene, method = 'splsda')

###Fig: component 1 versus component 2
col1 <- c("darkblue","purple","green4","red3")
col2 <- data.1level$Y
levels(col2) <- col1
col2 <- as.character(col2)

plotIndiv(res.1level,comp=c(1,2),col=col2,ind.names =FALSE,cex = 1, pch = 16) 
legend(x="bottomright", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16,
       title = "Stimulation")

###Fig: component 1 versus component 3
plotIndiv(res.1level,comp=c(1,3),col=col2,ind.names =FALSE,cex = 1, pch = 16) 
legend(x="topright", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16,
       title = "Stimulation")

###Fig: 3D plot 
plot3dIndiv(res.1level, ind.names = FALSE, col = col2, cex = 0.3, axes.box = "box") 


###heatmap of the gene selected by the multilevel approach
color_stimulation <- c("darkblue","purple","green4","red3")
color_sample <- c("lightgreen", "red","lightblue","darkorange","purple","maroon","blue","chocolate","turquoise","tomato1","pink2","aquamarine")

range.res <- range(res.1level$Xw)
breaks0 <- seq(range.res[1],0,length=50)
breaks <- unique(c(breaks0,seq(0,range.res[2],length=51)))


pheatmap.multilevel(res.1level,breaks=breaks, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=c("Subject","Stimulation"),fontsize=8,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,fontsize_col=2)



###Extract the cluster over expressed and zoom on the other genes
name.probe <- rownames(res.1level$loadings$X[unique(which(res.1level$loadings$X[,]!=0,arr.ind=TRUE)[,1]),1:res.1level$ncomp]) #name of probes selected
mat <- res.1level$Xw[,name.probe] #within matrix for the probes selected 
res.over.expressed <- extract.clust(t(mat),k=2,method="ward") #res.over.expressed$cluster1 probes over expressed
prob.over <- res.over.expressed$cluster1
order_sample <- c(7,10,5,8,6,9,2,3,4,1,11,14,21,18,19,13,12,17,16,15,20,31,22,27,30,24,28,23,25,26,29,32,38,40,34,39,42,33,41,37,35,36)
## we order the sample in the order than the first heatmap

range.res <- range(res.1level$Xw[,setdiff(name.probe,prob.over)])
breaks0 <- seq(range.res[1],0,length=50)
breaks <- unique(c(breaks0,seq(0,range.res[2],length=51)))

pheatmap.multilevel(res.1level, breaks=breaks,cluster=prob.over, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=c("Subject","Stimulation"),clustering_method="ward",cluster_cols = FALSE,show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,order_sample=order_sample,fontsize_col = 2)


#############################################################
#
###Second Analysis: two factors: Stimulation and Time
#
#############################################################

stimu.time <- data.frame(cbind(data.2level$STIMULATION,data.2level$TIME))

res.2level <- multilevel(data.2level$X, cond = stimu.time,sample=data.2level$sample,
ncomp=3,keepX=c(30,40,150),tab.prob.gene=data.2level$tab.prob.gene, method = 'splsda')


colW0  <- c("turquoise","pink4","lightgreen","orange") #Warning for the match of the color LIPO5 GAG+ GAG- NS
colW14 <- c("darkblue","purple","green4","red3")
col1 <- c(colW0,colW14)
col2 <- factor(data.2level$Y)
levels(col2) <- col1[c(3,2,7,6,1,5,4,8)]
col2 <- as.character(col2)

###Fig: component 1 versus component 2
plotIndiv(res.2level,comp=1:2,col=col2,ind.names =FALSE,cex = 1, pch = 16) 
legend("topleft", c( "LIPO5.W0", "GAG+.W0","GAG-.W0","NS.W0","LIPO5.W14", "GAG+.W14","GAG-.W14","NS.W14"), col = col1, pt.cex = 1, pch = 16,
       title = "Stimulation")

###Fig: component 1 versus component 3
plotIndiv(res.2level,comp=c(1,3),col=col2,ind.names =FALSE,cex = 1, pch = 16) 
legend("topright", c( "LIPO5.W0", "GAG+.W0","GAG-.W0","NS.W0","LIPO5.W14", "GAG+.W14","GAG-.W14","NS.W14"), col = col1, pt.cex = 1, pch = 16,
       title = "Stimulation")




###heatmap of the gene selected by the multilevel approach
color_stimulation <- col1[5:8]
color_time <- c("pink","lightblue1")
color_sample <- c("lightgreen", "red","lightblue","darkorange","purple","maroon","blue","chocolate","turquoise","tomato1","pink2","aquamarine")
label.stimu <- unique(data.2level$STIMULATION)
label.time <- unique(data.2level$TIME)

range.res <- range(res.2level$Xw)
breaks0 <- seq(range.res[1],0,length=50)
breaks <- unique(c(breaks0,seq(0,range.res[2],length=51)))


pheatmap.multilevel(res.2level,breaks=breaks, col_sample=color_sample, col_stimulation=color_stimulation,col_time=color_time, label_color_stimulation=label.stimu,label_color_time=label.time, label_annotation=NULL,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,fontsize=8)



###Extract the cluster over expressed and zoom on the other genes
name.probe <- rownames(res.2level$loadings$X[unique(which(res.2level$loadings$X[,]!=0,arr.ind=TRUE)[,1]),1:res.2level$ncomp]) #name of probes selected
mat <- res.2level$Xw[,name.probe] #within matrix for the probes selected 
res.over.expressed <- extract.clust(t(mat),k=2,method="ward") #res.over.expressed$cluster1 probes over expressed
prob.over <- res.over.expressed$cluster2
order_sample <- c(49,53,47,50,48,55,45,52,46,51,54,2,9,1,4,8,5,7,11,3,6,10,22,58,63,59,64,56,60,57,65,61,62,76,77,78,85,83,81,84,82,79,80,68,72,67,66,74,70,73,69,71,75,86,12,14,20,15,19,13,16,21,17,18,25,29,24,26,28,31,27,30,23,33,35,43,41,39,42,36,40,37,38,34,32,44)
## we order the sample in the order than the first heatmap

range.res <- range(res.2level$Xw[,setdiff(name.probe,prob.over)])
breaks0 <- seq(range.res[1],0,length=50)
breaks <- unique(c(breaks0,seq(0,range.res[2],length=51)))

dev.new()

pheatmap.multilevel(res.2level,breaks=breaks, cluster=prob.over,col_sample=color_sample, col_stimulation=color_stimulation,col_time=color_time, label_color_stimulation=label.stimu ,label_color_time=label.time,,cluster_cols = FALSE, label_annotation=NULL,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,order_sample=order_sample,fontsize_row=4,fontsize=8)



#############################################################
#
###Third Analysis: one factor before Vaccination
#
#############################################################

data.1level <- data.1level.Before

res.1level <- multilevel(data.1level$X, cond = data.1level$Y,sample=data.1level$sample,
ncomp=3,keepX=c(10,5,15),tab.prob.gene=data.1level$tab.prob.gene, method = 'splsda')

###Fig: component 1 versus component 2
col1 <- c("turquoise","pink4","lightgreen","orange") 
col2 <- data.1level$Y
levels(col2) <- col1
col2 <- as.character(col2)

plotIndiv(res.1level,comp=c(1,2),col=col2,ind.names =FALSE,cex = 1, pch = 16) 
legend(x="bottomleft", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16,
       title = "Stimulation")

###Fig: component 1 versus component 3
plotIndiv(res.1level,comp=c(1,3),col=col2,ind.names =FALSE,cex = 1, pch = 16) 
legend(x="bottomleft", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16,
       title = "Stimulation")

###Fig: 3D plot 
plot3dIndiv(res.1level, ind.names = FALSE, col = col2, cex = 0.3, axes.box = "box") 


###heatmap of the gene selected by the multilevel approach
color_stimulation <- c("turquoise","pink4","lightgreen","orange") 
color_sample <- c("lightgreen", "red","lightblue","darkorange","purple","maroon","blue","chocolate","turquoise","tomato1","pink2","aquamarine")

range.res <- range(res.1level$Xw)
breaks0 <- seq(range.res[1],0,length=50)
breaks <- unique(c(breaks0,seq(0,range.res[2],length=51)))
pheatmap.multilevel(res.1level,breaks=breaks, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=NULL,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,fontsize=8)



###Extract the cluster over expressed and zoom on the other genes
name.probe <- rownames(res.1level$loadings$X[unique(which(res.1level$loadings$X[,]!=0,arr.ind=TRUE)[,1]),1:res.1level$ncomp]) #name of probes selected
mat <- res.1level$Xw[,name.probe] #within matrix for the probes selected 
res.over.expressed <- extract.clust(t(mat),k=2,method="ward") #res.over.expressed$cluster1 probes over expressed
prob.over <- res.over.expressed$cluster2
order_sample <- c(11,1,6,9,8,5,7,10,2,3,4,12,14,21,13,16,17,18,19,15,20,23,33,31,28,29,24,26,27,25,30,40,38,43,39,41,34,35,37,42,22,36,32,44)
## we order the sample in the order than the first heatmap
range.res <- range(res.1level$Xw[,setdiff(name.probe,prob.over)])
breaks0 <- seq(range.res[1],0,length=50)
breaks <- unique(c(breaks0,seq(0,range.res[2],length=51)))

dev.new()
pheatmap.multilevel(res.1level, breaks=breaks,cluster=prob.over, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=c("Subject","Stimulation"),border=FALSE,clustering_method="ward",cluster_cols = FALSE,show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,order_sample=order_sample,fontsize = 8)




##########################################################################################################
#
### Method for choosing the number of genes in each component ############################################
##########################################################################################################
## we adopt a sequential method based on the loo (leave one out cross validation) to estimate 
## the miss-classification error rate
## we illustrate the use of the loo for the analysis on one factor after vaccination 
## corresponding to the data: data.1level.After
#
##########################################################################################################    

##########################################################################################################
#
### leave.one.out.seq function for estimating the the miss-classification error rate
leave.one.out.seq <- function(X,Y,ncomp=3,n=12,n.groupe=4,sample,keepX,keep.seq.brut=NULL,keep.seq.multilevel=NULL,method=method){
	
error.splsDAw <- rep(0,length(keepX))
error.splsDAbrut <- rep(0,length(keepX))

Y <- as.factor(Y)  
error.w <- rep(0,ncomp)


for (i in 1:length(keepX)) {
	print(i)	
	error.s <- 0
	error.sw <- 0


	for(j in unique(sample)){
#	print(j)

	sequence <- which(sample==j)
	trainX <- X[-sequence,] 
	trainY <- Y[-sequence]
	testY <- Y[sequence]
	xtest <- X[sequence,]
	xwtest <- X[sequence,]-matrix(colMeans(X[sequence,]),ncol=ncol(X),nrow=length(sequence),byrow=TRUE)
	samplew <- sample[-sequence]
        	 
	trainX.decomp <-  Split.variation.one.level(trainX,trainY,sample=samplew)
	trainxw <- trainX.decomp$Xw	

	#########prediction splsda brut
      result.s <- splsda(trainX, trainY ,keepX = c(keep.seq.brut,keepX[i]),ncomp = ncomp) 
	test.predict.s <- predict(result.s, xtest, method = method)
      Prediction.s <- levels(Y)[test.predict.s$class[[method]][, ncomp]]
      error.s <- error.s + sum(as.character(testY)!=Prediction.s)
	#########################	


      #########prediction splsda within
      result.sw <- splsda(trainxw, trainY ,keepX = c(keep.seq.multilevel,keepX[i]),ncomp = ncomp) 
	test.predict.sw <- predict(result.sw, xwtest, method = method)
      Prediction.sw <- levels(Y)[test.predict.sw$class[[method]][, ncomp]]
     	error.sw <- error.sw + sum(as.character(testY)!=Prediction.sw)
	########################
	
	}
	
	error.splsDAbrut[i] <- error.s/length(Y)
      error.splsDAw[i] <- error.sw/length(Y)
	
}

error <- cbind(error.splsDAbrut,error.splsDAw)
rownames(error) <- keepX
colnames(error) <- c("brut-spls","multilevel-spls")
res <- list(error=error)
}
##########################################################################################################"


###Illustration on the analysis on one factor after vaccination 

data.1level <- data.1level.After

##choice of the number of gene for the component 1
res1 <- leave.one.out.seq(X=data.1level$X,data.1level$Y,ncomp=1,n=12,n.groupe=4,sample=data.1level$sample,keepX=round(c(seq(5, 45, 5), seq(50, 2000, 25))/3)[1:3],keep.seq.brut=NULL,method="max.dist");
save(res1,file="res1.Rdata")

res2 <- leave.one.out.seq(X=data.1level$X,data.1level$Y,ncomp=2,n=12,n.groupe=4,sample=data.1level$sample,keepX=round(c(seq(5, 45, 5), seq(50, 2000, 25))/3)[1:3],keep.seq.brut=c(17),keep.seq.multilevel=c(30),method="max.dist");
save(res2,file="res2.Rdata")

res3 <- leave.one.out.seq(X=data.1level$X,data.1level$Y,ncomp=3,n=12,n.groupe=4,sample=data.1level$sample,keepX=round(c(seq(5, 45, 5), seq(50, 2000, 25))/3)[1:3],keep.seq.brut=c(17,70),L,keep.seq.multilevel=c(30,137),method="max.dist");
save(res3,file="res3.Rdata")

error.loo <- cbind(res1[,1],res2[,1],res3[,1],res1[,2],res2[,2],res3[,2])

###plot of the loo for the sPLS-DA on the orinigal data
matplot(error.loo[,1:3],type="l",axes = FALSE, xlab = 'number of selected genes',ylab = 'error rate', col = c("black", "red","blue"), lwd = c(1,1,1), lty = c(1,1,1))
axis(1, c(1:length(rownames(error.loo))), labels = rownames(error.loo))
axis(2,(0:14)/20)
legend("center", 0.45, lty = c(1,1,1), legend = c(
'Within sPLSDA: dim 1', 'Within sPLSDA: dim 1:2','Within sPLSDA: dim 1:3'),horiz = FALSE, cex = 0.9, col = c("black", "red","blue"), lwd = c(1,1,1))


###plot of the loo for the multilevel sPLS-DA 
matplot(error.loo[,4:6],type="l",axes = FALSE, xlab = 'number of selected genes',ylab = 'error rate', col = c("black", "red","blue"), lwd = c(1,1,1), lty = c(1,1,1))
axis(1, c(1:length(rownames(error.loo))), labels = rownames(error.loo))
axis(2,(0:14)/20)
legend("center", 0.45, lty = c(1,1,1), legend = c(
'Within sPLSDA: dim 1', 'Within sPLSDA: dim 1:2','Whithin sPLSDA: dim 1:3'),horiz = FALSE, cex = 0.9, col = c("black", "red","blue"), lwd = c(1,1,1))