############################################################### # # This R code provides the analysis of the multilevel approach # for one and two factors presented in the paper : # "A novel approach for biomarker selection and the integration of # repeated measures experiments from two assays" by # B. liquet, K.A. Lecao, H. Hocini and R. Thiebaut # # The main funvtions of this code have been integrated # in the R package mixOmics # ############################################################### ############################################################## #### VAC 18 STUDY ############################################################### install.packages("mixOmics") ## you have to choose a mirror to download this R package require("mixOmics") ##load the package mixOmics ############################################################### # #### Function extract.clust # Function enables to extract cluster of probes/genes # This function is used for extract the cluster of genes over expressed found by the heatmap # ############################################################### extract.clust <- function(mat,k,fig=FALSE,method="complete"){ m <- hclust(dist(mat),method=method) cluster <- cutree(m,k) if(fig==TRUE) plot(m) res <- vector("list", k) for (i in 1:k){ nom <- names(cluster[which(cluster==i)]) res[[i]] <- nom } nom1 <- paste("cluster",1:k,sep="") names(res) <- nom1 return(res) } ############################################################# load("VAC18.RData") # load the data for the VAC18 study ############################################################# ###First Analysis: one factor after Vaccination ############################################################# data.1level <- data.1level.After res.1level <- multilevel(data.1level$X, cond = data.1level$Y,sample=data.1level$sample, ncomp=3,keepX=c(30,137,123),tab.prob.gene=data.1level$tab.prob.gene, method = 'splsda') ###Fig: component 1 versus component 2 col1 <- c("darkblue","purple","green4","red3") col2 <- data.1level$Y levels(col2) <- col1 col2 <- as.character(col2) plotIndiv(res.1level,comp=c(1,2),col=col2,ind.names =FALSE,cex = 1, pch = 16) legend(x="bottomright", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16, title = "Stimulation") ###Fig: component 1 versus component 3 plotIndiv(res.1level,comp=c(1,3),col=col2,ind.names =FALSE,cex = 1, pch = 16) legend(x="topright", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16, title = "Stimulation") ###Fig: 3D plot plot3dIndiv(res.1level, ind.names = FALSE, col = col2, cex = 0.3, axes.box = "box") ###heatmap of the gene selected by the multilevel approach color_stimulation <- c("darkblue","purple","green4","red3") color_sample <- c("lightgreen", "red","lightblue","darkorange","purple","maroon","blue","chocolate","turquoise","tomato1","pink2","aquamarine") range.res <- range(res.1level$Xw) breaks0 <- seq(range.res[1],0,length=50) breaks <- unique(c(breaks0,seq(0,range.res[2],length=51))) pheatmap.multilevel(res.1level,breaks=breaks, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=c("Subject","Stimulation"),fontsize=8,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,fontsize_col=2) ###Extract the cluster over expressed and zoom on the other genes name.probe <- rownames(res.1level$loadings$X[unique(which(res.1level$loadings$X[,]!=0,arr.ind=TRUE)[,1]),1:res.1level$ncomp]) #name of probes selected mat <- res.1level$Xw[,name.probe] #within matrix for the probes selected res.over.expressed <- extract.clust(t(mat),k=2,method="ward") #res.over.expressed$cluster1 probes over expressed prob.over <- res.over.expressed$cluster1 order_sample <- c(7,10,5,8,6,9,2,3,4,1,11,14,21,18,19,13,12,17,16,15,20,31,22,27,30,24,28,23,25,26,29,32,38,40,34,39,42,33,41,37,35,36) ## we order the sample in the order than the first heatmap range.res <- range(res.1level$Xw[,setdiff(name.probe,prob.over)]) breaks0 <- seq(range.res[1],0,length=50) breaks <- unique(c(breaks0,seq(0,range.res[2],length=51))) pheatmap.multilevel(res.1level, breaks=breaks,cluster=prob.over, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=c("Subject","Stimulation"),clustering_method="ward",cluster_cols = FALSE,show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,order_sample=order_sample,fontsize_col = 2) ############################################################# # ###Second Analysis: two factors: Stimulation and Time # ############################################################# stimu.time <- data.frame(cbind(data.2level$STIMULATION,data.2level$TIME)) res.2level <- multilevel(data.2level$X, cond = stimu.time,sample=data.2level$sample, ncomp=3,keepX=c(30,40,150),tab.prob.gene=data.2level$tab.prob.gene, method = 'splsda') colW0 <- c("turquoise","pink4","lightgreen","orange") #Warning for the match of the color LIPO5 GAG+ GAG- NS colW14 <- c("darkblue","purple","green4","red3") col1 <- c(colW0,colW14) col2 <- factor(data.2level$Y) levels(col2) <- col1[c(3,2,7,6,1,5,4,8)] col2 <- as.character(col2) ###Fig: component 1 versus component 2 plotIndiv(res.2level,comp=1:2,col=col2,ind.names =FALSE,cex = 1, pch = 16) legend("topleft", c( "LIPO5.W0", "GAG+.W0","GAG-.W0","NS.W0","LIPO5.W14", "GAG+.W14","GAG-.W14","NS.W14"), col = col1, pt.cex = 1, pch = 16, title = "Stimulation") ###Fig: component 1 versus component 3 plotIndiv(res.2level,comp=c(1,3),col=col2,ind.names =FALSE,cex = 1, pch = 16) legend("topright", c( "LIPO5.W0", "GAG+.W0","GAG-.W0","NS.W0","LIPO5.W14", "GAG+.W14","GAG-.W14","NS.W14"), col = col1, pt.cex = 1, pch = 16, title = "Stimulation") ###heatmap of the gene selected by the multilevel approach color_stimulation <- col1[5:8] color_time <- c("pink","lightblue1") color_sample <- c("lightgreen", "red","lightblue","darkorange","purple","maroon","blue","chocolate","turquoise","tomato1","pink2","aquamarine") label.stimu <- unique(data.2level$STIMULATION) label.time <- unique(data.2level$TIME) range.res <- range(res.2level$Xw) breaks0 <- seq(range.res[1],0,length=50) breaks <- unique(c(breaks0,seq(0,range.res[2],length=51))) pheatmap.multilevel(res.2level,breaks=breaks, col_sample=color_sample, col_stimulation=color_stimulation,col_time=color_time, label_color_stimulation=label.stimu,label_color_time=label.time, label_annotation=NULL,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,fontsize=8) ###Extract the cluster over expressed and zoom on the other genes name.probe <- rownames(res.2level$loadings$X[unique(which(res.2level$loadings$X[,]!=0,arr.ind=TRUE)[,1]),1:res.2level$ncomp]) #name of probes selected mat <- res.2level$Xw[,name.probe] #within matrix for the probes selected res.over.expressed <- extract.clust(t(mat),k=2,method="ward") #res.over.expressed$cluster1 probes over expressed prob.over <- res.over.expressed$cluster2 order_sample <- c(49,53,47,50,48,55,45,52,46,51,54,2,9,1,4,8,5,7,11,3,6,10,22,58,63,59,64,56,60,57,65,61,62,76,77,78,85,83,81,84,82,79,80,68,72,67,66,74,70,73,69,71,75,86,12,14,20,15,19,13,16,21,17,18,25,29,24,26,28,31,27,30,23,33,35,43,41,39,42,36,40,37,38,34,32,44) ## we order the sample in the order than the first heatmap range.res <- range(res.2level$Xw[,setdiff(name.probe,prob.over)]) breaks0 <- seq(range.res[1],0,length=50) breaks <- unique(c(breaks0,seq(0,range.res[2],length=51))) dev.new() pheatmap.multilevel(res.2level,breaks=breaks, cluster=prob.over,col_sample=color_sample, col_stimulation=color_stimulation,col_time=color_time, label_color_stimulation=label.stimu ,label_color_time=label.time,,cluster_cols = FALSE, label_annotation=NULL,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,order_sample=order_sample,fontsize_row=4,fontsize=8) ############################################################# # ###Third Analysis: one factor before Vaccination # ############################################################# data.1level <- data.1level.Before res.1level <- multilevel(data.1level$X, cond = data.1level$Y,sample=data.1level$sample, ncomp=3,keepX=c(10,5,15),tab.prob.gene=data.1level$tab.prob.gene, method = 'splsda') ###Fig: component 1 versus component 2 col1 <- c("turquoise","pink4","lightgreen","orange") col2 <- data.1level$Y levels(col2) <- col1 col2 <- as.character(col2) plotIndiv(res.1level,comp=c(1,2),col=col2,ind.names =FALSE,cex = 1, pch = 16) legend(x="bottomleft", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16, title = "Stimulation") ###Fig: component 1 versus component 3 plotIndiv(res.1level,comp=c(1,3),col=col2,ind.names =FALSE,cex = 1, pch = 16) legend(x="bottomleft", c( "LIPO5", "GAG+","GAG-","NS"), col = col1, pt.cex = 1, pch = 16, title = "Stimulation") ###Fig: 3D plot plot3dIndiv(res.1level, ind.names = FALSE, col = col2, cex = 0.3, axes.box = "box") ###heatmap of the gene selected by the multilevel approach color_stimulation <- c("turquoise","pink4","lightgreen","orange") color_sample <- c("lightgreen", "red","lightblue","darkorange","purple","maroon","blue","chocolate","turquoise","tomato1","pink2","aquamarine") range.res <- range(res.1level$Xw) breaks0 <- seq(range.res[1],0,length=50) breaks <- unique(c(breaks0,seq(0,range.res[2],length=51))) pheatmap.multilevel(res.1level,breaks=breaks, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=NULL,border=FALSE,clustering_method="ward",show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,fontsize=8) ###Extract the cluster over expressed and zoom on the other genes name.probe <- rownames(res.1level$loadings$X[unique(which(res.1level$loadings$X[,]!=0,arr.ind=TRUE)[,1]),1:res.1level$ncomp]) #name of probes selected mat <- res.1level$Xw[,name.probe] #within matrix for the probes selected res.over.expressed <- extract.clust(t(mat),k=2,method="ward") #res.over.expressed$cluster1 probes over expressed prob.over <- res.over.expressed$cluster2 order_sample <- c(11,1,6,9,8,5,7,10,2,3,4,12,14,21,13,16,17,18,19,15,20,23,33,31,28,29,24,26,27,25,30,40,38,43,39,41,34,35,37,42,22,36,32,44) ## we order the sample in the order than the first heatmap range.res <- range(res.1level$Xw[,setdiff(name.probe,prob.over)]) breaks0 <- seq(range.res[1],0,length=50) breaks <- unique(c(breaks0,seq(0,range.res[2],length=51))) dev.new() pheatmap.multilevel(res.1level, breaks=breaks,cluster=prob.over, col_sample=color_sample, col_stimulation=color_stimulation, label_annotation=c("Subject","Stimulation"),border=FALSE,clustering_method="ward",cluster_cols = FALSE,show_colnames = FALSE,show_rownames = TRUE,fontsize_row=4,order_sample=order_sample,fontsize = 8) ########################################################################################################## # ### Method for choosing the number of genes in each component ############################################ ########################################################################################################## ## we adopt a sequential method based on the loo (leave one out cross validation) to estimate ## the miss-classification error rate ## we illustrate the use of the loo for the analysis on one factor after vaccination ## corresponding to the data: data.1level.After # ########################################################################################################## ########################################################################################################## # ### leave.one.out.seq function for estimating the the miss-classification error rate leave.one.out.seq <- function(X,Y,ncomp=3,n=12,n.groupe=4,sample,keepX,keep.seq.brut=NULL,keep.seq.multilevel=NULL,method=method){ error.splsDAw <- rep(0,length(keepX)) error.splsDAbrut <- rep(0,length(keepX)) Y <- as.factor(Y) error.w <- rep(0,ncomp) for (i in 1:length(keepX)) { print(i) error.s <- 0 error.sw <- 0 for(j in unique(sample)){ # print(j) sequence <- which(sample==j) trainX <- X[-sequence,] trainY <- Y[-sequence] testY <- Y[sequence] xtest <- X[sequence,] xwtest <- X[sequence,]-matrix(colMeans(X[sequence,]),ncol=ncol(X),nrow=length(sequence),byrow=TRUE) samplew <- sample[-sequence] trainX.decomp <- Split.variation.one.level(trainX,trainY,sample=samplew) trainxw <- trainX.decomp$Xw #########prediction splsda brut result.s <- splsda(trainX, trainY ,keepX = c(keep.seq.brut,keepX[i]),ncomp = ncomp) test.predict.s <- predict(result.s, xtest, method = method) Prediction.s <- levels(Y)[test.predict.s$class[[method]][, ncomp]] error.s <- error.s + sum(as.character(testY)!=Prediction.s) ######################### #########prediction splsda within result.sw <- splsda(trainxw, trainY ,keepX = c(keep.seq.multilevel,keepX[i]),ncomp = ncomp) test.predict.sw <- predict(result.sw, xwtest, method = method) Prediction.sw <- levels(Y)[test.predict.sw$class[[method]][, ncomp]] error.sw <- error.sw + sum(as.character(testY)!=Prediction.sw) ######################## } error.splsDAbrut[i] <- error.s/length(Y) error.splsDAw[i] <- error.sw/length(Y) } error <- cbind(error.splsDAbrut,error.splsDAw) rownames(error) <- keepX colnames(error) <- c("brut-spls","multilevel-spls") res <- list(error=error) } ##########################################################################################################" ###Illustration on the analysis on one factor after vaccination data.1level <- data.1level.After ##choice of the number of gene for the component 1 res1 <- leave.one.out.seq(X=data.1level$X,data.1level$Y,ncomp=1,n=12,n.groupe=4,sample=data.1level$sample,keepX=round(c(seq(5, 45, 5), seq(50, 2000, 25))/3)[1:3],keep.seq.brut=NULL,method="max.dist"); save(res1,file="res1.Rdata") res2 <- leave.one.out.seq(X=data.1level$X,data.1level$Y,ncomp=2,n=12,n.groupe=4,sample=data.1level$sample,keepX=round(c(seq(5, 45, 5), seq(50, 2000, 25))/3)[1:3],keep.seq.brut=c(17),keep.seq.multilevel=c(30),method="max.dist"); save(res2,file="res2.Rdata") res3 <- leave.one.out.seq(X=data.1level$X,data.1level$Y,ncomp=3,n=12,n.groupe=4,sample=data.1level$sample,keepX=round(c(seq(5, 45, 5), seq(50, 2000, 25))/3)[1:3],keep.seq.brut=c(17,70),L,keep.seq.multilevel=c(30,137),method="max.dist"); save(res3,file="res3.Rdata") error.loo <- cbind(res1[,1],res2[,1],res3[,1],res1[,2],res2[,2],res3[,2]) ###plot of the loo for the sPLS-DA on the orinigal data matplot(error.loo[,1:3],type="l",axes = FALSE, xlab = 'number of selected genes',ylab = 'error rate', col = c("black", "red","blue"), lwd = c(1,1,1), lty = c(1,1,1)) axis(1, c(1:length(rownames(error.loo))), labels = rownames(error.loo)) axis(2,(0:14)/20) legend("center", 0.45, lty = c(1,1,1), legend = c( 'Within sPLSDA: dim 1', 'Within sPLSDA: dim 1:2','Within sPLSDA: dim 1:3'),horiz = FALSE, cex = 0.9, col = c("black", "red","blue"), lwd = c(1,1,1)) ###plot of the loo for the multilevel sPLS-DA matplot(error.loo[,4:6],type="l",axes = FALSE, xlab = 'number of selected genes',ylab = 'error rate', col = c("black", "red","blue"), lwd = c(1,1,1), lty = c(1,1,1)) axis(1, c(1:length(rownames(error.loo))), labels = rownames(error.loo)) axis(2,(0:14)/20) legend("center", 0.45, lty = c(1,1,1), legend = c( 'Within sPLSDA: dim 1', 'Within sPLSDA: dim 1:2','Whithin sPLSDA: dim 1:3'),horiz = FALSE, cex = 0.9, col = c("black", "red","blue"), lwd = c(1,1,1))