############################################################### ## Copyright 2000-2005 ## ## National Center for Genome Resources ## ## George Mason University ## ## All rights reserved ## ## permcluster.2005.r, written by Karen Schlauch ## ############################################################### ################################################################################################ ## Main program call ## ## This program does a breadth-first search of the clustering tree generated by Murtagh's ## ## hclust procedure to store the clusters for use of the permutation routine called in index. ## ## INPUT: data.txt is the user's tab-delimited "txt" data file ## ## dist.metric is the user-defined distance method ## ## stop.level is the user-defined level at which the clustering should stop ## ## OUTPUT: Two files written to the user directory in "txt" format ## ## as "User.Data" and "Graph.Data". ## ################################################################################################ perm.program<-function(data.txt,log.flag,dist.metric,agglom.method,stop.level,num.perms,thresh) {options(digits=10) tmp<-scan.data(data.txt,log.flag) if(log.flag) data.file<-tmp$LogData if(!log.flag) data.file<-tmp$Data gene.names<-tmp$gene.names rm(tmp) ##filter out all genes with only missing values## na.genes<-NULL for(i in 1:dim(data.file)[1]) {x<-data.file[i,-1] if(length(x[!is.na(x)]) == 0) na.genes<-c(na.genes,i)} if(length(na.genes)>0) {print("these genes have only missing values, and will be deleted") print(gene.names[na.genes]) if(dim(data.file)[1]-length(na.genes)>1) {data.file<-data.file[-na.genes,] gene.names<-gene.names[-na.genes]} if(dim(data.file)[1]-length(na.genes)<=1) {print("your data file has only one or no genes with no missing values") print("there is no data on which to run clustering") pause()}} index(data.file,gene.names,dist.metric,agglom.method,stop.level,num.perms,thresh)} ############################################################################################### ## The scan.data function reads in tab-delimited data.txt file, which is in the form: ## ## ## ## genename ratio1 ratio2 ratio3 ratio4 .... ## ## gene1 .1 .2 .3 .4 ## ## ## ## Only the FIRST row should contain column labels or other text. The program ## ## thus skips only the FIRST row, regardless of what it contains. To change this, ## ## simply change the "skip" variable which is now set to 1. ## ## INPUT: data.txt is the user's tab-delimited "txt" data file ## ## OUTPUT: Data: the data file in matrix form with the gene name replaced by gene number, ## ## where the gene number is the position of the gene in the data file ## ## num.exp: the number of experiments (or replicates) of the data file ## ## num.genes: the number of genes in the data file ## ## gene.names: a vector of the gene names in the data file ## ############################################################################################### scan.data<-function(data.txt,log.flag) {num.exp<-length(scan(data.txt,sep="\t",what="",skip=2,nlines=1))-1 header<-scan(data.txt,what="",nlines=1) if(length(header)==num.exp) exp.names<-header if(length(header)==num.exp+1) exp.names<-header[-1] Data<-matrix(scan(data.txt,sep="\t",what="",skip=1),ncol=num.exp+1,byrow=T) gene.names<-Data[,1] num.genes<-length(Data[,1]) Data<-Data[,2:(num.exp+1)] if(log.flag) {LogData<-matrix(as.numeric(Data),ncol=num.exp,byrow=F) LogData<-log2(LogData) LogData<-cbind(c(1:num.genes),LogData) LogData<-matrix(as.numeric(LogData),ncol=num.exp+1,byrow=F)} Data<-cbind(c(1:num.genes),Data) Data<-matrix(as.numeric(Data),ncol=num.exp+1,byrow=F) if(log.flag) return(Data,LogData,num.exp,num.genes,gene.names) if(!log.flag) return(Data,num.exp,num.genes,gene.names)} ################################################################################################ ## The get.clust.list function first calls the hclust function to generate the ## ## clustering info and clustering tree. ## ## The clustering info given by the hclust function is parsed. ## ## The first loop counts the total number of clusters, the second loop stores ## ## necessary information to build the View.Matrix in the following function. ## ## (See details below in comments) ## ## INPUT: data.file: the data file output of scan.data ## ## dist.metric: the user-defined distance method ## ## "euc": euclidean distance, in which NA's are excluded and no scaling occurs ## ## "cor": correlation coefficient ## ## "cos": cosine of the angle between two gene profile vectors ## ## "pcos": Munneke's penalized cosine distance metric ## ## OUTPUT: View.Matrix: a somewhat empty shell of View.Matrix to be completed ## ## in the index function here it contains the gene ## ## expression profiles and gene numbers of each cluster ## ## and the cluster dimension ## ## I.list: a list storing indeces of the clusters of View.Matrix,height,dimension ## ## m: the merge matrix produced by the hclust function ## ## d: the number of rows of the merge matrix ## ## num.exp: the number of experiments ## ## dim.sum: the sum of the dimensions of all clusters ## ## Left: the Left child of the cluster being considered ## ## Right: the right child of the cluster being considered ## ## Orig: the cluster being considered ## ## num.clusters:the number of total clusters considered ## ################################################################################################# get.clust.list<- function(data.file,gene.names,dist.metric,agglom.method) {library(mva) num.exp<-dim(data.file)[2]-1 tmp<-other.dist.metrics(data.file,gene.names[data.file[,1]],dist.metric,agglom.method) dist.matrix<-tmp$A del.genes<-tmp$del.DM.index if(length(del.genes)>0) data.file<-data.file[-del.genes,] rm(tmp) clust.info <- hclust(dist.matrix,agglom.method) m <- clust.info$merge h <- clust.info$height d <- dim(m)[1] D<-vector(mode="numeric",length=d) both<-0 either<-0 neither<-0 ################################################################################################### ## The loop below counts the total number clusters appearing in the clustering tree as follows. ## ## The merge matrix m generated by hclust consists of d rows. ## ## Each row i represents a parent cluster; each of the two column values of row i ## ## represents one of the two child clusters of cluster i. Therefore, the number of ## ## total clusters *should be* 2*d. There are two obstructions to this deduction. ## ## First, clusters of dimension two (and thus having singletons as both child clusters) ## ## are counted twice: once as a whole cluster and once as two singleton clusters. ## ## As we are not clustering any clusters of dimension two, it is enough to count these ## ## clusters once, as an entire cluster of dimension 2. ## ## Secondly, the original cluster (consisting of the entire data ## ## set) is not counted. Thus the total number of clusters is 2*(d-both)+1, where both ## ## is the number of clusters consisting of two singleton subclusters. ## ## D is the vector which counts the dimension of each cluster. ## ## dim.sum is the sum of dimensions of all clusters we consider: ## ## D is the dimension of all clusters but the original, ## ## d+1 is the dimension of the original, ## ## 2*both represents all pairs of singleton subclusters counted twice. ## ################################################################################################### for(i in 1:d) {if(m[i,1]<0 && m[i,2]<0) {both<-both+1 D[i]<-2} if(m[i,1]<0 && m[i,2]>0) {either<-either +1 D[i]<-1+D[m[i,2]]} if(m[i,1]>0 && m[i,2]>0) {neither<-neither +1 D[i]<-D[m[i,1]]+D[m[i,2]]}} num.clusters<-2*(d-both)+1 dim.sum<-sum(D) rm(D) V.dimnames<-list(NULL,c("CLUSTER","NAME","DIM","GeneNum",c(1:num.exp))) View.Matrix<-matrix(nrow=dim.sum,ncol=4+num.exp,dimnames=V.dimnames) I.list<-vector(mode="list",length=d) a<-dim.sum+1 ################################################################################# ## The loop below determines which genes belong to each cluster by unraveling ## ## the hclust merge matrix m. View.Matrix is being set up to store the ## ## cluster name, gene number of each gene within the cluster and the dimension ## ## of the cluster. Genes are identified by their numerical placement within ## ## the data file. ## ## I.list[[i]][1] stores the starting row number of View.Matrix for cluster i ## ## I.list[[i]][2] stores the ending row number of View.Matrix for cluster i ## ## I.list[[i]][3] stores the dimension of cluster i ## ## I.list[[i]][4] stores the height of cluster i ## ################################################################################# for(i in 1:d) {I.list[[i]]<-vector(mode="numeric",length=4) if(m[i,1]<0 && m[i,2]<0) {tmp<-rbind(data.file[abs(m[i,1]),],data.file[abs(m[i,2]),])} if(m[i,1]<0 && m[i,2]>0) {tmp<-rbind(data.file[abs(m[i,1]),], View.Matrix[I.list[[m[i,2]]][1]:I.list[[m[i,2]]][2],4:(4+num.exp)])} if(m[i,1]>0 && m[i,2]>0) {tmp<-rbind(View.Matrix[I.list[[m[i,1]]][1]:I.list[[m[i,1]]][2],4:(4+num.exp)], View.Matrix[I.list[[m[i,2]]][1]:I.list[[m[i,2]]][2],4:(4+num.exp)])} I.list[[i]][3]<-length(tmp)/(num.exp+1) b<-a-1 a<-b-I.list[[i]][3]+1 I.list[[i]][1]<-a I.list[[i]][2]<-b I.list[[i]][4]<-h[i] View.Matrix[a:b,3]<-rep(I.list[[i]][3],I.list[[i]][3]) ## Dim ## View.Matrix[a:b,4:(4+num.exp)]<-tmp} ## Cluster with expression profiles ## View.Matrix[1:(d+1),4:(4+num.exp)]<-data.file Orig <-View.Matrix[I.list[[d]][1]:I.list[[d]][2],4:(4+num.exp)] if(m[d,1]<0) Left <-data.file[abs(m[d,1]),] if(m[d,2]<0) Right <-data.file[abs(m[d,2]),] if(m[d,1]>0) Left <-View.Matrix[I.list[[m[d,1]]][1]:I.list[[m[d,1]]][2],4:(4+num.exp)] if(m[d,2]>0) Right <-View.Matrix[I.list[[m[d,2]]][1]:I.list[[m[d,2]]][2],4:(4+num.exp)] ##Add extra row for each singleton cluster to the end of View.Matrix## tmp<-matrix(nrow=dim.sum+either,ncol=4+num.exp,dimnames=V.dimnames) tmp[1:dim.sum,]<-View.Matrix View.Matrix<-tmp rm(tmp) return(View.Matrix,I.list,m,d,h,num.exp,dim.sum,Left,Right,Orig,num.clusters)} ########################################################################################################## ## This function sets up the rest of View.Matrix, which stores and returns the CLUSTERLABEL, ## ## GENENUM, and CLUSTERDIMENSION. View.Matrix stores the clusters in the order generated ## ## by traversing the clustering tree from top down, one level at a time, left to right. ## ## This traversal is accomplished by setting up and using v.stack, which acts as a top-down ## ## breadth-first searching stack. The columns of v.stack contain the level of the clustering ## ## tree, the node number, the number of the parent of that node, the cluster name associated ## ## to the node, the cluster number (with respect to the merge matrix m found in the function ## ## get.clust.list, the dimension of the cluster, the height of the cluster, and how many times ## ## the cluster has been visited. [PLEASE NOTE: Cluster Name corresponds to the variables ## ## z and k, and an old numbering system I use to check whether the clustering is correct. ## ## These can be ignored.] ## ## The algorithm works as follows. Beginning with the top cluster (entire data set) of the tree, ## ## each cluster is visited. If the cluster has dimension larger than two, its two subclusters ## ## will be determined first. Then the left child cluster will be visited first. If it has ## ## dimension greater than two, we continue to follow the chain of left subclusters until ## ## a subcluster has dimension two or less. At this point, we begin the traversal up to the ## ## parent node of the cluster just visited, then to its right subcluster. If the dimension of ## ## this right subcluster is more than two, we begin to traverse the chain of left subclusters ## ## until we reach one with dimension less than or equal to two. ETC. ## ## A node can only be visited twice. ## ## INPUT: data.file, gene.names generated by scan.data ## ## dist.metric, stop.level are user inputs ## ## num.perms: the number of permutations defined by the user ## ## thresh: the confidence threshold defined by the user as a percentage ## ## OUTPUT: View.Matrix, User.Matrix, which are written out in txt file format as Graph.Data, User.Data ## ########################################################################################################## index<-function(data.file,gene.names,dist.metric,agglom.method,stop.level,num.perms,thresh) {if(stop.level=="") stop.level<-"ALL" tmp<-get.clust.list(data.file,gene.names,dist.metric,agglom.method) dev.print(file="Orig.Clustering.pdf",dev=pdf) I.list<-tmp$I.list d<-tmp$d m<-tmp$m h<-tmp$h dim.sum<-tmp$dim.sum if(dim.sum>5000000) {print("POSSIBLE MEMORY CONSTRAINTS!!") print("Choose a different Agglomerative Method,") print("Restrict to a smaller Stop Level,") print("or restrict to less Permutations")} num.exp<-tmp$num.exp num.clusters<-tmp$num.clusters View.Matrix<-tmp$View.Matrix View.Matrix[1:(d+1),1]<-1 View.Matrix[1:(d+1),2]<-1 View.Matrix[1:(d+1),3]<-d+1 rm(tmp) clust.count<-0 ## The tstat.matrix contains the name of the cluster being tested, ## the sequence of tstats associated with that cluster's test, ## whether each permutation resulted in a singleton splitting (1 for singleton, 0 else), ## and the gene number of the singleton tstat.matrix<-matrix(nrow=3*num.clusters,ncol=num.perms+3) v.dimnames<-list(NULL,c("LEVEL","NODE","PARENT","CLUSTER","NAME","Cl#","DIM","HEIGHT","Visit")) v.stack<-matrix(nrow=num.clusters+1,ncol=9,dimnames=v.dimnames) v.stack[,9]<-rep(-1,num.clusters+1) v.stack[1,]<-c(0,1,0,1,1,d,d+1,I.list[[d]][4],1) pos<-1 ## stack position l<-0 ## level of stack position, set to 0 at beginning tmp<-data.file ret.pos<-1 ## the parent node of the node being visited v.stack[2,1]<-0 single.LABEL <-vector(mode="character",length=dim(View.Matrix)[1]) view.single.LABEL<-vector(mode="character",length=dim(View.Matrix)[1]) single.ct<-0 ## counting the number of singleton clusters visited b<-0 while(v.stack[ret.pos,9]==1 && l < stop.level) {while(length(tmp)>2*(num.exp+1)) ## while the cluster has dimension greater than 2 {v.stack[ret.pos,9]<-2 fix.pos<-pos ## fix the position of the cluster now being visited for(k in 1:2) ## considering the cluster's two subclusters {if(fix.pos==1) ## special case of the first cluster (entire data file) {fix.T<-d ## the row of merge matrix m of the cluster z<-2 ## old numbering scheme: the children get prefix 2 l<-1} ## moving down one level if(fix.pos !=1) ## all other successive cases {fix.T<-v.stack[ret.pos,6] ## the row of merge matrix m of the cluster now being considered ## archaic numbering scheme: formula of the children's prefix z<-2*(v.stack[ret.pos,4])-(2-v.stack[ret.pos,5]) l<-v.stack[ret.pos,1]+1} ## level of the children is one more than the parent T<-m[fix.T,k] ## T is the cluster number of the children wrt merge matrix m if(T<0) ## if child cluster is a singleton {single.ct<-single.ct+1 clust<-data.file[abs(T),] ## the singleton contains only this gene number Dim<-1## Dim=dimension of the cluster height<-v.stack[ret.pos,8]## height of singleton clusters are equal to that of parent cluster single<-gene.names[abs(T)]## name of the gene a<-dim.sum+single.ct ## beginning row for this cluster in View.Matrix b<-a ## ending row for this cluster in View.Matrix View.Matrix[a:b,3]<-1## setting View.Matrix CLUSTERDIM View.Matrix[a:b,4:(num.exp+4)]<-clust} ## setting up View.Matrix cluster info if(T>0) ##if child cluster is not a singleton {clust <- View.Matrix[I.list[[T]][1]:I.list[[T]][2],4:(num.exp+4)] ## begin and end row of View.Matrix of this cluster Dim <- I.list[[T]][3] height <- I.list[[T]][4] a<-I.list[[T]][1] b<-I.list[[T]][2] if(Dim==2) single<-paste(gene.names[abs(m[T,1])],gene.names[abs(m[T,2])]) if(Dim>2) single<-""} ## label is empty when clusters contain more than 2 genes pos<-fix.pos+k ## the children clusters are stored one and two rows down from their parent cluster v.stack[pos,1]<-l v.stack[pos,2]<-pos v.stack[pos,3]<-v.stack[ret.pos,2] v.stack[pos,4]<-z v.stack[pos,5]<-k v.stack[pos,6]<-T v.stack[pos,7]<-Dim v.stack[pos,8]<-height v.stack[pos,9]<-1 View.Matrix[a:b,1]<-rep(z,Dim) ## Archaic numbering scheme prefix View.Matrix[a:b,2]<-rep(k,Dim) ## Archaic numbering scheme prefix View.Matrix[a:b,3]<-rep(Dim,Dim) ## Cluster Dim single.LABEL[a:b]<-rep(single,Dim)} ## Singleton/Doubleton LABEL ## End of loop for(k in 1:2) -- done with pos's subclusters ## ret.pos<-pos ## set ret.pos to the second subcluster just examined while(ret.pos>1 && v.stack[(ret.pos),9]==1) ## traverse up the stack to find the first node (from the top) {ret.pos<-ret.pos-1} ## whose subclusters have not yet been examined ret.pos<-ret.pos+1 tmpT<-v.stack[ret.pos,6] ## tmpT is the cluster number of the cluster we next consider if(tmpT<0) tmp<-data.file[abs(tmpT),] ## the cluster is a singleton if(tmpT>0) tmp<-View.Matrix[I.list[[tmpT]][1]:I.list[[tmpT]][2],4:(num.exp+4)]} ## the cluster is not a singleton ## ## End (while |tmp|>2) ## v.stack[ret.pos,9]<-2 ## the cluster now is a singleton or doubleton ret.pos<-ret.pos+1 ## go to next cluster on stack tmpT<-v.stack[ret.pos,6] if(v.stack[ret.pos,9]==-1) tmp<-0 ## if the cluster hasn't been visited by now, it's the last (stop) line on v.stack if(v.stack[ret.pos,9]!=-1) ## if the cluster has been visited, continue loop {if(tmpT<0) tmp<-data.file[abs(tmpT),] if(tmpT>0) tmp<-View.Matrix[I.list[[tmpT]][1]:I.list[[tmpT]][2],4:(num.exp+4)]}} ##no more clstrs on stack ## Once we exit this loop there are no more clusters on stack rm(I.list) v.stack<-v.stack[!is.na(v.stack[,1]),] ## delete dummy (stop) row at bottom ## ## !is.na(View.Matrix[,1])] is the first subclustering which is listed twice but only counted once ## We are deleting the second listing here single.LABEL<-single.LABEL[!is.na(View.Matrix[,1])] View.Matrix<-View.Matrix[!is.na(View.Matrix[,1]),] o<-order(View.Matrix[,1],View.Matrix[,2]) View.Matrix<-View.Matrix[o,] ## order View.Matrix wrt archaic numbering scheme to put in order GRAPH.LABEL<-vector(mode="character") ## later, this will be View.Matrix[,2] single.LABEL<-single.LABEL[o] t<-table(v.stack[,1]) ## make sure that the user-defined stop level is less than the number of actual levels of the tree if(stop.level != "ALL") {adjusted.stop.level<-min(stop.level,length(t)-1) t<-t[1:(adjusted.stop.level+1)] stack.sum<-sum(t) v.stack<-v.stack[1:stack.sum,]} k<-0 b<-0 ## In this loop labels for clusters are created as "#,#", where the first number indicates the ## level in the clustering tree, and the second # represents this cluster's position in the level. for(i in 1:length(t)) {for(j in 1:t[i]) {k<-k+1 a<-b+1 b<-a+v.stack[k,7]-1 view.single.LABEL[a:b]<-rep(paste(i-1,j,"",sep=","),v.stack[k,7]) GRAPH.LABEL[a:b]<-rep(paste(i-1,j,single.LABEL[a],sep=","),v.stack[k,7]) }} View.Matrix<-View.Matrix[1:b,] view.single.LABEL<-view.single.LABEL[1:b] GRAPH.LABEL[1:(d+1)]<-rep(paste(0,0,"ORIG"),d+1) last.level<-v.stack[dim(v.stack)[1],1] print(paste("The Clustering Tree has ",last.level, " levels")) acc.vec<-c(rep(1,d+1),rep(-1,dim(View.Matrix)[1]-(d+1))) ## this is the ACC column of View.Matrix ## it is initialized to (-1,-1,...,-1) View.Matrix<-cbind(acc.vec,View.Matrix) v.stack<-cbind(c(1,rep(0,dim(v.stack)[1]-1)),v.stack) ## the first column of v.stack is also associated ## to the "accept" state. it is initialized to (1,1,....,1) ## v.dimnames<-list(NULL,c("ACC","LEVEL","NODE","PARENT","CLUSTER","NAME","Cl#","DIM","HEIGHT","Visit")) ## col names of v.stack ## V.dimnames<-list(NULL,c("ACC","CLUSTER","NAME","DIM","GeneNum",c(1:num.exp))) ## column names of View.Matrix ## In the following loop, v.stack is traversed from top to bottom. ## Note: v.stack[i,1]==1 means the cluster has been perused ## View.Matrix[i,1]==1 means the cluster's subclusters have been accepted b<-0 for(i in 1:length(v.stack[,8])) {a<-b+1 b<-v.stack[i,8]+a-1 if(v.stack[i,8]==1) {single.LABEL[a:b]<-rep(gene.names[abs(v.stack[i,7])],v.stack[i,8])} if(v.stack[i,8]==2) ## if the cluster dimension is 2 {x<-v.stack[i,7] ## x is a temp variable set to the cluster number to make the next line shorter single.LABEL[a:b]<-rep(paste(gene.names[abs(m[x,1])],gene.names[abs(m[x,2])]),v.stack[i,8])} ## generate the doubleton cluster label ## if(num.perms>=1) {if(v.stack[i,8]>2 && v.stack[i,1]==1) {clust.count<-clust.count+1 tstat.matrix[(3*clust.count-2):(3*clust.count),1]<-v.stack[i,5] tstat.matrix[(3*clust.count-2):(3*clust.count),2]<-v.stack[i,6] H0<-v.stack[i,9] C0<-View.Matrix[a:b,5:(5+num.exp)] Z<-v.stack[i,5] K<-v.stack[i,6] if(Z==1) {T1<-v.stack[i+1,7] T2<-v.stack[i+2,7] if(T1<0) {print("first splitting is actually a singleton!") D1<-H0} tstat.matrix[3*clust.count,num.perms+3]<-abs(T1) if(T2<0) D2<-H0 if(T1>0) D1<-H0-v.stack[i+1,9] if(T2>0) D2<-H0-v.stack[i+2,9]} if(Z!=1) {child.Z<-2*Z-(2-K) Mc<-!is.na(match(v.stack[,5],child.Z)) MM<-v.stack[Mc,] T1<-MM[1,7] T2<-MM[2,7] if(T1<0) D1<-H0 if(T2<0) D2<-H0 if(T1>0) D1<-H0-MM[1,9] if(T2>0) D2<-H0-MM[2,9]} if(T1<0) {orig.single.flag<-1 tstat.matrix[3*clust.count,num.perms+3]<-abs(T1)} if(T1>0) {orig.single.flag<-0 tstat.matrix[3*clust.count,num.perms+3]<-0} orig.t.stat<-D1+D2 t.seq<-vector(mode="numeric",length=num.perms+1) t.seq[num.perms+1]<-orig.t.stat single.flag.seq<-vector(mode="numeric",length=num.perms+1) single.flag.seq[num.perms+1]<-orig.single.flag ## the loop below permutes the cluster in question and computes the test statistic num.perms times for(p in 1:num.perms) {if(p%%100==0) print(paste("perm# ",p)) tmp.data<-permute(C0)$perm.data tmp.clst<-get.clust.list(tmp.data,gene.names,dist.metric,agglom.method) tmp.m<-tmp.clst$m tmp.d<-dim(tmp.m)[1] tmp.h<-tmp.clst$h rm(tmp.clst) H0<-tmp.h[tmp.d] T1<-tmp.m[tmp.d,1] T2<-tmp.m[tmp.d,2] if(T1<0) {D1<-H0 single.flag<-1 tstat.matrix[3*clust.count,(p+2)]<-abs(T1)} rm(tmp.data) if(T2<0) D2<-H0 if(T1>0) {D1<-H0-tmp.h[T1] single.flag<-0 tstat.matrix[3*clust.count,(p+2)]<-0} if(T2>0) D2<-H0-tmp.h[T2] t.stat<-D1+D2 t.seq[p]<-t.stat single.flag.seq[p]<-single.flag} ##end of permutation loop rm(C0) t.seq[1:num.perms]<-t.seq[order(t.seq[1:num.perms])] ## order the num.perms test statistics tstat.matrix[(3*clust.count-2),3:(num.perms+3)]<-round(t.seq,2) tstat.matrix[(3*clust.count-1),3:(num.perms+3)]<-single.flag.seq accept<-t.seq[num.perms+1]>= t.seq[ceiling(thresh*num.perms)] rm(t.seq,single.flag.seq) ## accept is a Boolean variable: ## accept is true when the original test stat is greater than thresh% ## of the test stats associated to the permutations if(v.stack[i,5]==1) z<-2 ## archaic numbering system special case: prefix of children if(v.stack[i,5]>1) z<-2*v.stack[i,5]-(2-v.stack[i,6]) ## archaic numbering system: prefix of children M<-match(v.stack[,5],z) ## find all clusters with this prefix N<-match(View.Matrix[,2],z) if(accept) {print("accepting the subclusters of") print(GRAPH.LABEL[a]) v.stack[!is.na(M),1]<-1 ## the children are accepted, so set their acc var to 1 View.Matrix[!is.na(N),1]<-1} if(!accept) {print("not accepting the subclusters of cluster") print(GRAPH.LABEL[a]) v.stack[!is.na(M),1]<-0 ## the children are not accepted, so set their acc var to 0 View.Matrix[!is.na(N),1]<-0 }}}} ## v.stack has been completely traversed View.Matrix<-View.Matrix[,-3] ## deleting archaic numbering scheme and putting in labels View.Matrix[,2]<-GRAPH.LABEL View.Matrix[1:(d+1),2]<-rep(paste(0,0,"ORIG", sep=","),d+1) Graph.Matrix<-data.frame(ACC=View.Matrix[!duplicated(View.Matrix[,2]),1], GRAPHLABEL=View.Matrix[!duplicated(View.Matrix[,2]),2], v.stack[,3:4], HEIGHT=v.stack[,9], Dim=View.Matrix[!duplicated(View.Matrix[,2]),3]) write.table(Graph.Matrix,file="Graph.Data",quote=F,sep="\t",col.names=TRUE,row.names=F) View.Matrix[View.Matrix[,1]==1,1]<-"Accept" View.Matrix[View.Matrix[,1]==0,1]<-"Reject" View.Matrix[View.Matrix[,1]==-1,1]<-"Reject" View.Matrix<-data.frame(ACC=View.Matrix[,1], CLUSTERLABEL=view.single.LABEL, CLUSTERDIMENSION=View.Matrix[,3], GENENUM=as.numeric(View.Matrix[,4])) write.table(View.Matrix, file="User.Data", quote=F,sep="\t",col.names=TRUE,row.names=F) mna<-match(NA,tstat.matrix[,1]) tstat.matrix<-tstat.matrix[1:(mna-1),] if(num.perms>1) {colnames(tstat.matrix)<-c("Cluster","Label",paste("tstat ",c(1:num.perms),sep=""),"orig tstat") perm.seq<-seq(1,(dim(tstat.matrix)[1]-2),by=3) tmp<-tstat.matrix[perm.seq,] write.table(rbind(colnames(tstat.matrix),tmp),file="tstats.txt",sep="\t",quote=F,row.names=F,col.names=F)} ##return(View.Matrix,tstat.matrix[1,]) rm(View.Matrix,Graph.Matrix,v.stack)} #################################################################################### ## The permute function permutes the data file generated by scan.data by permuting ## values within each column. By permuting in this fashion, we are trying to break ## any relationships across genes within experiments that are due to chance alone. ## INPUT: the data file output of scan.data ## OUTPUT: the input data file with values in each column permuted. ################################################################################### permute <- function(data.file) {num.genes <- dim(data.file)[1] num.cols <- dim(data.file)[2] perm.data <- matrix(nrow=num.genes, ncol=num.cols) perm.data[,1]<-c(1:num.genes) for(i in 2:num.cols) {r <- runif(num.genes) ## generated num.genes random numbers from a uniform distribution o <- order(r) perm.data[,i] <- data.file[o,i]} ## to test whether permute works correctly ## check.vector<-vector(mode="character",length=(num.cols-1)) for(c in 2:num.cols) {check.vector[c]<-round(mean(perm.data[,c],na.rm=T),5)==round(mean(data.file[,c],na.rm=T),5) if(check.vector[c]!="TRUE") {print("permute isn't working right") return(c)}} return(perm.data,check.vector)} ########################################################################## ## The other.dist.metrics function creates a "distance" matrix ## ## using the given gene expression matrix and code for distance metric ## ## "cos" ==> 1-cosine of the angle between the two vectors ## ## "pcos" ==> penalized cosine metric (Brian Munneke st al) ## ## "cor" ==> abs(cor-1) Pearson's correlation coefficient ## ########################################################################## other.dist.metrics<-function(expression.matrix,labels,dist.metric,agglom.method) ## the rows of expression.matrix are being clustered here ## if you want to cluster samples, make sure and use the transpose of the matrix {EM<-expression.matrix[,-1] ##delete gene.name column DM<-matrix(ncol=dim(EM)[1],nrow=dim(EM)[1]) if(length(EM[is.na(EM)])>0) ##if there are NA's (missing values)## {print("there are missing values") ##compute distances on pairs of existing values only## for(j in 1:dim(EM)[1]) {for(i in j:dim(EM)[1]) {x<-EM[j,] y<-EM[i,] na.posn<-vector(mode="numeric") ##record positions of NA's## na.posn<-NULL for(k in 1:length(x)) {if(is.na(x[k]) || is.na(y[k])) na.posn<-c(na.posn,k)} if(length(na.posn)==0) tmpd<-rbind(x,y) if(length(na.posn)>0) {tmp1<-rbind(x,y) nona.tmp<-tmp1[,-(na.posn)] ##do not include NA's## tmpd<-nona.tmp} if(dist.metric=="euc") distxy<-dist(tmpd,method="euclidean") if(dist.metric=="pcos") distxy<-munneke.dist(tmpd[1,],tmpd[2,]) if(dist.metric=="cos") distxy<-cos.dist(tmpd[1,],tmpd[2,]) if(dist.metric=="cor") distxy<-abs(1-cor(x,y,use="pairwise.complete.obs")) if(dist.metric=="both") distxy<-abs(1-cor(x,y,use="pairwise.complete.obs"))+dist(tmpd,method="euclidean") DM[i,j]<-distxy}} ##finish computing DM first ##Deleting NA's in the distance matrix - they cannot be used in clustering## del.DM.index<-NULL tmpDM<-DM ct<- -1 i<-0 while(i0) ##check## {print("there are NA's in DM") tmpcheck<-DM[-del.DM.index,-del.DM.index] for(i in 1:dim(tmpDM)[1]) {for(j in 1:i) {if(tmpDM[i,j]!=tmpcheck[i,j]) {print("something went wrong in delete row procedure!!!!") print(tmpDM[i,j]) print(tmpcheck[i,j]) print(del.DM.index) return(DM,tmpDM) pause()}}} A<-as.dist(tmpDM) labels<-labels[-del.DM.index] plot(hclust(A,method=agglom.method),labels=labels,lwd=2,cex.lab=.7,font.lab=2,cex=.8,font=2, main=paste("Clustering using ",dist.metric,sep=" "),xlab="",sub="",font.axis=2) return(A,DM,tmpDM,del.DM.index,labels)} } ##end of missing values case ################################################################################################# if(length(EM[is.na(EM)])==0) ##there are no NA's {print("there are no missing values") if(dist.metric=="euc") DM<-dist(EM,method="euclidean") if(dist.metric=="pcos") {for(j in 1:dim(EM)[1]) {for(i in j:dim(EM)[1]) {x<-EM[j,] y<-EM[i,] distxy<-munneke.dist(x,y) DM[i,j]<-distxy}}} if(dist.metric=="cos") {for(j in 1:dim(EM)[1]) {for(i in j:dim(EM)[1]) {x<-EM[j,] y<-EM[i,] distxy<-cos.dist(x,y) DM[i,j]<-distxy}}} if(dist.metric=="cor") {for(j in 1:dim(EM)[1]) {for(i in j:dim(EM)[1]) {x<-EM[j,] y<-EM[i,] distxy<-abs(1-cor(x,y)) DM[i,j]<-distxy}}} if(dist.metric=="euc") A<-DM if(dist.metric!="euc") A<-as.dist(DM)} ##end of no missing values case plot(hclust(A,method=agglom.method),labels=labels,lwd=2,cex.lab=.5,font.lab=2,hang=-1,font=2,cex=.8, main=paste("Clustering using ",dist.metric,sep=" "),xlab="",sub="") if(length(EM[is.na(EM)])>0) return(DM,A,del.DM.index) if(length(EM[is.na(EM)])==0) return(DM,A) } ##end of other.dist.metrics function ######################################################################## ##This function computes the Munneke distance between vectors x and y ## ######################################################################## munneke.dist<-function(x,y) {cosxy<-sum(x*y)/sqrt(sum(x^2)*sum(y^2)) diffxy<-sum(abs(x-y)) p<-diffxy/length(x) p<-2*(p/(1+p)) pcosxy<-1-cosxy+p return(pcosxy)} ######################################################################## ##This function computes the cosine distance between vectors x and y ## ######################################################################## cos.dist<-function(x,y) {cosxy<-sum(x*y)/sqrt(sum(x^2)*sum(y^2)) return(1-cosxy)}