# R functions for associating chromatin structure with gene expression # # first published as part of: # Identifying and Mapping Cell-type Specific # Chromatin Programming of Gene Expression # # Troels T. Marstrand & John D. Storey # #Copyright (C) 2010 by Troels T. Marstrand and John D. Storey # #This library is free software; you can redistribute it and/or #modify it under the terms of the GNU Lesser General Public #License as published by the Free Software Foundation; either #version 2.1 of the License, or (at your option) any later version. # #This library is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #Lesser General Public License for more details. # #You should have received a copy of the GNU Lesser General Public #License along with this library; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ############## # Data loading ############## # # Here we assume that the data is located on the Desktop, if not # change according to your local data directory setwd('~/Desktop/ARS_data/') #summary files for normalization of the DHS volumes sumfiles = list.files(path=getwd(), pattern='.*summary$') sumdata = list() before.arm = matrix(0, ncol=24, nrow=length(sumfiles)) after.arm = matrix(0, ncol=24, nrow=length(sumfiles)) for(i in 1:length(sumfiles)){ sumdata[[i]] = read.table(sumfiles[i]) before.arm[i,] = sumdata[[i]][,2] after.arm[i,] = sumdata[[i]][,3] - sumdata[[i]][,2] } #using simple scaling to same number of intensities per arm for(j in 1:dim(before.arm)[2]){ before.arm[,j] = max(before.arm[,j])/before.arm[,j] after.arm[,j] = max(after.arm[,j])/after.arm[,j] } chr.names = as.character(sumdata[[1]][,1]) #gene specific dhs volumes dhsfiles = list.files(path=getwd(), pattern='.*DHS_windows$') dhsdata = list() for(i in 1:length(dhsfiles)){ dhsdata[[i]] = read.table(dhsfiles[i], header=TRUE) } #expression values expr.mean = list() expr.mean.files = list.files(path=getwd(), pattern='.*mean$') for(i in 1:length(expr.mean.files)){ #skip the header as the first line expr.mean[[i]] = read.table(expr.mean.files[i], skip=1) } # # extra, retrieve gene names from the expression # geneNames = expr.mean[[1]][,1] ############## # Data preprocessing ############## expr.matrix = matrix( 0, nrow=dim(expr.mean[[1]])[1], ncol=length(expr.mean) ) #as we have several different regions to test out we store it as an array #rows, cols, regions dhs.array = array( 0, dim=c(dim(expr.mean[[1]])[1], length(expr.mean), 5) ) #preprocessing expression values for(i in 1:dim(expr.mean[[1]])[1]){ tmp = numeric(length(expr.mean)) for(j in 1:length(expr.mean)){ tmp[j] = expr.mean[[j]][i,2] } expr.matrix[i,] = tmp } #preprocessing dhs data for(i in 1:dim(expr.mean[[1]])[1]){ #get location to get scaling factor this.chrom = dhsdata[[1]][i,]$chrom this.col = match(this.chrom, chr.names) loc = dhsdata[[1]][i,]$arm scale.factor = NULL if(loc=='A'){ scale.factor = after.arm[,this.col] } else { scale.factor = before.arm[,this.col] } #get the dhs data tmp = matrix(0,nrow=length(expr.mean),ncol=5) for(j in 1:length(expr.mean)){ tmp[j,] = as.numeric(dhsdata[[j]][i,4:8]) } #now put into array dhs.array[i,,] = tmp * scale.factor } ############## # Functions ############## # # Precomputed decay rates # # region decay KS-pvalue test-statistic # 2.5kb [1,] -0.049 0.4209586 0.009910597 # 50kb [2,] -0.045 0.6444371 0.008629821 # 100kb [3,] -0.047 0.6460140 0.008624784 # 150kb [4,] -0.045 0.7751450 0.007748073 # 200kb [5,] -0.045 0.7928256 0.007627697 # 100kb - 2.5 [6,] -0.049 0.9183644 0.006388505 # 200kb - 2.5 [7,] -0.047 0.7522481 0.007837903 #setting pos to 1,...,5 respectively decays = c(-.049, -.045, -.047, -.045, -.045) # # Utility functions # getp <- function(lr,lr0) { #this func calculates empirical p-values #based on the construction that the larger #the statistic, the more significant it is m <- length(lr) B <- length(lr0)/m v <- c(!rep(F,m),rep(F,m*B)) #v <- v[rev(order(c(lr,lr0)))] v <- v[order(-c(lr,lr0))] u <- 1:length(v) w <- 1:m p <- (u[v==TRUE]-w)/(B*m) p <- p[rank(-lr,ties.method="random")] return(p) } find.decay = function(decays, pos, expr, dhs, genes){ #Test a list of possible decay rates to find which ensures uniform null p-values ans = matrix(0, ncol=2, nrow=length(decays)) for(d in 1:length(decays)){ print(paste('Testing decay ', decays[d], sep='')) test = ARS.statistics(expr, dhs, pos, genes, decay=decays[d], noProx=FALSE) rand = Permute.ARS(test, nperm=100) pval = getp(test$st[,1], rand[,1]) high.p = pval[pval>=0.5] KS = ks.test((2*high.p)-1, 'punif') ans[d,] = c(decays[d], KS$p.value) } return(ans) } CreateOutput = function(pos, decay, expr, dhs, genes, noProx=FALSE){ require(qvalue) tissues = sub('.expr_mean', '', expr.mean.files) genes = expr.mean[[1]][,1] real = ARS.statistics(expr, dhs, pos, genes, decay, noProx=noProx) rand = Permute.ARS(real, nperm=100) pval = getp(real$st[,1], rand[,1]) #no 0 p-values pval[pval==0] = 1 / (dim(rand)[1] + 1) qv = qvalue(pval) tis = c() for(i in 1:length(real$st[,4])){ tis = c(tis, tissues[real$st[i,4]]) } ans = data.frame(cbind(real$genes, real$st[,1], real$st[,2], tis, qv$pvalues, qv$qvalues)) colnames(ans) = c('Gene', 'Stat', 'Angle', 'Cell', 'Pvalue', 'Qvalue') return(list('toPrint'=ans, 'pvalues'=pval)) } # # Main functions # ARS.statistics = function(expr, dhs, pos, genes, decay=-.05, noProx=FALSE){ # Function for calculating the ARS statistic # will store the data that is needed for permutation #extract and remove NAs expr.work = expr dhs.work = dhs[,,pos] if(noProx){ dhs.work = dhs.work - dhs[,,1] } tot.matrix = cbind(expr.work, dhs.work) g.mat = cbind(expr.work, dhs.work, as.character(genes)) g = na.omit(g.mat) genes = g[,((dim(expr.work)[2]*2)+1)] m = na.omit(tot.matrix) print(dim(m)) expr.work = m[,1:dim(expr.work)[2]] dhs.work = m[,-c(1:dim(expr.work)[2])] #also omit rows that sum to 0 e.ind = apply(expr.work, 1, sum) e.ind = which(e.ind <= 0) d.ind = apply(dhs.work, 1, sum) d.ind = which(d.ind <= 0) #print(length(union(e.ind, d.ind))) if(length(union(e.ind, d.ind)) > 0){ expr.work = expr.work[-union(e.ind, d.ind),] dhs.work = dhs.work[-union(e.ind, d.ind),] genes = genes[-union(e.ind, d.ind)] } #allocate memory to the needed data structures perm.dhs.scaled = matrix(0, nrow=dim(dhs.work)[1], ncol=dim(dhs.work)[2]) perm.expr.scaled = matrix(0, nrow=dim(dhs.work)[1], ncol=dim(dhs.work)[2]) perm.angle = matrix(0, nrow=dim(dhs.work)[1], ncol=dim(dhs.work)[2]) relativeRatio = matrix(0, nrow=dim(dhs.work)[1], ncol=dim(dhs.work)[2]) #combined statistic, angle, ratio, celltype statistics = matrix(0, nrow=dim(dhs.work)[1], ncol=4) pb = txtProgressBar(min = 0, max = dim(dhs.work)[1], style = 3) for(i in 1:dim(dhs.work)[1]){ #do the calculation setTxtProgressBar(pb, i) #expression scaling and centering #HACK #two expression values may be identical set.seed(42) if(length(which(expr.work[i,]==max(expr.work[i,]))) > 1){ expr.work[i,] = expr.work[i,] + jitter(rep(0,dim(dhs.work)[2])) } tmp.expr = expr.work[i,] / max(expr.work[i,]) perm.expr.scaled[i,] = tmp.expr tmp.dhs = dhs.work[i,] / max(dhs.work[i,]) perm.dhs.scaled[i,] = tmp.dhs norm.dhs = tmp.dhs - median(tmp.dhs) norm.expr = tmp.expr - median(tmp.expr) #calculate distance and apply angle penalty distance = sqrt(norm.dhs^2 + norm.expr^2) norm.distance = (distance / median(distance)) angle = atan2(norm.dhs, norm.expr)*(180/pi) perm.angle[i,] = angle #in order to get summetric values at 0, 90, 180 and -180 r1 = norm.distance * exp(decay*abs(45 - angle)) r2 = norm.distance * exp(decay*abs(-135 - angle)) r3 = norm.distance * exp(decay*abs(225 - angle)) stats = apply(cbind(r1, r2, r3), 1, max) cell = which.max(stats) stat = max(stats) relativeRatio[i,] = stats / stat statistics[i,] = c(stat, angle[cell], norm.distance[cell], cell) } close(pb) #make the return object ans = list('statistics'=statistics, 'dhsScaled'=perm.dhs.scaled, 'exprScaled'=perm.expr.scaled, 'angle'=perm.angle, 'decay'=decay, 'genes'=genes, 'ratios'=relativeRatio) return(ans) } Permute.ARS = function(obj, nperm=3, randomAngle=FALSE, returnRand=FALSE){ # Permutation strategy for null statistics # see suppl. material for details #extract values mab = cbind( as.vector(t(obj$dhsScaled)), as.vector(t(obj$exprScaled)), as.vector(t(obj$angle)) ) mymasterR = matrix(0, ncol=2, nrow=dim(obj$dhsScaled)[1]*nperm) #get number of cell-types celltypes = dim(obj$dhsScaled)[2] #generate cell vector for dependent sampling cell = rep(seq(1,celltypes), dim(mab)[1]/celltypes) #4th column is going to be the cell type cmab = cbind(mab, as.numeric(cell)) #construct C1 and C2 configurations dfNstd = cmab #construct the (1,1) points dfN11 = dfNstd[dfNstd[,1]==1 & dfNstd[,2]==1,] ind = which(dfNstd[,1]==1 & dfNstd[,2]==1) #construct the (1,x) points dfN1x = dfNstd[dfNstd[,1]==1 & dfNstd[,2]!=1,] #construct the (x,1) points dfNx1 = dfNstd[dfNstd[,1]!=1 & dfNstd[,2]==1,] #construct the points where there is no 1 values dfNres = dfNstd[dfNstd[,1]!=1 & dfNstd[,2]!=1,] #get the ordering of (1,x) and (x,1) in terms of celltypes to pick the right ones ord.1x = dfN1x[,4] ord.x1 = dfNx1[,4] ord = cbind(as.numeric(ord.1x), as.numeric(ord.x1)) #order for the permutation of (1,x) and (x,1) points ord = ord[order(ord[,1], ord[,2], decreasing=FALSE),] master.index = 0 print('Starting permutation...') #start permutation pb = txtProgressBar(min = 0, max = nrow(mymasterR), style = 3) for(j in 1:nperm){ set.seed(42+j) #permute rand.dfN11 = dfN11[sample(nrow(dfN11)),] rand.dfN1x = dfN1x[sample(nrow(dfN1x)),] rand.dfNx1 = dfNx1[sample(nrow(dfNx1)),] rand.dfNres = dfNres[sample(nrow(dfNres)),] rand.Angle = mab[sample(nrow(mab)),3] #test dimensions are OK if(dim(rand.dfN1x)[1] != dim(rand.dfNx1)[1]){ print(dim(rand.dfN11)) print(dim(rand.dfN1x)) print(dim(rand.dfNx1)) print(dim(rand.dfNres)) stop() } #get all the data into list structures dfN11.list = list() dfN1x.list = list() dfNx1.list = list() dfNres.list = list() for(l in 1:celltypes){ dfN11.list[[l]] = rand.dfN11[which(rand.dfN11[,4]==l),] dfN1x.list[[l]] = rand.dfN1x[which(rand.dfN1x[,4]==l),] dfNx1.list[[l]] = rand.dfNx1[which(rand.dfNx1[,4]==l),] dfNres.list[[l]] = rand.dfNres[which(rand.dfNres[,4]==l),] } selector = c(1,2,3) RandRecomb = matrix(0, ncol=3, nrow=dim(mab)[1]) tot.index = 0 #keep track of the (x,x) points across multiple celltypes res.index = numeric(celltypes) #do(1,1) points static.celltypes = seq(1:celltypes) for(k in 1:celltypes){ dyn.cell = static.celltypes[-k] for(n in 1:dim(dfN11.list[[k]])[1]){ #get first point tot.index = tot.index + 1 RandRecomb[tot.index,] = dfN11.list[[k]][n,selector] #fill with (x,x) from the rest of the tissues for(res in dyn.cell){ res.index[res] = res.index[res] + 1 tot.index = tot.index + 1 RandRecomb[tot.index,] = dfNres.list[[res]][res.index[res], selector] } } } #do the (1,x) and (x,1) points ord.index = 0 x1.index = numeric(celltypes) for(k in 1:celltypes){ for(n in 1:dim(dfN1x.list[[k]])[1]){ #get (1,x) point tot.index = tot.index + 1 ord.index = ord.index + 1 RandRecomb[tot.index,] = dfN1x.list[[k]][n,selector] tissue.x1 = ord[ord.index,2] x1.index[tissue.x1] = x1.index[tissue.x1] + 1 tot.index = tot.index + 1 RandRecomb[tot.index,] = dfNx1.list[[tissue.x1]][x1.index[tissue.x1], selector] #remove tissues dyn.cell = static.celltypes[-ord[ord.index,]] for(res in dyn.cell){ res.index[res] = res.index[res] + 1 tot.index = tot.index + 1 RandRecomb[tot.index,] = dfNres.list[[res]][res.index[res], selector] } } } if(returnRand){ return(RandRecomb) } #start the actual calculation #test_r = matrix(0,nrow=nrow(obj$dhsScaled), ncol=celltypes) #test_a = matrix(0,nrow=nrow(obj$dhsScaled), ncol=celltypes) for(y in seq(1, dim(RandRecomb)[1], celltypes)){ master.index = master.index + 1 setTxtProgressBar(pb, master.index) tRa = RandRecomb[y:(y+celltypes-1), 1] tRb = RandRecomb[y:(y+celltypes-1), 2] a = tRa - median(tRa) b = tRb - median(tRb) dist = sqrt(a^2 + b^2) r = dist / median(dist) angle = RandRecomb[y:(y+celltypes-1), 3] #test_r[master.index,] = r #test_a[master.index,] = angle r1 = r * exp(obj$decay*abs(45 - angle)) r2 = r * exp(obj$decay*abs(-135 - angle)) r3 = r * exp(obj$decay*abs(225 - angle)) stats = apply(cbind(r1, r2, r3), 1, max) ang = angle[which.max(stats)] stat = max(stats) mymasterR[master.index,] = c(stat, ang) } #return(list('rand_ratios'=test_r, 'rand_angles'=test_a)) } close(pb) return(mymasterR) } # # Here we create the output # CreateOutput = function(pos, decay, expr, dhs, genes) # decays = c(-.049, -.045, -.047, -.045, -.045) t = FALSE if(t){ print('region1') region1 = CreateOutput(1, decay=-0.049, expr.matrix, dhs.array, geneNames) #test that we get the same KS-test pvalues as before, when determining the decay rate tmp = region1[[2]] print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) write.table(region1[[1]],'~/Desktop/ars_2.5KB.txt', quote=FALSE, row.names=FALSE, sep='\t') #rinse and repeat print('region2') region2 = CreateOutput(2, decay=-0.045, expr.matrix, dhs.array, geneNames) #test that we get the same KS-test pvalues as before, when determining the decay rate tmp = region2[[2]] print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) write.table(region2[[1]],'~/Desktop/ars_50KB.txt', quote=FALSE, row.names=FALSE, sep='\t') print('region3') region3 = CreateOutput(3, decay=-0.047, expr.matrix, dhs.array, geneNames) #test that we get the same KS-test pvalues as before, when determining the decay rate tmp = region3[[2]] print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) write.table(region3[[1]],'~/Desktop/ars_100KB.txt', quote=FALSE, row.names=FALSE, sep='\t') print('region4') region4 = CreateOutput(4, decay=-0.045, expr.matrix, dhs.array, geneNames) #test that we get the same KS-test pvalues as before, when determining the decay rate tmp = region4[[2]] print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) write.table(region4[[1]],'~/Desktop/ars_150KB.txt', quote=FALSE, row.names=FALSE, sep='\t') print('region5') region5 = CreateOutput(5, decay=-0.045, expr.matrix, dhs.array, geneNames) #test that we get the same KS-test pvalues as before, when determining the decay rate tmp = region5[[2]] print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) write.table(region5[[1]],'~/Desktop/ars_200KB.txt', quote=FALSE, row.names=FALSE, sep='\t') } # 50kb - 2.5 [6,] -0.049 0.9183644 0.006388505 # 200kb - 2.5 [7,] -0.047 0.7522481 0.007837903 print('region6') region = CreateOutput(3, decay=-0.049, expr.matrix, dhs.array, geneNames, noProx=TRUE) #test that we get the same KS-test pvalues as before, when determining the decay rate tmp = region[[2]] print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) write.table(region[[1]],'~/Desktop/ars_100KB_no_2.5KB.txt', quote=FALSE, row.names=FALSE, sep='\t') #print('region7') #region = CreateOutput(5, decay=-0.047, expr.matrix, dhs.array, geneNames, noProx=TRUE) #test that we get the same KS-test pvalues as before, when determining the decay rate #tmp = region[[2]] #print(ks.test((2*tmp[tmp>=.5])-1, 'punif')) #write.table(region[[1]],'~/Desktop/ars_200KB_no_2.5KB.txt', quote=FALSE, row.names=FALSE, sep='\t')