## Yuanlong LIU ## Scripts used to call domains having high inter-domain score while low intra-domain score. Adapted from TopDom with the following authorship # @author : Hanjun Shin(shanjun@usc.edu) # @credit : Harris Lazaris(Ph.D Stduent, NYU), Dr. Gangqing Hu(Staff Scientist, NIH) # @brief : TopDom.R is a software package to identify topological domains for given Hi-C contact matrix. # @version 0.0.2 # @fn TopDom # @param matrixFile : string, matrixFile Address, # - Format = {chromosome, bin start, bin end, N numbers normalized value } # - N * (N + 3), where N is the number of bins # @param window.size :integer, number of bins to extend. # @param out_binSignal : string, binSignal file address to write # @param out_ext : string, ext file address to write remove_small_domains = function(domain_size_min, binSignal, domains) ## remove all domains smaller than 40E3, ADDED by Yuanlong, 2019-July-09 { # binSignal = res_input_mat$binSignal # domains = domains_bp # domains = domains[, c('from.id', 'to.id', 'size')] while(1) { i = min(setdiff(which(domains$size < domain_size_min/10E3), 1)) ## do not include the first one if(i==Inf) break ids = domains[(i-1):i, 'to.id'] index2rm = ((i-1):i)[which.max(binSignal[ids, 'mean.cf'])] domains = domains[-index2rm, ] domains[-1, 'from.id'] = domains[-nrow(domains), 'to.id'] + 1 domains$size = domains[, 'to.id'] - domains[, 'from.id'] + 1 cat(nrow(domains), '\n') if(nrow(domains) < 10) break ## avoid exceptions } # domains[, 'chr'] = domains_bp[1, 'chr'] domains[, 'from.coord'] = domains[, 'from.id'] - 1 domains[, 'to.coord'] = domains[, 'to.id'] rownames(domains) = 1:nrow(domains) # print(head(domains)) return(domains) } ## modified input file format and added several potentially useful parameter options, Yuanlong LIU TopDom_v2 <- function( A_extended, window.size, outFile=NULL, statFilter=T, p_thresh=0.05, return_peak_and_binsignal=FALSE, window.sizes=NULL, stat_window_size=NULL, domain_size_min=NULL) { if(!is.null(window.sizes)) window.size = window.sizes[1] # print("#########################################################################") # print("Step 0 : File Read ") # print("#########################################################################") window.size = as.numeric(window.size) matdf = A_extended if( ncol(matdf) - nrow(matdf) == 3) { colnames(matdf) <- c("chr", "from.coord", "to.coord") } else if( ncol(matdf) - nrow(matdf) ==4 ) { colnames(matdf) <- c("id", "chr", "from.coord", "to.coord") } else { print("Unknwon Type of matrix file") return(0) } n_bins = 1*nrow(matdf) mean.cf <- rep(0, n_bins) pvalue <- rep(1, n_bins) local.ext = rep(-0.5, n_bins) bins <- data.frame(id=1:n_bins, chr=matdf[, "chr"], from.coord=matdf[, "from.coord"], to.coord=matdf[, "to.coord"] ) matrix.data <- as.matrix( matdf[, (ncol(matdf) - nrow(matdf)+1 ):ncol(matdf)] ) ptm <- proc.time() ## Only compute for one track of signal if(is.null(window.sizes)){ for(i in 1:n_bins) { diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size) mean.cf[i] = mean(diamond) } } if(!is.null(window.sizes)) ## compute multiple tracks of signal { mean.cfs = matrix(, n_bins, length(window.sizes)) for(k in 1:length(window.sizes) ) { window.size.iter = window.sizes[k] for(i in 1:n_bins) { diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size.iter) mean.cfs[i, k] = mean(diamond) } } mean.cf = apply(mean.cfs, 1, mean) } eltm = proc.time() - ptm # print(paste("Step 1 Running Time : ", eltm[3])) # print("Step 1 : Done !!") # print("#########################################################################") # print("Step 2 : Detect TD boundaries based on binSignals") # print("#########################################################################") ptm = proc.time() #gap.idx = Which.Gap.Region(matrix.data=matrix.data) #gap.idx = Which.Gap.Region2(mean.cf) gap.idx = Which.Gap.Region2(matrix.data=matrix.data, window.size) proc.regions = Which.process.region(rmv.idx=gap.idx, n_bins=n_bins, min.size=3) #print(proc.regions) for( i in 1:nrow(proc.regions)) { start = proc.regions[i, "start"] end = proc.regions[i, "end"] # print(paste("Process Regions from ", start, "to", end)) local.ext[start:end] = Detect.Local.Extreme(x=mean.cf[start:end]) } local.ext_bp = local.ext eltm = proc.time() - ptm # print(paste("Step 2 Running Time : ", eltm[3])) # print("Step 2 : Done !!") if(statFilter) { # print("#########################################################################") # print("Step 3 : Statistical Filtering of false positive TD boundaries") # print("#########################################################################") if(is.null(stat_window_size)) stat_window_size=window.size ptm = proc.time() # print("-- Matrix Scaling....") scale.matrix.data = matrix.data ## This is to normalize each off diag values -- 2018-09-20 # for( i in 1:(2*window.size) ) for( i in 1:(2*stat_window_size) ) { #diag(scale.matrix.data[, i:n_bins] ) = scale( diag( matrix.data[, i:n_bins] ) ) scale.matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] = scale( matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] ) } # print("-- Compute p-values by Wilcox Ranksum Test") for( i in 1:nrow(proc.regions)) { start = proc.regions[i, "start"] end = proc.regions[i, "end"] # print(paste("Process Regions from ", start, "to", end)) # pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=window.size, scale=1) pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=stat_window_size, scale=1) } # print("-- Done!") # print("-- Filtering False Positives") local.ext[intersect( union(which( local.ext==-1), which(local.ext==-1)), which(pvalue= n_proc.set ) { proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) ) break } } colnames(proc.regions) = c("start", "end") proc.regions <- proc.regions[ which( abs(proc.regions[,"end"] - proc.regions[, "start"]) >= min.size ), ] return(proc.regions) } # @fn Which.Gap.Region # @breif version 0.0.1 used # @param matrix.data : n by n matrix # @return gap index Which.Gap.Region <- function(matrix.data) { n_bins = nrow(matrix.data) gap = rep(0, n_bins) i=1 while(i < n_bins) { j = i + 1 while( j <= n_bins) { if( sum( matrix.data[i:j, i:j]) == 0 ) { gap[i:j] = -0.5 j = j+1 #if(j-i > 1) gap[i:j]=-0.5 #j=j+1 } else break } i = j } idx = which(gap == -0.5) return(idx) } # @fn Which.Gap.Region3 # @param matrix.data : n by n matrix # @return gap index Which.Gap.Region3 <- function(mean.cf) { n_bins = length(mean.cf) gapidx = which(mean.cf==0) return(gapidx) } # @fn Which.Gap.Region2 # @breif version 0.0.2 used # @param matrix.data : n by n matrix # @return gap index Which.Gap.Region2 <- function(matrix.data, w) { n_bins = nrow(matrix.data) gap = rep(0, n_bins) for(i in 1:n_bins) { if( sum( matrix.data[i, max(1, i-w):min(i+w, n_bins)] ) == 0 ) gap[i]=-0.5 } idx = which(gap == -0.5) return(idx) } # @fn Detect.Local.Extreme # @param x : original signal to find local minima # @return vector of local extrme, -1 if the index is local minimum, 1 if the index is local maxima, 0 otherwise. Detect.Local.Extreme <- function(x) { n_bins = length(x) ret = rep(0, n_bins) x[is.na(x)]=0 if(n_bins <= 3) { ret[which.min(x)]=-1 ret[which.max(x)]=1 return(ret) } # Norm##################################################3 new.point = Data.Norm(x=1:n_bins, y=x) x=new.point$y ################################################## cp = Change.Point(x=1:n_bins, y=x) if( length(cp$cp) <= 2 ) return(ret) if( length(cp$cp) == n_bins) return(ret) for(i in 2:(length(cp$cp)-1)) { if( x[cp$cp[i]] >= x[cp$cp[i]-1] && x[cp$cp[i]] >= x[cp$cp[i]+1] ) ret[cp$cp[i]] = 1 else if(x[cp$cp[i]] < x[cp$cp[i]-1] && x[cp$cp[i]] < x[cp$cp[i]+1]) ret[cp$cp[i]] = -1 min.val = min( x[ cp$cp[i-1] ], x[ cp$cp[i] ] ) max.val = max( x[ cp$cp[i-1] ], x[ cp$cp[i] ] ) if( min( x[cp$cp[i-1]:cp$cp[i]] ) < min.val ) ret[ cp$cp[i-1] - 1 + which.min( x[cp$cp[i-1]:cp$cp[i]] ) ] = -1 if( max( x[cp$cp[i-1]:cp$cp[i]] ) > max.val ) ret[ cp$cp[i-1] - 1 + which.max( x[cp$cp[i-1]:cp$cp[i]] ) ] = 1 } return(ret) } # @fn Data.Norm # @param x : x axis vector # @param x : y axis vector # @return list of normalized x and y Data.Norm <- function(x, y) { ret.x = rep(0, length(x)) ret.y = rep(0, length(y)) ret.x[1] = x[1] ret.y[1] = y[1] diff.x = diff(x) diff.y = diff(y) scale.x = 1 / mean( abs(diff(x) ) ) scale.y = 1 / mean( abs( diff(y) ) ) #print(scale.x) #print(scale.y) for(i in 2:length(x)) { ret.x[i] = ret.x[i-1] + (diff.x[i-1]*scale.x) ret.y[i] = ret.y[i-1] + (diff.y[i-1]*scale.y) } return(list(x=ret.x, y=ret.y)) } # @fn Change.Point # @param x : x axis vector # @param x : y axis vector # @return change point index in x vector, # Note that the first and the last point will be always change point Change.Point <- function( x, y ) { if( length(x) != length(y)) { print("ERROR : The length of x and y should be the same") return(0) } n_bins <- length(x) Fv <- rep(NA, n_bins) Ev <- rep(NA, n_bins) cp <- 1 i=1 Fv[1]=0 while( i < n_bins ) { j=i+1 Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) while(j0) { from.coord = bins[proc.region[, "start"]+1, "from.coord"] boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F) ret = rbind(ret, boundary) } ret = rbind(gap, domain) ret = ret[order(ret[,3]), ] ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"]) ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] ) ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"]) ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] ## HERE THE SIZE IS COMPUTED / Yuanlong LIU if(!is.null(pvalues) && !is.null(pvalue.cut)) { for(i in 1:nrow(ret)) { if(ret[i, "tag"]=="domain") { domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"] p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut ) if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary" } } } new.bdr.set = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) i=1 while(i <= nrow(ret)) { if( ret[i, "tag"] == "boundary" ) { stack.bdr = rbind(stack.bdr, ret[i, ]) } else if(nrow(stack.bdr)>0) { new.bdr = data.frame(chr=bins[1, "chr"], from.id = min( stack.bdr[, "from.id"]), from.coord=min(stack.bdr[, "from.coord"]), to.id = max( stack.bdr[, "to.id"]), to.coord=max(stack.bdr[, "to.coord"]), tag="boundary", size=max(stack.bdr[, "to.coord"]) - min(stack.bdr[, "from.coord"])) new.bdr.set = rbind(new.bdr.set, new.bdr) stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) } i = i + 1 } ret = rbind( ret[ ret[, "tag"]!="boundary", ], new.bdr.set ) ret = ret[order(ret[, "to.coord"]), ] return(ret) } # @fn Convert.Bin.To.Domain # @param bins : bin information # @param signal.idx : signal index # @param signal.idx : gap index # @param pvalues : pvalue vector # @param pvalue.cut : pvalue threshold # @return dataframe storing domain information Convert.Bin.To.Domain.TMP <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL) { n_bins = nrow(bins) ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary") rmv.idx = setdiff(1:n_bins, gap.idx) proc.region = Which.process.region(rmv.idx, n_bins, min.size=0) from.coord = bins[proc.region[, "start"], "from.coord"] n_procs = nrow(proc.region) gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F) rmv.idx = union(signal.idx, gap.idx) proc.region = Which.process.region(rmv.idx, n_bins, min.size=0) n_procs = nrow(proc.region) from.coord = bins[proc.region[, "start"], "from.coord"] domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F) rmv.idx = setdiff(1:n_bins, signal.idx) proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) ) n_procs = nrow(proc.region) if(n_procs>0) { from.coord = bins[proc.region[, "start"]+1, "from.coord"] boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F) ret = rbind(ret, boundary) } ret = rbind(gap, domain) ret = ret[order(ret[,3]), ] ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"]) ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] ) ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"]) ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] if(!is.null(pvalues) && !is.null(pvalue.cut)) { for(i in 1:nrow(ret)) { if(ret[i, "tag"]=="domain") { domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"] p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut ) if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary" } } } return(ret) }