call_domains.R 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. ## Yuanlong LIU
  2. ## Scripts used to call domains having high inter-domain score while low intra-domain score. Adapted from TopDom with the following authorship
  3. # @author : Hanjun Shin(shanjun@usc.edu)
  4. # @credit : Harris Lazaris(Ph.D Stduent, NYU), Dr. Gangqing Hu(Staff Scientist, NIH)
  5. # @brief : TopDom.R is a software package to identify topological domains for given Hi-C contact matrix.
  6. # @version 0.0.2
  7. # @fn TopDom
  8. # @param matrixFile : string, matrixFile Address,
  9. # - Format = {chromosome, bin start, bin end, N numbers normalized value }
  10. # - N * (N + 3), where N is the number of bins
  11. # @param window.size :integer, number of bins to extend.
  12. # @param out_binSignal : string, binSignal file address to write
  13. # @param out_ext : string, ext file address to write
  14. remove_small_domains = function(domain_size_min, binSignal, domains) ## remove all domains smaller than 40E3, ADDED by Yuanlong, 2019-July-09
  15. {
  16. # binSignal = res_input_mat$binSignal
  17. # domains = domains_bp
  18. # domains = domains[, c('from.id', 'to.id', 'size')]
  19. while(1)
  20. {
  21. i = min(setdiff(which(domains$size < domain_size_min/10E3), 1)) ## do not include the first one
  22. if(i==Inf) break
  23. ids = domains[(i-1):i, 'to.id']
  24. index2rm = ((i-1):i)[which.max(binSignal[ids, 'mean.cf'])]
  25. domains = domains[-index2rm, ]
  26. domains[-1, 'from.id'] = domains[-nrow(domains), 'to.id'] + 1
  27. domains$size = domains[, 'to.id'] - domains[, 'from.id'] + 1
  28. cat(nrow(domains), '\n')
  29. if(nrow(domains) < 10) break ## avoid exceptions
  30. }
  31. # domains[, 'chr'] = domains_bp[1, 'chr']
  32. domains[, 'from.coord'] = domains[, 'from.id'] - 1
  33. domains[, 'to.coord'] = domains[, 'to.id']
  34. rownames(domains) = 1:nrow(domains)
  35. # print(head(domains))
  36. return(domains)
  37. }
  38. ## modified input file format and added several potentially useful parameter options, Yuanlong LIU
  39. TopDom_v2 <- function( A_extended, window.size, outFile=NULL, statFilter=T, p_thresh=0.05, return_peak_and_binsignal=FALSE, window.sizes=NULL, stat_window_size=NULL, domain_size_min=NULL)
  40. {
  41. if(!is.null(window.sizes)) window.size = window.sizes[1]
  42. # print("#########################################################################")
  43. # print("Step 0 : File Read ")
  44. # print("#########################################################################")
  45. window.size = as.numeric(window.size)
  46. matdf = A_extended
  47. if( ncol(matdf) - nrow(matdf) == 3) {
  48. colnames(matdf) <- c("chr", "from.coord", "to.coord")
  49. } else if( ncol(matdf) - nrow(matdf) ==4 ) {
  50. colnames(matdf) <- c("id", "chr", "from.coord", "to.coord")
  51. } else {
  52. print("Unknwon Type of matrix file")
  53. return(0)
  54. }
  55. n_bins = 1*nrow(matdf)
  56. mean.cf <- rep(0, n_bins)
  57. pvalue <- rep(1, n_bins)
  58. local.ext = rep(-0.5, n_bins)
  59. bins <- data.frame(id=1:n_bins,
  60. chr=matdf[, "chr"],
  61. from.coord=matdf[, "from.coord"],
  62. to.coord=matdf[, "to.coord"] )
  63. matrix.data <- as.matrix( matdf[, (ncol(matdf) - nrow(matdf)+1 ):ncol(matdf)] )
  64. ptm <- proc.time()
  65. ## Only compute for one track of signal
  66. if(is.null(window.sizes)){
  67. for(i in 1:n_bins)
  68. {
  69. diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size)
  70. mean.cf[i] = mean(diamond)
  71. }
  72. }
  73. if(!is.null(window.sizes)) ## compute multiple tracks of signal
  74. {
  75. mean.cfs = matrix(, n_bins, length(window.sizes))
  76. for(k in 1:length(window.sizes) )
  77. {
  78. window.size.iter = window.sizes[k]
  79. for(i in 1:n_bins)
  80. {
  81. diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size.iter)
  82. mean.cfs[i, k] = mean(diamond)
  83. }
  84. }
  85. mean.cf = apply(mean.cfs, 1, mean)
  86. }
  87. eltm = proc.time() - ptm
  88. # print(paste("Step 1 Running Time : ", eltm[3]))
  89. # print("Step 1 : Done !!")
  90. # print("#########################################################################")
  91. # print("Step 2 : Detect TD boundaries based on binSignals")
  92. # print("#########################################################################")
  93. ptm = proc.time()
  94. #gap.idx = Which.Gap.Region(matrix.data=matrix.data)
  95. #gap.idx = Which.Gap.Region2(mean.cf)
  96. gap.idx = Which.Gap.Region2(matrix.data=matrix.data, window.size)
  97. proc.regions = Which.process.region(rmv.idx=gap.idx, n_bins=n_bins, min.size=3)
  98. #print(proc.regions)
  99. for( i in 1:nrow(proc.regions))
  100. {
  101. start = proc.regions[i, "start"]
  102. end = proc.regions[i, "end"]
  103. # print(paste("Process Regions from ", start, "to", end))
  104. local.ext[start:end] = Detect.Local.Extreme(x=mean.cf[start:end])
  105. }
  106. local.ext_bp = local.ext
  107. eltm = proc.time() - ptm
  108. # print(paste("Step 2 Running Time : ", eltm[3]))
  109. # print("Step 2 : Done !!")
  110. if(statFilter)
  111. {
  112. # print("#########################################################################")
  113. # print("Step 3 : Statistical Filtering of false positive TD boundaries")
  114. # print("#########################################################################")
  115. if(is.null(stat_window_size)) stat_window_size=window.size
  116. ptm = proc.time()
  117. # print("-- Matrix Scaling....")
  118. scale.matrix.data = matrix.data
  119. ## This is to normalize each off diag values -- 2018-09-20
  120. # for( i in 1:(2*window.size) )
  121. for( i in 1:(2*stat_window_size) )
  122. {
  123. #diag(scale.matrix.data[, i:n_bins] ) = scale( diag( matrix.data[, i:n_bins] ) )
  124. scale.matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] = scale( matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] )
  125. }
  126. # print("-- Compute p-values by Wilcox Ranksum Test")
  127. for( i in 1:nrow(proc.regions))
  128. {
  129. start = proc.regions[i, "start"]
  130. end = proc.regions[i, "end"]
  131. # print(paste("Process Regions from ", start, "to", end))
  132. # pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=window.size, scale=1)
  133. pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=stat_window_size, scale=1)
  134. }
  135. # print("-- Done!")
  136. # print("-- Filtering False Positives")
  137. local.ext[intersect( union(which( local.ext==-1), which(local.ext==-1)), which(pvalue<p_thresh))] = -2
  138. local.ext[which(local.ext==-1)] = 0
  139. local.ext[which(local.ext==-2)] = -1
  140. # print("-- Done!")
  141. eltm = proc.time() - ptm
  142. # print(paste("Step 3 Running Time : ", eltm[3]))
  143. # print("Step 3 : Done!")
  144. } else pvalue = 0
  145. domains = Convert.Bin.To.Domain.TMP(bins=bins,
  146. signal.idx=which(local.ext==-1),
  147. gap.idx=which(local.ext==-0.5),
  148. pvalues=pvalue,
  149. pvalue.cut=p_thresh)
  150. domains = subset(domains, size!=0) ## ADDED 06-April-2019, Yuanlong
  151. # print(head(domains))
  152. # stop('HELLO')
  153. if( sum(domains$size)!=nrow(A_extended) ) stop('In TopDom, some rows are missing')
  154. binSignal = cbind(bins,
  155. local.ext = local.ext,
  156. mean.cf = mean.cf,
  157. pvalue = pvalue)
  158. bins = binSignal
  159. if(!is.null(domain_size_min))
  160. {
  161. domains = remove_small_domains(domain_size_min=domain_size_min, binSignal=binSignal, domains=domains)
  162. }
  163. bedform = domains[, c("chr", "from.coord", "to.coord", "tag")]
  164. colnames(bedform) = c("chrom", "chromStart", "chromEnd", "name")
  165. if( !is.null(outFile) ) {
  166. # print("#########################################################################")
  167. # print("Writing Files")
  168. # print("#########################################################################")
  169. outBinSignal = paste(outFile, ".binSignal", sep="")
  170. # print(paste("binSignal File :", outBinSignal) )
  171. write.table(bins, file=outBinSignal, quote=F, row.names=F, col.names=T, sep="\t")
  172. outDomain = paste(outFile, ".domain", sep="")
  173. # print(paste("Domain File :", outDomain) )
  174. write.table( domains, file=outDomain, quote=F, row.names=F, col.names=T, sep="\t")
  175. outBed = paste(outFile, ".bed", sep="")
  176. # print(paste("Bed File : ", outBed))
  177. write.table( bedform, file=outBed, quote=F, row.names=F, col.names=F, sep="\t")
  178. }
  179. # print("Done!!")
  180. # print("Chromatin domains generated !")
  181. if(return_peak_and_binsignal==TRUE) return( list( local_ext=local.ext_bp, mean_cf=mean.cf, domains=domains ) )
  182. return(list(binSignal=bins, domain=domains, bed=bedform))
  183. }
  184. # @fn Get.Diamond.Matrix
  185. # @param mat.data : N by N matrix, where each element indicate contact frequency
  186. # @param i :integer, bin index
  187. # @param size : integer, window size to expand from bin
  188. # @retrun : matrix.
  189. Get.Diamond.Matrix <- function(mat.data, i, size)
  190. {
  191. n_bins = nrow( mat.data )
  192. if(i==n_bins) return(NA)
  193. lowerbound = max( 1, i-size+1 )
  194. upperbound = min( i+size, n_bins)
  195. return( mat.data[lowerbound:i, (i+1):upperbound] )
  196. }
  197. # @fn Which.process.region
  198. # @param rmv.idx : vector of idx, remove index vector
  199. # @param n_bins : total number of bins
  200. # @param min.size : minimum size of bins
  201. # @retrun : data.frame of proc.regions
  202. Which.process.region <- function(rmv.idx, n_bins, min.size=3)
  203. {
  204. gap.idx = rmv.idx
  205. proc.regions = data.frame(start=numeric(0), end=numeric(0))
  206. proc.set = setdiff(1:n_bins, gap.idx)
  207. n_proc.set = length(proc.set)
  208. i=1
  209. while(i < n_proc.set )
  210. {
  211. start = proc.set[i]
  212. j = i+1
  213. while(j <= n_proc.set)
  214. {
  215. if( proc.set[j] - proc.set[j-1] <= 1) j = j + 1
  216. else {
  217. proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) )
  218. i = j
  219. break
  220. }
  221. }
  222. if(j >= n_proc.set ) {
  223. proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) )
  224. break
  225. }
  226. }
  227. colnames(proc.regions) = c("start", "end")
  228. proc.regions <- proc.regions[ which( abs(proc.regions[,"end"] - proc.regions[, "start"]) >= min.size ), ]
  229. return(proc.regions)
  230. }
  231. # @fn Which.Gap.Region
  232. # @breif version 0.0.1 used
  233. # @param matrix.data : n by n matrix
  234. # @return gap index
  235. Which.Gap.Region <- function(matrix.data)
  236. {
  237. n_bins = nrow(matrix.data)
  238. gap = rep(0, n_bins)
  239. i=1
  240. while(i < n_bins)
  241. {
  242. j = i + 1
  243. while( j <= n_bins)
  244. {
  245. if( sum( matrix.data[i:j, i:j]) == 0 ) {
  246. gap[i:j] = -0.5
  247. j = j+1
  248. #if(j-i > 1) gap[i:j]=-0.5
  249. #j=j+1
  250. } else break
  251. }
  252. i = j
  253. }
  254. idx = which(gap == -0.5)
  255. return(idx)
  256. }
  257. # @fn Which.Gap.Region3
  258. # @param matrix.data : n by n matrix
  259. # @return gap index
  260. Which.Gap.Region3 <- function(mean.cf)
  261. {
  262. n_bins = length(mean.cf)
  263. gapidx = which(mean.cf==0)
  264. return(gapidx)
  265. }
  266. # @fn Which.Gap.Region2
  267. # @breif version 0.0.2 used
  268. # @param matrix.data : n by n matrix
  269. # @return gap index
  270. Which.Gap.Region2 <- function(matrix.data, w)
  271. {
  272. n_bins = nrow(matrix.data)
  273. gap = rep(0, n_bins)
  274. for(i in 1:n_bins)
  275. {
  276. if( sum( matrix.data[i, max(1, i-w):min(i+w, n_bins)] ) == 0 ) gap[i]=-0.5
  277. }
  278. idx = which(gap == -0.5)
  279. return(idx)
  280. }
  281. # @fn Detect.Local.Extreme
  282. # @param x : original signal to find local minima
  283. # @return vector of local extrme, -1 if the index is local minimum, 1 if the index is local maxima, 0 otherwise.
  284. Detect.Local.Extreme <- function(x)
  285. {
  286. n_bins = length(x)
  287. ret = rep(0, n_bins)
  288. x[is.na(x)]=0
  289. if(n_bins <= 3)
  290. {
  291. ret[which.min(x)]=-1
  292. ret[which.max(x)]=1
  293. return(ret)
  294. }
  295. # Norm##################################################3
  296. new.point = Data.Norm(x=1:n_bins, y=x)
  297. x=new.point$y
  298. ##################################################
  299. cp = Change.Point(x=1:n_bins, y=x)
  300. if( length(cp$cp) <= 2 ) return(ret)
  301. if( length(cp$cp) == n_bins) return(ret)
  302. for(i in 2:(length(cp$cp)-1))
  303. {
  304. if( x[cp$cp[i]] >= x[cp$cp[i]-1] && x[cp$cp[i]] >= x[cp$cp[i]+1] ) ret[cp$cp[i]] = 1
  305. else if(x[cp$cp[i]] < x[cp$cp[i]-1] && x[cp$cp[i]] < x[cp$cp[i]+1]) ret[cp$cp[i]] = -1
  306. min.val = min( x[ cp$cp[i-1] ], x[ cp$cp[i] ] )
  307. max.val = max( x[ cp$cp[i-1] ], x[ cp$cp[i] ] )
  308. if( min( x[cp$cp[i-1]:cp$cp[i]] ) < min.val ) ret[ cp$cp[i-1] - 1 + which.min( x[cp$cp[i-1]:cp$cp[i]] ) ] = -1
  309. if( max( x[cp$cp[i-1]:cp$cp[i]] ) > max.val ) ret[ cp$cp[i-1] - 1 + which.max( x[cp$cp[i-1]:cp$cp[i]] ) ] = 1
  310. }
  311. return(ret)
  312. }
  313. # @fn Data.Norm
  314. # @param x : x axis vector
  315. # @param x : y axis vector
  316. # @return list of normalized x and y
  317. Data.Norm <- function(x, y)
  318. {
  319. ret.x = rep(0, length(x))
  320. ret.y = rep(0, length(y))
  321. ret.x[1] = x[1]
  322. ret.y[1] = y[1]
  323. diff.x = diff(x)
  324. diff.y = diff(y)
  325. scale.x = 1 / mean( abs(diff(x) ) )
  326. scale.y = 1 / mean( abs( diff(y) ) )
  327. #print(scale.x)
  328. #print(scale.y)
  329. for(i in 2:length(x))
  330. {
  331. ret.x[i] = ret.x[i-1] + (diff.x[i-1]*scale.x)
  332. ret.y[i] = ret.y[i-1] + (diff.y[i-1]*scale.y)
  333. }
  334. return(list(x=ret.x, y=ret.y))
  335. }
  336. # @fn Change.Point
  337. # @param x : x axis vector
  338. # @param x : y axis vector
  339. # @return change point index in x vector,
  340. # Note that the first and the last point will be always change point
  341. Change.Point <- function( x, y )
  342. {
  343. if( length(x) != length(y))
  344. {
  345. print("ERROR : The length of x and y should be the same")
  346. return(0)
  347. }
  348. n_bins <- length(x)
  349. Fv <- rep(NA, n_bins)
  350. Ev <- rep(NA, n_bins)
  351. cp <- 1
  352. i=1
  353. Fv[1]=0
  354. while( i < n_bins )
  355. {
  356. j=i+1
  357. Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 )
  358. while(j<n_bins)
  359. {
  360. j=j+1
  361. k=(i+1):(j-1)
  362. Ev[j] = ( sum( abs( (y[j]-y[i] )*x[k] - (x[j] -x[i])*y[k] - (x[i]*y[j]) + (x[j]*y[i]) ) ) / sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) )
  363. Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i])^2 ) - ( sum( abs( (y[j]-y[i] )*x[k] - (x[j] -x[i])*y[k] - (x[i]*y[j]) + (x[j]*y[i]) ) ) / sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) )
  364. #################################################
  365. #Not Original Code
  366. if( is.na(Fv[j]) || is.na(Fv[j-1]) ) {
  367. j = j-1
  368. cp <- c(cp, j)
  369. break
  370. }
  371. ####################################################3
  372. if(Fv[j] < Fv[j-1] ) {
  373. j = j - 1
  374. cp <- c(cp, j )
  375. break
  376. }
  377. }
  378. i=j
  379. }
  380. cp <- c(cp, n_bins)
  381. return(list(cp=cp, objF=Fv, errF=Ev))
  382. }
  383. # @fn Get.Pvalue
  384. # @param matrix.data : matrix
  385. # @param size : size to extend
  386. # @param scale : scale parameter if necessary. deprecated parameter
  387. # @return computed p-value vector
  388. Get.Pvalue <- function( matrix.data, size, scale=1 )
  389. {
  390. n_bins = nrow(matrix.data)
  391. pvalue <- rep(1, n_bins)
  392. for( i in 1:(n_bins-1) )
  393. {
  394. dia = as.vector( Get.Diamond.Matrix2(matrix.data, i, size=size) )
  395. ups = as.vector( Get.Upstream.Triangle(matrix.data, i, size=size) )
  396. downs = as.vector( Get.Downstream.Triangle(matrix.data, i, size=size) )
  397. wil.test = wilcox.test(x=dia*scale, y=c(ups, downs), alternative="less", exact=F)
  398. pvalue[i] = wil.test$p.value
  399. #print(paste(i, "=", wil.test$p.value) )
  400. }
  401. pvalue[ is.na(pvalue) ] = 1
  402. return(pvalue)
  403. }
  404. # @fn Get.Upstream.Triangle
  405. # @param mat.data : matrix data
  406. # @param i : bin index
  407. # @param size : size of window to extend
  408. # @return upstream triangle matrix
  409. Get.Upstream.Triangle <- function(mat.data, i, size)
  410. {
  411. n_bins = nrow(mat.data)
  412. lower = max(1, i-size)
  413. tmp.mat = mat.data[lower:i, lower:i]
  414. return( tmp.mat[ upper.tri( tmp.mat, diag=F ) ] )
  415. }
  416. # @fn Get.Downstream.Triangle
  417. # @param mat.data : matrix data
  418. # @param i : bin index
  419. # @param size : size of window to extend
  420. # @return downstream triangle matrix
  421. Get.Downstream.Triangle <- function(mat.data, i, size)
  422. {
  423. n_bins = nrow(mat.data)
  424. if(i==n_bins) return(NA)
  425. upperbound = min(i+size, n_bins)
  426. tmp.mat = mat.data[(i+1):upperbound, (i+1):upperbound]
  427. return( tmp.mat[ upper.tri( tmp.mat, diag=F ) ] )
  428. }
  429. # @fn Get.Diamond.Matrix2
  430. # @param mat.data : matrix data
  431. # @param i : bin index
  432. # @param size : size of window to extend
  433. # @return diamond matrix
  434. Get.Diamond.Matrix2 <- function(mat.data, i, size)
  435. {
  436. n_bins = nrow(mat.data)
  437. new.mat = matrix(rep(NA, size*size), nrow=size, ncol=size)
  438. for(k in 1:size)
  439. {
  440. if(i-(k-1) >= 1 && i < n_bins)
  441. {
  442. lower = min(i+1, n_bins)
  443. upper = min(i+size, n_bins)
  444. new.mat[size-(k-1), 1:(upper-lower+1)] = mat.data[i-(k-1), lower:upper]
  445. }
  446. }
  447. return(new.mat)
  448. }
  449. # @fn Convert.Bin.To.Domain
  450. # @param bins : bin information
  451. # @param signal.idx : signal index
  452. # @param signal.idx : gap index
  453. # @param pvalues : pvalue vector
  454. # @param pvalue.cut : pvalue threshold
  455. # @return dataframe storing domain information
  456. Convert.Bin.To.Domain <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
  457. {
  458. n_bins = nrow(bins)
  459. ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
  460. levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")
  461. rmv.idx = setdiff(1:n_bins, gap.idx)
  462. proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
  463. from.coord = bins[proc.region[, "start"], "from.coord"]
  464. n_procs = nrow(proc.region)
  465. gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
  466. rmv.idx = union(signal.idx, gap.idx)
  467. proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
  468. n_procs = nrow(proc.region)
  469. from.coord = bins[proc.region[, "start"], "from.coord"]
  470. domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
  471. rmv.idx = setdiff(1:n_bins, signal.idx)
  472. proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
  473. n_procs = nrow(proc.region)
  474. if(n_procs>0)
  475. {
  476. from.coord = bins[proc.region[, "start"]+1, "from.coord"]
  477. boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
  478. ret = rbind(ret, boundary)
  479. }
  480. ret = rbind(gap, domain)
  481. ret = ret[order(ret[,3]), ]
  482. ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
  483. ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
  484. ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
  485. ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] ## HERE THE SIZE IS COMPUTED / Yuanlong LIU
  486. if(!is.null(pvalues) && !is.null(pvalue.cut))
  487. {
  488. for(i in 1:nrow(ret))
  489. {
  490. if(ret[i, "tag"]=="domain")
  491. {
  492. domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
  493. p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )
  494. if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
  495. }
  496. }
  497. }
  498. new.bdr.set = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
  499. stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
  500. i=1
  501. while(i <= nrow(ret))
  502. {
  503. if( ret[i, "tag"] == "boundary" )
  504. {
  505. stack.bdr = rbind(stack.bdr, ret[i, ])
  506. } else if(nrow(stack.bdr)>0) {
  507. new.bdr = data.frame(chr=bins[1, "chr"],
  508. from.id = min( stack.bdr[, "from.id"]),
  509. from.coord=min(stack.bdr[, "from.coord"]),
  510. to.id = max( stack.bdr[, "to.id"]),
  511. to.coord=max(stack.bdr[, "to.coord"]),
  512. tag="boundary",
  513. size=max(stack.bdr[, "to.coord"]) - min(stack.bdr[, "from.coord"]))
  514. new.bdr.set = rbind(new.bdr.set, new.bdr)
  515. stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
  516. }
  517. i = i + 1
  518. }
  519. ret = rbind( ret[ ret[, "tag"]!="boundary", ], new.bdr.set )
  520. ret = ret[order(ret[, "to.coord"]), ]
  521. return(ret)
  522. }
  523. # @fn Convert.Bin.To.Domain
  524. # @param bins : bin information
  525. # @param signal.idx : signal index
  526. # @param signal.idx : gap index
  527. # @param pvalues : pvalue vector
  528. # @param pvalue.cut : pvalue threshold
  529. # @return dataframe storing domain information
  530. Convert.Bin.To.Domain.TMP <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
  531. {
  532. n_bins = nrow(bins)
  533. ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
  534. levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")
  535. rmv.idx = setdiff(1:n_bins, gap.idx)
  536. proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
  537. from.coord = bins[proc.region[, "start"], "from.coord"]
  538. n_procs = nrow(proc.region)
  539. gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
  540. rmv.idx = union(signal.idx, gap.idx)
  541. proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
  542. n_procs = nrow(proc.region)
  543. from.coord = bins[proc.region[, "start"], "from.coord"]
  544. domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
  545. rmv.idx = setdiff(1:n_bins, signal.idx)
  546. proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
  547. n_procs = nrow(proc.region)
  548. if(n_procs>0)
  549. {
  550. from.coord = bins[proc.region[, "start"]+1, "from.coord"]
  551. boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
  552. ret = rbind(ret, boundary)
  553. }
  554. ret = rbind(gap, domain)
  555. ret = ret[order(ret[,3]), ]
  556. ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
  557. ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
  558. ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
  559. ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"]
  560. if(!is.null(pvalues) && !is.null(pvalue.cut))
  561. {
  562. for(i in 1:nrow(ret))
  563. {
  564. if(ret[i, "tag"]=="domain")
  565. {
  566. domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
  567. p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )
  568. if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
  569. }
  570. }
  571. }
  572. return(ret)
  573. }