HighResolution2Low.R 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. ## These functions try to aggregate bin-wise contact matrix into a domain-wise contact matrix
  2. HighResolution2Low_k_complete <-function( A, max_nbins )
  3. {
  4. nbins = nrow( A )
  5. max_nbins_bp = max_nbins
  6. max_nbinss = max_nbins + (-5:20)
  7. expand_fold_info = t(sapply(max_nbinss, fold_expand_rate, K=nbins))
  8. index = max(which(expand_fold_info[,'rescale_rate']==min(expand_fold_info[,'rescale_rate'])))
  9. max_nbins = max_nbinss[ index ]
  10. n2one_head = expand_fold_info[index, 'n2one_head']
  11. n2one_last = expand_fold_info[index, 'n2one_last']
  12. if(n2one_head==1) return(list())
  13. rescale_rate = n2one_last / n2one_head
  14. if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in rescale_rate as: ', rescale_rate, '\n')
  15. if(rescale_rate!=1) ## only do when rescale is not 1
  16. {
  17. aa_end = n2one_head*(max_nbins-1)
  18. bb_end = nrow(A)
  19. A_aa = A[1:aa_end, 1:aa_end]
  20. A_bb = A[(aa_end+1):bb_end, (aa_end+1):bb_end]
  21. A_ab = A[1:aa_end, (aa_end+1):bb_end, drop=FALSE]
  22. A_aa_low = HighResolution2Low_k(mat=A_aa, k=max_nbins-1)
  23. A_bb_low = sum( A_bb )
  24. ## https://stackoverflow.com/questions/15265512/summing-every-n-points-in-r
  25. v = apply(A_ab, 1, sum)
  26. A_ab_low = unname(tapply(v, (seq_along(v)-1) %/% n2one_head, sum))
  27. A_ab_low_new = A_ab_low / n2one_last*n2one_head
  28. A_bb_low_new = A_bb_low / n2one_last^2*n2one_head^2
  29. A_low = rbind(cbind(A_aa_low, A_ab_low_new), c( A_ab_low_new, A_bb_low_new ))
  30. colnames(A_low) = NULL
  31. }
  32. if(rescale_rate==1) A_low = HighResolution2Low_k(mat=A, k=max_nbins) ## k is the dimension of A_final_low
  33. ## A_low has no row/col name
  34. res = list(A_low=A_low, n2one_head=n2one_head, n2one_last=n2one_last)
  35. return(res)
  36. }
  37. split_info <-function( K, max_nbins )
  38. {
  39. x = floor( K/max_nbins ) + 1
  40. n2 = max_nbins*x - K
  41. n1 = max_nbins - n2
  42. vec = c(rep(x, n1), rep(x-1, n2))
  43. split_vec = split(1:K, rep(1:max_nbins, vec))
  44. expand_fold_ratio = min(n1/n2, n2/n1)
  45. res = list(vec=vec, expand_fold_ratio=expand_fold_ratio, split_vec=split_vec)
  46. return(res)
  47. }
  48. ## diagonal values are summed twice
  49. HighResolution2Low <-function( A, rescale, which=2 )
  50. {
  51. nbins = nrow(A)
  52. nbins_low = floor(nbins/rescale)
  53. A_low = matrix( , nbins_low, nbins_low)
  54. keep_rows = nbins - nbins%%nbins_low
  55. A_truncated = A[ 1:keep_rows, 1:keep_rows ]
  56. if(which==1) A_low = matsplitter_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
  57. if(which==2) A_low = mat_split_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
  58. return( A_low )
  59. }
  60. ## https://stackoverflow.com/questions/24299171/function-to-split-a-matrix-into-sub-matrices-in-r
  61. ## Function to split a matrix into sub-matrices in R
  62. matsplitter_sum <- function(M, r, c)
  63. {
  64. rg <- (row(M)-1)%/%r + 1
  65. cg <- (col(M)-1)%/%c + 1
  66. rci <- (rg-1)*max(cg) + cg
  67. N <- prod(dim(M))/r/c
  68. cv <- unlist(lapply(1:N, function(x) M[rci==x]))
  69. dim(cv)<-c(r, c, N)
  70. res = matrix(apply(cv, 3, sum), nrow(M)/r, byrow=TRUE)
  71. return(res)
  72. }
  73. mat_split_sum <- function(M, r, c)
  74. {
  75. nr <- ceiling(nrow(M)/r)
  76. nc <- ceiling(ncol(M)/c)
  77. newM <- matrix(NA, nr*r, nc*c)
  78. newM[1:nrow(M), 1:ncol(M)] <- M
  79. div_k <- kronecker(matrix(seq_len(nr*nc), nr, byrow = TRUE), matrix(1, r, c))
  80. matlist <- split(newM, div_k)
  81. res = matrix(sapply(matlist, sum), nrow(M)/r, byrow=TRUE)
  82. return(res)
  83. }
  84. ## this is much faster
  85. HighResolution2Low_k <- function(mat, k)
  86. {
  87. chunk2 <- function(x, n) split(x, cut(seq_along(x), n, labels = FALSE))
  88. n = nrow(mat)
  89. A_low = matrix( , k, k )
  90. indices = chunk2( 1:n, k )
  91. for(row_index in 1:k)
  92. {
  93. A_sub = mat[indices[[row_index]], ]
  94. for(col_index in 1:k) A_low[row_index, col_index] = sum( A_sub[ , indices[[col_index]]] )
  95. }
  96. return( A_low )
  97. }
  98. HighResolution2Low_k_rectangle <- function(mat, row_split, col_split, sum_or_mean=c('sum', 'mean', 'median', 'p_above_one'))
  99. {
  100. # if(remove_zero==TRUE) mat[mat==0] = NA
  101. k_row = length(row_split)
  102. k_col = length(col_split)
  103. A_low = matrix( , k_row, k_col )
  104. if(sum_or_mean=='sum')
  105. {
  106. for(row_index in 1:k_row)
  107. {
  108. A_sub = mat[row_split[[row_index]], , drop=FALSE ]
  109. for(col_index in 1:k_col) A_low[row_index, col_index] = sum( A_sub[ , col_split[[col_index]]] )
  110. }
  111. }
  112. if(sum_or_mean=='mean')
  113. {
  114. for(row_index in 1:k_row)
  115. {
  116. A_sub = mat[row_split[[row_index]], , drop=FALSE ]
  117. for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
  118. }
  119. }
  120. if(sum_or_mean=='median')
  121. {
  122. for(row_index in 1:k_row)
  123. {
  124. A_sub = mat[row_split[[row_index]], , drop=FALSE ]
  125. for(col_index in 1:k_col) A_low[row_index, col_index] = median( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
  126. }
  127. }
  128. if(sum_or_mean=='p_above_one')
  129. {
  130. for(row_index in 1:k_row)
  131. {
  132. A_sub = mat[row_split[[row_index]], , drop=FALSE ]
  133. for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]] > 1 )
  134. }
  135. }
  136. # if(remove_zero==TRUE) A_low[is.na(A_low)] = 0
  137. return( A_low )
  138. }
  139. ## for a n x n matrix, this function generate a nxm matrix, with m*compress_size = n
  140. ## this function is used for generating the nxm matrix, where the i,jth value is the contact value
  141. ## 08-08-2018
  142. compress_mat_fast = function(input_mat, compress_size)
  143. {
  144. mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
  145. n = ncol(input_mat)
  146. mat2prod = mat_block(n, compress_size)
  147. # return(t(input_mat%*%mat2prod / compress_size))
  148. return(t(matrix_multiplication_cpp(input_mat, mat2prod)) / compress_size)
  149. }