compartment_data_generation_fun.R 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. compress_mat_fast_tmp = function(input_mat, compress_size)
  2. {
  3. mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
  4. n = ncol(input_mat)
  5. mat2prod = mat_block(n, compress_size)
  6. # return(t(input_mat%*%mat2prod / compress_size))
  7. return(t(matrix_multiplication_cpp(input_mat, mat2prod)))
  8. }
  9. mat_10to40kb = function(mat2compress, bin_size2look, bin_size_input) ## small bin size to bigger bin_size
  10. {
  11. compress_size = bin_size2look / bin_size_input
  12. len = nrow(mat2compress) - nrow(mat2compress)%%compress_size
  13. mat2compress = mat2compress[1:len, 1:len]
  14. c_mat_compressed = compress_mat_fast_tmp( as.matrix(mat2compress), compress_size=compress_size )
  15. mat_compressed = compress_mat_fast_tmp( c_mat_compressed, compress_size=compress_size )
  16. rownames(mat_compressed) = colnames(mat_compressed) = as.character( 1:nrow(mat_compressed) )
  17. return(mat_compressed)
  18. }
  19. contact_mat_processing_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr_num, bin_size_input, bin_size2look, black_list_bins=NULL)
  20. {
  21. compress_size = ifelse(bin_size2look < 40E3, 1, 1)
  22. zero_ratio = 0.01
  23. if(!is.null(contact_tab_dump)) contact_mat_raw = contact_tab_dump ## if contact matrix in data.frame or data.table of strawr format is provided
  24. if(!is.null(contact_file_dump)) contact_mat_raw = data.table::fread(contact_file_dump) ## if contact matrix in strawr format is provided
  25. if(!is.null(contact_file_hic)) ## if contact matrix in hic format is provided
  26. {
  27. bin_sizes_in_hic = strawr::readHicBpResolutions(contact_file_hic)
  28. chrs_in_hic = as.vector(strawr::readHicChroms(contact_file_hic)[["name"]])
  29. chr2query = chrs_in_hic[match(toupper(chr_num), gsub('chr', '', toupper(chrs_in_hic), ignore.case=TRUE))]
  30. if(!(bin_size_input %in% bin_sizes_in_hic)) stop(sprintf('Your provided hic file only contains resolutions of: %s', paste0(bin_sizes_in_hic, collapse=' ')))
  31. if(!(chr2query %in% chrs_in_hic)) stop(sprintf('Your provided hic file only contains chrs of: %s', paste0(chrs_in_hic, collapse=' ')))
  32. ## try different normalization to get available dataset
  33. contact_mat_raw = try(strawr::straw("KR", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
  34. if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) contact_mat_raw = try(strawr::straw("VC_SQRT", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
  35. if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) contact_mat_raw = try(strawr::straw("VC", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
  36. if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) stop(sprintf('Your provided hic file does not contain information given the bin_size=%s and any of the normalization method KR/VC/VC_SQRT', bin_size_input))
  37. contact_mat_raw = data.table::as.data.table(contact_mat_raw)
  38. }
  39. colnames(contact_mat_raw) = c('pos_1', 'pos_2', 'val')
  40. contact_mat = subset(contact_mat_raw, !is.na(val))
  41. contact_mat[,1] = contact_mat[,1]/bin_size_input
  42. contact_mat[,2] = contact_mat[,2]/bin_size_input
  43. if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')
  44. n_bins = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
  45. mat_sparse = Matrix::Matrix(0, nrow=n_bins, ncol=n_bins)
  46. mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]
  47. rownames(mat_sparse) = colnames(mat_sparse) = as.character( 1:nrow(mat_sparse) )
  48. ########################################################## remove black listed regions
  49. if(length(black_list_bins) > 0)
  50. {
  51. black_list_bins = intersect(as.character(black_list_bins), rownames(mat_sparse))
  52. if(length(black_list_bins) > 0) mat_sparse[black_list_bins, ] = mat_sparse[, black_list_bins] = 0
  53. }
  54. ########################################################## remove bins having too sparse contacts
  55. mat_sparse = Matrix::forceSymmetric(mat_sparse, uplo='U')
  56. if(bin_size2look!=bin_size_input) mat_sparse = mat_10to40kb( mat_sparse, bin_size2look, bin_size_input )
  57. mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
  58. if(nrow(mat_dense) < 100) mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=0) ## when all are dense
  59. while(min(apply(mat_dense, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
  60. {
  61. mat_dense = remove_blank_cols(mat_dense, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
  62. if(nrow(mat_dense) < 1) stop('Error in generating mat_dense at the data generating step')
  63. }
  64. ##########################################################
  65. nrow2keep = nrow(mat_dense) - nrow(mat_dense)%%compress_size
  66. mat_dense_2_compress = mat_dense[, 1:nrow2keep]
  67. bin_names = rownames(mat_dense)
  68. mat_dense_compressed = compress_mat_fast( as.matrix(mat_dense_2_compress), compress_size=compress_size )
  69. colnames(mat_dense_compressed) = bin_names
  70. rm(mat_dense_2_compress); gc()
  71. # range(mat_dense_compressed)
  72. # # sum(mat_dense_compressed > 1000)
  73. # # mat_dense_compressed[mat_dense_compressed > 1000] = 1000
  74. # mat_dense_compressed_sparse = mat_dense_compressed
  75. mat_dense_compressed = as.matrix(mat_dense_compressed)
  76. mat_dense_compressed_log = log2(mat_dense_compressed + 1)
  77. # #########################################################
  78. # cat('compute correlation matrix ... ')
  79. cmat_dense_compressed_log = fast_cor(mat_dense_compressed_log)
  80. ccmat_dense_compressed_log = fast_cor(cmat_dense_compressed_log)
  81. # cat('compute correlation matrix done ... ')
  82. # #########################################################
  83. # # ccmat_dense_compressed_atanh = atanh(ccmat_dense_compressed - 1E-7)
  84. ccmat_dense_compressed_log_atanh = atanh(ccmat_dense_compressed_log / (1+1E-7))
  85. # rm(mat_dense_compressed, mat_dense_compressed_sparse, cmat_dense_compressed_log)
  86. gc()
  87. # #########################################################
  88. # cat('ready to compute compartment domains\n')
  89. out = list(mat_dense=mat_dense, atanh_score=ccmat_dense_compressed_log_atanh)
  90. return(out)
  91. }
  92. # if(!file.exists(contact_mat_file)) contact_mat_file = paste0('/mnt/ndata/Yuanlong/2.Results/1.Juicer/', CELL_LINE, '/contact_mat/mat_chr', chr, '_', bin_size_initial_kb, 'kb_ob.txt.gz')
  93. ## bin_size_initial is the binsize of your input matrix, can be different from the bin_size of your planned analysis
  94. # contact_mat_processing = function(contact_mat_file, bin_size, bin_size_initial=bin_size)
  95. # {
  96. # compress_size = ifelse(bin_size < 40E3, 1, 1)
  97. # zero_ratio = 0.01
  98. # combined_xk_oe_raw = data.table::fread(contact_mat_file)
  99. # ## this code generates the compartment domains
  100. # combined_xk_oe_raw = subset(combined_xk_oe_raw, !is.na(V3))
  101. # combined_xk_oe_raw[,1] = combined_xk_oe_raw[,1]/bin_size_initial
  102. # combined_xk_oe_raw[,2] = combined_xk_oe_raw[,2]/bin_size_initial
  103. # combined_xk_oe = combined_xk_oe_raw
  104. # colnames(combined_xk_oe) = c('pos_1', 'pos_2', 'val')
  105. # if(!all(combined_xk_oe[[2]] >= combined_xk_oe[[1]])) stop('\nYou provided matrix does not represent an upper triangular matrix!\n\n')
  106. # oe_size = max(max(combined_xk_oe[[1]]), max(combined_xk_oe[[2]])) + 1 ## should +1 because combined_xk_oe index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
  107. # mat_oe_sparse = Matrix::Matrix(0, nrow=oe_size, ncol=oe_size)
  108. # mat_oe_sparse[cbind(combined_xk_oe[[1]]+1, combined_xk_oe[[2]]+1)] <- combined_xk_oe[[3]]
  109. # rownames(mat_oe_sparse) = colnames(mat_oe_sparse) = as.character( 1:nrow(mat_oe_sparse) )
  110. # mat_oe_sparse = Matrix::forceSymmetric(mat_oe_sparse, uplo='U')
  111. # if(bin_size!=bin_size_initial) mat_oe_sparse = mat_10to40kb( mat_oe_sparse, bin_size, bin_size_initial )
  112. # A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
  113. # if(nrow(A_oe) < 100) A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=0) ## when all are dense
  114. # while(min(apply(A_oe, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
  115. # {
  116. # A_oe = remove_blank_cols(A_oe, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
  117. # if(nrow(A_oe) < 1) stop('ERROR IN GENERATING MEANINGFUL A_oe at the data generating step')
  118. # }
  119. # ##########################################################
  120. # len = nrow(A_oe) - nrow(A_oe)%%compress_size
  121. # A_oe_2_compress = A_oe[, 1:len]
  122. # bin_names = rownames(A_oe)
  123. # A_oe_compressed = compress_mat_fast( as.matrix(A_oe_2_compress), compress_size=compress_size )
  124. # colnames(A_oe_compressed) = bin_names
  125. # rm(A_oe_2_compress); gc()
  126. # range(A_oe_compressed)
  127. # # # sum(A_oe_compressed > 1000)
  128. # # # A_oe_compressed[A_oe_compressed > 1000] = 1000
  129. # A_oe_compressed_sparse = A_oe_compressed
  130. # A_oe_compressed = as.matrix(A_oe_compressed)
  131. # A_oe_compressed_log = log2(A_oe_compressed + 1)
  132. # # #########################################################
  133. # # cat('compute correlation matrix ... ')
  134. # cA_oe_compressed_log = fast_cor(A_oe_compressed_log)
  135. # ccA_oe_compressed_log = fast_cor(cA_oe_compressed_log)
  136. # # cat('compute correlation matrix done ... ')
  137. # # #########################################################
  138. # # # ccA_oe_compressed_atanh = atanh(ccA_oe_compressed - 1E-7)
  139. # ccA_oe_compressed_log_atanh = atanh(ccA_oe_compressed_log / (1+1E-7))
  140. # # rm(A_oe_compressed, A_oe_compressed_sparse, cA_oe_compressed_log)
  141. # gc()
  142. # # #########################################################
  143. # # cat('ready to compute compartment domains\n')
  144. # out = list(A_oe=A_oe, atanh_score=ccA_oe_compressed_log_atanh)
  145. # return(out)
  146. # }