123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201 |
-
- compress_mat_fast_tmp = function(input_mat, compress_size)
- {
- mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
- n = ncol(input_mat)
- mat2prod = mat_block(n, compress_size)
-
- return(t(matrix_multiplication_cpp(input_mat, mat2prod)))
- }
- mat_10to40kb = function(mat2compress, bin_size2look, bin_size_input)
- {
- compress_size = bin_size2look / bin_size_input
- len = nrow(mat2compress) - nrow(mat2compress)%%compress_size
- mat2compress = mat2compress[1:len, 1:len]
- c_mat_compressed = compress_mat_fast_tmp( as.matrix(mat2compress), compress_size=compress_size )
- mat_compressed = compress_mat_fast_tmp( c_mat_compressed, compress_size=compress_size )
- rownames(mat_compressed) = colnames(mat_compressed) = as.character( 1:nrow(mat_compressed) )
- return(mat_compressed)
- }
- contact_mat_processing_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr_num, bin_size_input, bin_size2look, black_list_bins=NULL)
- {
- compress_size = ifelse(bin_size2look < 40E3, 1, 1)
- zero_ratio = 0.01
- if(!is.null(contact_tab_dump)) contact_mat_raw = contact_tab_dump
- if(!is.null(contact_file_dump)) contact_mat_raw = data.table::fread(contact_file_dump)
- if(!is.null(contact_file_hic))
- {
- bin_sizes_in_hic = strawr::readHicBpResolutions(contact_file_hic)
- chrs_in_hic = as.vector(strawr::readHicChroms(contact_file_hic)[[1]])
- chr2query = chrs_in_hic[match(toupper(chr_num), gsub('chr', '', toupper(chrs_in_hic), ignore.case=TRUE))]
- if(!(bin_size_input %in% bin_sizes_in_hic)) stop(sprintf('Your provided hic file only contains resolutions of: %s', paste0(bin_sizes_in_hic, collapse=' ')))
- if(!(chr2query %in% chrs_in_hic)) stop(sprintf('Your provided hic file only contains chrs of: %s', paste0(chrs_in_hic, collapse=' ')))
-
- contact_mat_raw = try(strawr::straw("KR", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
- if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) contact_mat_raw = try(strawr::dump("VC_SQRT", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
- if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) contact_mat_raw = try(strawr::dump("VC", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
- if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) stop(sprintf('Your provided hic file does not contain information given the bin_size=%s and any of the normalization method KR/VC/VC_SQRT', bin_size_input))
- contact_mat_raw = data.table::as.data.table(contact_mat_raw)
- }
- colnames(contact_mat_raw) = c('pos_1', 'pos_2', 'val')
- contact_mat = subset(contact_mat_raw, !is.na(val))
- contact_mat[,1] = contact_mat[,1]/bin_size_input
- contact_mat[,2] = contact_mat[,2]/bin_size_input
- if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')
- n_bins = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1
- mat_sparse = Matrix::Matrix(0, nrow=n_bins, ncol=n_bins)
- mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]
- rownames(mat_sparse) = colnames(mat_sparse) = as.character( 1:nrow(mat_sparse) )
-
-
- if(length(black_list_bins) > 0)
- {
- black_list_bins = intersect(as.character(black_list_bins), rownames(mat_sparse))
- if(length(black_list_bins) > 0) mat_sparse[black_list_bins, ] = mat_sparse[, black_list_bins] = 0
- }
-
- mat_sparse = Matrix::forceSymmetric(mat_sparse, uplo='U')
- if(bin_size2look!=bin_size_input) mat_sparse = mat_10to40kb( mat_sparse, bin_size2look, bin_size_input )
- mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=zero_ratio)
- if(nrow(mat_dense) < 100) mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=0)
- while(min(apply(mat_dense, 1, sd))==0)
- {
- mat_dense = remove_blank_cols(mat_dense, sparse=TRUE, ratio=1E-7)
- if(nrow(mat_dense) < 1) stop('Error in generating mat_dense at the data generating step')
- }
-
- nrow2keep = nrow(mat_dense) - nrow(mat_dense)%%compress_size
- mat_dense_2_compress = mat_dense[, 1:nrow2keep]
- bin_names = rownames(mat_dense)
- mat_dense_compressed = compress_mat_fast( as.matrix(mat_dense_2_compress), compress_size=compress_size )
- colnames(mat_dense_compressed) = bin_names
- rm(mat_dense_2_compress); gc()
-
-
-
-
-
- mat_dense_compressed = as.matrix(mat_dense_compressed)
- mat_dense_compressed_log = log2(mat_dense_compressed + 1)
-
-
-
- cmat_dense_compressed_log = fast_cor(mat_dense_compressed_log)
- ccmat_dense_compressed_log = fast_cor(cmat_dense_compressed_log)
-
-
-
- ccmat_dense_compressed_log_atanh = atanh(ccmat_dense_compressed_log / (1+1E-7))
-
- gc()
-
-
- out = list(mat_dense=mat_dense, atanh_score=ccmat_dense_compressed_log_atanh)
- return(out)
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|