123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374 |
-
-
-
- make_comp_tab = function(comp_raw, bin_size)
- {
-
-
- bin_indices_li = apply(comp_raw[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]})
- expand_len = sapply(bin_indices_li, length)
- n_domains = length(unique(comp_raw[[4]]))
- continous_rank = (n_domains + 1 - rank(unique(comp_raw[[4]]))) / n_domains
- names(continous_rank) = unique(comp_raw[[4]])
- comp_tab = data.table::data.table(chr=rep(comp_raw[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw[[4]], expand_len), comp_rank=rep(comp_raw[[5]], expand_len))
- comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name])
- comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1
- comp_tab$pos_end = (comp_tab$bin_index)*bin_size
- return(comp_tab)
- }
- make_comp_tab_calder = function(comp_raw, bin_size)
- {
- `%dopar%` <- foreach::`%dopar%`
- `%do%` <- foreach::`%do%`
- colnames(comp_raw)[1] = 'chr'
-
-
- make_comp_tab_calder_chr = function(comp_raw_chr, bin_size)
- {
- bin_indices_li = apply(comp_raw_chr[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]})
- expand_len = sapply(bin_indices_li, length)
- n_domains = length(unique(comp_raw_chr[[4]]))
- continous_rank = (n_domains + 1 - rank(unique(comp_raw_chr[[4]]))) / n_domains
- names(continous_rank) = unique(comp_raw_chr[[4]])
- comp_tab = data.table::data.table(chr=rep(comp_raw_chr[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw_chr[[4]], expand_len), comp_rank=rep(comp_raw_chr[[5]], expand_len))
- comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name])
- comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1
- comp_tab$pos_end = (comp_tab$bin_index)*bin_size
- return(comp_tab)
- }
- comp_tab_ALL = foreach::foreach(comp_raw_chr=split(comp_raw, by="chr"), .combine=rbind) %do%
- {
- make_comp_tab_calder_chr(comp_raw_chr, bin_size)
- }
- return(comp_tab_ALL)
- }
- generate_domain_2D_bedpe = function(sub_compartment_file, bin_size, bedpe_file, n_sub=8)
- {
- generate_domain_2D_bedpe_chr = function(comp_raw)
- {
- comp_tab = c(comp_raw, bin_size=10E3)
- comp_tab$comp_name = substr(comp_tab$comp_name, 1, log2(n_sub)*2 - 1)
- n_row = nrow(comp_tab)
- comp_tab$is_boudary = c(1*(comp_tab$comp_name[2:n_row] != comp_tab$comp_name[1:(n_row-1)]), 0)
- pos_end = union( comp_tab[is_boudary==1]$pos_end, comp_tab[n_row, ]$pos_end )
- pos_start = union( comp_tab[1, ]$pos_start, pos_end[1:(length(pos_end) -1)] + 1 )
- pos_start = as.character(pos_start)
- pos_end = as.character(pos_end)
- chr_name = comp_tab$chr[1]
- bedpe = data.table::data.table(chr_name, pos_start, pos_end, chr_name, pos_start, pos_end)
- return(bedpe)
- }
- my_skip = as.numeric(strsplit(system(intern=TRUE, sprintf('zgrep -n "chr" %s', sub_compartment_file)), ':')[[1]][1])
- comp_raw_li = split(data.table::fread(sub_compartment_file, skip=my_skip), by="chr")
- bedpe = do.call(rbind, lapply(comp_raw_li, generate_domain_2D_bedpe_chr))
- data.table::fwrite(bedpe, file=bedpe_file, col.names=FALSE, sep="\t")
- return(bedpe)
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- CALDER_CD_hierarchy_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr, bin_size_input, bin_size2look, save_dir, save_intermediate_data=FALSE, swap_AB, ref_compartment_file, black_list_bins=NULL, annotation_track=NULL)
- {
- chr_num = gsub('chr', '', chr, ignore.case=TRUE)
- chr_name = paste0('chr', chr_num)
- get_cor_with_ref = function(chr_bin_pc, bin_size, ref_compartment_file=NULL, annotation_track=NULL)
- {
- ext_chr_bin_pc = function(chr_bin_pc)
- {
- bin_indices_li = unlist(apply(chr_bin_pc[, 4:5], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=5E3)/5E3; list(bins[1:(length(bins) - 1)])}), recursive=FALSE)
- expand_len = sapply(bin_indices_li, length)
- chr_bin_pc_ext = data.table::data.table(chr=rep(chr_bin_pc[[1]], expand_len), bin_index=unlist(bin_indices_li),PC1_val=rep(chr_bin_pc$PC1_val, expand_len))
- chr_bin_pc_ext$pos_start = (chr_bin_pc_ext$bin_index - 1)*5E3 + 1
- chr_bin_pc_ext$pos_end = (chr_bin_pc_ext$bin_index)*5E3
- return(chr_bin_pc_ext)
- }
-
-
- get_binned_vals = function(annotation_track_chr, bin_size=5E3)
- {
-
- binnedMean <- function(bins, numvar, mcolname)
- {
- stopifnot(is(bins, "GRanges"))
- stopifnot(is(numvar, "RleList"))
- stopifnot(identical(seqlevels(bins), names(numvar)))
- bins_per_chrom <- split(ranges(bins), GenomicRanges::seqnames(bins))
- sums_list <- lapply(names(numvar),
- function(seqname) {
- views <- Views(numvar[[seqname]],
- bins_per_chrom[[seqname]])
- viewMeans(views)
- })
- new_mcol <- unsplit(sums_list, as.factor(GenomicRanges::seqnames(bins)))
- mcols(bins)[[mcolname]] <- new_mcol
- bins
- }
- GR = GenomicRanges::makeGRangesFromDataFrame(annotation_track_chr, keep.extra.columns=TRUE)
- GR_chrs = split(GR, GenomicRanges::seqnames(GR))
- seq_lens = sapply(GR_chrs, function(v) max(end(v)))
- GR_RleList = GenomicRanges::coverage(GR, weight="score")
- seq_info = GenomicRanges::seqinfo(GR_RleList)
- GenomeInfoDb::seqlengths(seq_info) = seq_lens
- bins = GenomicRanges::tileGenome(seq_info, tilewidth=bin_size, cut.last.tile.in.chrom=TRUE)
- bins = bins[width(bins)==bin_size]
- bin_val_tmp = binnedMean(bins, GR_RleList, "bin_val")
- bin_val_tmp = data.table::as.data.table(bin_val_tmp)
- bin_val = data.table::data.table(chr=bin_val_tmp$seqnames, bin_index=bin_val_tmp$end / bin_size, continous_rank=log2(1 + bin_val_tmp$bin_val - min(bin_val_tmp$bin_val)))
- return(bin_val)
- }
- chr2query = chr_bin_pc$chr[1]
- if(!is.null(ref_compartment_file))
- {
- domain_rank = data.table::fread(ref_compartment_file)
- colnames(domain_rank)[1] = 'chr'
- domain_rank_chr = domain_rank[domain_rank[[1]]==chr2query, ]
- ref_tab = make_comp_tab_calder(domain_rank_chr, bin_size=5E3)
- ref_tab = ref_tab[, c(1,2,5)]
- }
- if(!is.null(annotation_track))
- {
- colnames(annotation_track) = c('chr', 'start', 'end', 'score')
- annotation_track$chr = gsub('chr', '', annotation_track$chr)
- annotation_track_chr = annotation_track[annotation_track$chr==chr_num, ]
- ref_tab = get_binned_vals(annotation_track_chr)
- }
- chr_name = gsub("chrchr", "chr", paste0("chr", ref_tab$chr))
-
- ref_tab$chr = chr_name
- colnames(ref_tab)[2] = "bin_index"
-
- chr_bin_pc_ext = ext_chr_bin_pc(chr_bin_pc)
- ref_and_pc = merge(ref_tab, chr_bin_pc_ext, by=c("chr", "bin_index"))
- cor_with_ref = cor(method='spearman', ref_and_pc[, c("continous_rank", "PC1_val")])[1,2]
- return(cor_with_ref)
- }
-
-
-
- time0 = Sys.time()
- log_file = paste0(save_dir, '/chr', chr_num, '_log.txt')
- warning_file = paste0(save_dir, '/WARNING_chr', chr_num, '.txt')
- cor_log_file = paste0(save_dir, '/cor_with_ref.txt')
- cat('\n')
- cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
- cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
- processed_data = contact_mat_processing_v2(contact_tab_dump, contact_file_dump=contact_file_dump, contact_file_hic=contact_file_hic, chr=chr_num, bin_size_input=bin_size_input, bin_size2look=bin_size2look, black_list_bins=black_list_bins)
-
- mat_dense = processed_data$mat_dense
- ccmat_dense_compressed_log_atanh = processed_data$atanh_score
- cat('\r', '>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()))
- cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
- p_thresh = ifelse(bin_size2look < 40000, 0.05, 1)
- window.sizes = 3
- compartments = vector("list", 2)
- chr_name = paste0("chr", chr)
- cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
- cat('\r', '>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()))
- compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size2look, window.sizes = window.sizes, ccmat_dense_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL)
- topDom_output = compartments[[2]]
- bin_names = rownames(mat_dense)
- mat_dense = as.matrix(mat_dense)
- initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2])
- if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) {
- stop(CELL_LINE, " initial_clusters error in topDom")
- }
- n_clusters = length(initial_clusters)
- mat_dense_cluster_mean = HighResolution2Low_k_rectangle(mat_dense, initial_clusters, initial_clusters, sum_or_mean = "mean")
-
- trend_mean_list = lapply( 1:4, function(v) 1*(mat_dense_cluster_mean[, -(1:v)] > mat_dense_cluster_mean[, - n_clusters - 1 + (v:1)]) )
- trend_mean = do.call(cbind, trend_mean_list)
- c_trend_mean = cor(t(trend_mean))
- atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7))
-
- {
- trend_mean = scale(trend_mean)
- c_trend_mean = scale(c_trend_mean)
- atanh_c_trend_mean= scale(atanh_c_trend_mean)
- }
- PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10)
- PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5
- rownames(PC_12_atanh) = 1:nrow(PC_12_atanh)
-
- PC_direction = 1
-
- {
- initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v]))
- chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length)))
- chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size2look + 1
- chr_bin_pc$end = chr_bin_pc$bin*bin_size2look
-
-
-
-
-
-
-
-
-
-
-
-
-
- if(!is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, ref_compartment_file=ref_compartment_file))
- if(is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, annotation_track=annotation_track))
- if(class(cor_with_ref)=='try-error') cor_with_ref = 1
- if(!is.null(ref_compartment_file)) cat('\r', ">>>> Correlation between PC1 and reference compartment is :", format(abs(cor_with_ref), digits=5), '\n')
- if(is.null(ref_compartment_file)) cat('\r', ">>>> Correlation between PC1 and annotation_track is :", format(abs(cor_with_ref), digits=5), '\n')
- PC_direction = PC_direction*sign(cor_with_ref)
- if(swap_AB==1) PC_direction = -PC_direction
- PC_12_atanh = PC_12_atanh*PC_direction
- }
- project_info = project_to_major_axis(PC_12_atanh)
- x_pro = project_info$x_pro
-
-
- hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE])
- hc_hybrid_PC12 = hc_disect_kmeans_PC12
- {
- reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean)
- hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names)
- hc_hybrid_x_pro = hc_disect_kmeans_PC12
- reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro)
- CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro)
- }
-
- parameters = list(bin_size = bin_size2look, p_thresh = p_thresh)
- res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters)
- intermediate_data_file = paste0(save_dir, '/chr', chr, '_intermediate_data.Rds')
-
- hc = res$CALDER_hc
- hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
- bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
- rownames(bin_comp) = NULL
- res$comp = bin_comp
- res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v])
- res$mat = mat_dense
- res$chr = chr
- generate_hierachy_bed(chr=chr, res=res, save_dir=save_dir, bin_size=bin_size2look)
- cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
- cat('\r', '>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()))
- if(!is.null(ref_compartment_file)) cat('Correlation between PC1 and reference compartment domain rank on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)
- if(is.null(ref_compartment_file)) cat('Correlation between PC1 and annotation_track on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)
-
-
- time1 = Sys.time()
-
- delta_time <- time1 - time0
- timediff <- format(round(delta_time, 2), nsmall = 2)
- cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE)
-
-
- intermediate_data = res
- if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file)
-
- return(intermediate_data_file)
- }
|