CALDER_hierarchy_v2.R 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. ## Compute sub-compartments, V2. This version uses already computed compartments as reference to derive the A/B compartment direction
  2. ## Yuanlong LIU, 16-07-2021
  3. ## This code converts the compartment into table format
  4. make_comp_tab = function(comp_raw, bin_size)
  5. {
  6. # comp_raw[V1=="chrX"]$V1 = "chr23"
  7. # comp_raw[[1]] = gsub('chr', '', comp_raw[[1]])
  8. bin_indices_li = apply(comp_raw[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]})
  9. expand_len = sapply(bin_indices_li, length)
  10. n_domains = length(unique(comp_raw[[4]]))
  11. continous_rank = (n_domains + 1 - rank(unique(comp_raw[[4]]))) / n_domains
  12. names(continous_rank) = unique(comp_raw[[4]])
  13. comp_tab = data.table::data.table(chr=rep(comp_raw[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw[[4]], expand_len), comp_rank=rep(comp_raw[[5]], expand_len))
  14. comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name])
  15. comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1
  16. comp_tab$pos_end = (comp_tab$bin_index)*bin_size
  17. return(comp_tab)
  18. }
  19. make_comp_tab_calder = function(comp_raw, bin_size)
  20. {
  21. `%dopar%` <- foreach::`%dopar%`
  22. `%do%` <- foreach::`%do%`
  23. colnames(comp_raw)[1] = 'chr'
  24. # comp_raw[V1=="chrX"]$V1 = "chr23"
  25. # comp_raw[[1]] = as.numeric(gsub('chr', '', comp_raw[[1]]))
  26. make_comp_tab_calder_chr = function(comp_raw_chr, bin_size)
  27. {
  28. bin_indices_li = apply(comp_raw_chr[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]})
  29. expand_len = sapply(bin_indices_li, length)
  30. n_domains = length(unique(comp_raw_chr[[4]]))
  31. continous_rank = (n_domains + 1 - rank(unique(comp_raw_chr[[4]]))) / n_domains
  32. names(continous_rank) = unique(comp_raw_chr[[4]])
  33. comp_tab = data.table::data.table(chr=rep(comp_raw_chr[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw_chr[[4]], expand_len), comp_rank=rep(comp_raw_chr[[5]], expand_len))
  34. comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name])
  35. comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1
  36. comp_tab$pos_end = (comp_tab$bin_index)*bin_size
  37. return(comp_tab)
  38. }
  39. comp_tab_ALL = foreach::foreach(comp_raw_chr=split(comp_raw, by="chr"), .combine=rbind) %do%
  40. {
  41. make_comp_tab_calder_chr(comp_raw_chr, bin_size)
  42. }
  43. return(comp_tab_ALL)
  44. }
  45. generate_domain_2D_bedpe = function(sub_compartment_file, bin_size, bedpe_file, n_sub=8)
  46. {
  47. generate_domain_2D_bedpe_chr = function(comp_raw)
  48. {
  49. comp_tab = c(comp_raw, bin_size=10E3)
  50. comp_tab$comp_name = substr(comp_tab$comp_name, 1, log2(n_sub)*2 - 1)
  51. n_row = nrow(comp_tab)
  52. comp_tab$is_boudary = c(1*(comp_tab$comp_name[2:n_row] != comp_tab$comp_name[1:(n_row-1)]), 0)
  53. pos_end = union( comp_tab[is_boudary==1]$pos_end, comp_tab[n_row, ]$pos_end )
  54. pos_start = union( comp_tab[1, ]$pos_start, pos_end[1:(length(pos_end) -1)] + 1 )
  55. pos_start = as.character(pos_start)
  56. pos_end = as.character(pos_end)
  57. chr_name = comp_tab$chr[1]
  58. bedpe = data.table::data.table(chr_name, pos_start, pos_end, chr_name, pos_start, pos_end)
  59. return(bedpe)
  60. }
  61. my_skip = as.numeric(strsplit(system(intern=TRUE, sprintf('zgrep -n "chr" %s', sub_compartment_file)), ':')[[1]][1])
  62. comp_raw_li = split(data.table::fread(sub_compartment_file, skip=my_skip), by="chr")
  63. bedpe = do.call(rbind, lapply(comp_raw_li, generate_domain_2D_bedpe_chr))
  64. data.table::fwrite(bedpe, file=bedpe_file, col.names=FALSE, sep="\t")
  65. return(bedpe)
  66. }
  67. ## cool format is not accepted for the current version
  68. # cool2mat <- function(cool_file, chr_num, bin_size) ## convert cool format to long format / pos_1, pos_2, val / modified from https://rdrr.io/bioc/HiCcompare/src/R/hicpro2bedpe.R
  69. # {
  70. # dump <- rhdf5::h5dump(cool_file)
  71. # if(names(dump)=='resolutions')
  72. # {
  73. # cat('\nYour input contact matrix is in mcool format and contains resolutions of:', names(dump$resolutions), '\n')
  74. # which2keep = which(bin_size==as.numeric(names(dump$resolutions)))
  75. # dump = dump$resolutions[[which2keep]]
  76. # }
  77. # ids <- data.table::data.table(chr = dump$bins$chrom, start = dump$bins$start, id = seq(1, length(dump$bins$chrom), by = 1))
  78. # ids = ids[chr==chr_num][, c('id', 'start')]
  79. # # make sparse matrix
  80. # mat <- data.table::data.table(bin1 = dump$pixels$bin1_id, bin2 = dump$pixels$bin2_id, val = dump$pixels$count)
  81. # mat_chr = mat[(bin1 %in% ids$id) & (bin2 %in% ids$id)] ## keep only cis contacts
  82. # colnames(ids)[2] = 'pos_1'
  83. # contact_mat = left_join(mat_chr, ids, by = c('bin1' = 'id'))
  84. # colnames(ids)[2] = 'pos_2'
  85. # contact_mat <- left_join(contact_mat, ids, by = c('bin2' = 'id'))
  86. # contact_mat = contact_mat[, c('pos_1', 'pos_2', 'val')]
  87. # return(contact_mat)
  88. # }
  89. CALDER_CD_hierarchy_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr, bin_size_input, bin_size2look, save_dir, save_intermediate_data=FALSE, swap_AB, ref_compartment_file, black_list_bins=NULL, feature_track=NULL)
  90. {
  91. chr_num = gsub('chr', '', chr, ignore.case=TRUE)
  92. chr_name = paste0('chr', chr_num)
  93. get_cor_with_ref = function(chr_bin_pc, bin_size, ref_compartment_file=NULL, feature_track=NULL) ## correlation of PC1 with comp. domain rank of genome
  94. {
  95. ext_chr_bin_pc = function(chr_bin_pc) ## spand chr_bin_pc using 5kb bin
  96. {
  97. bin_indices_li = unlist(apply(chr_bin_pc[, 4:5], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=5E3)/5E3; list(bins[1:(length(bins) - 1)])}), recursive=FALSE)
  98. expand_len = sapply(bin_indices_li, length)
  99. chr_bin_pc_ext = data.table::data.table(chr=rep(chr_bin_pc[[1]], expand_len), bin_index=unlist(bin_indices_li),PC1_val=rep(chr_bin_pc$PC1_val, expand_len))
  100. chr_bin_pc_ext$pos_start = (chr_bin_pc_ext$bin_index - 1)*5E3 + 1
  101. chr_bin_pc_ext$pos_end = (chr_bin_pc_ext$bin_index)*5E3
  102. return(chr_bin_pc_ext)
  103. }
  104. ## function to generate binning scores // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/
  105. # feature_track = data.table::data.table(chr=as.vector(GenomicRanges::seqnames(bw_val)), start=start(bw_val), end=end(bw_val), score=bw_val$score)
  106. get_binned_vals = function(feature_track_chr, bin_size=5E3)
  107. {
  108. ## helper to compute binned average // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/
  109. binnedMean <- function(bins, numvar, mcolname)
  110. {
  111. stopifnot(is(bins, "GRanges"))
  112. stopifnot(is(numvar, "RleList"))
  113. stopifnot(identical(GenomeInfoDb::seqlevels(bins), names(numvar)))
  114. bins_per_chrom <- split(GenomicRanges::ranges(bins), GenomicRanges::seqnames(bins))
  115. sums_list <- lapply(names(numvar),
  116. function(seqname) {
  117. views <- IRanges::Views(numvar[[seqname]],
  118. bins_per_chrom[[seqname]])
  119. IRanges::viewMeans(views)
  120. })
  121. new_mcol <- unsplit(sums_list, as.factor(GenomicRanges::seqnames(bins)))
  122. GenomicRanges::mcols(bins)[[mcolname]] <- new_mcol
  123. return(bins)
  124. }
  125. GR = GenomicRanges::makeGRangesFromDataFrame(feature_track_chr, keep.extra.columns=TRUE)
  126. GR_chrs = split(GR, GenomicRanges::seqnames(GR))
  127. seq_lens = sapply(GR_chrs, function(v) max(GenomicRanges::end(v)))
  128. GR_RleList = GenomicRanges::coverage(GR, weight="score")
  129. seq_info = GenomicRanges::seqinfo(GR_RleList)
  130. GenomeInfoDb::seqlengths(seq_info) = seq_lens
  131. bins = GenomicRanges::tileGenome(seq_info, tilewidth=bin_size, cut.last.tile.in.chrom=TRUE)
  132. bins = bins[GenomicRanges::width(bins)==bin_size]
  133. bin_val_tmp = binnedMean(bins, GR_RleList, "bin_val")
  134. bin_val_tmp = data.table::as.data.table(bin_val_tmp)
  135. bin_val = data.table::data.table(chr=bin_val_tmp$seqnames, bin_index=bin_val_tmp$end / bin_size, continous_rank=log2(1 + bin_val_tmp$bin_val - min(bin_val_tmp$bin_val)))
  136. return(bin_val)
  137. }
  138. chr2query = chr_bin_pc$chr[1]
  139. if(!is.null(ref_compartment_file))
  140. {
  141. domain_rank = data.table::fread(ref_compartment_file)
  142. colnames(domain_rank)[1] = 'chr'
  143. domain_rank_chr = domain_rank[domain_rank[[1]]==chr2query, ]
  144. ref_tab = make_comp_tab_calder(domain_rank_chr, bin_size=5E3) ## 5kb is convenient for most of the bin sizes
  145. ref_tab = ref_tab[, c(1,2,5)]
  146. }
  147. if(!is.null(feature_track))
  148. {
  149. colnames(feature_track) = c('chr', 'start', 'end', 'score')
  150. feature_track$chr = gsub('chr', '', feature_track$chr)
  151. feature_track_chr = feature_track[feature_track$chr==chr_num, ]
  152. ref_tab = get_binned_vals(feature_track_chr)
  153. }
  154. chr_name = gsub("chrchr", "chr", paste0("chr", ref_tab$chr))
  155. # chr_name = ifelse(chr_name=="chr23", "chrX", chr_name)
  156. ref_tab$chr = chr_name
  157. colnames(ref_tab)[2] = "bin_index"
  158. ################################# convert chr_bin_pc into 5kb
  159. chr_bin_pc_ext = ext_chr_bin_pc(chr_bin_pc)
  160. ref_and_pc = merge(ref_tab, chr_bin_pc_ext, by=c("chr", "bin_index"))
  161. cor_with_ref = cor(method='spearman', ref_and_pc[, c("continous_rank", "PC1_val")])[1,2]
  162. return(cor_with_ref)
  163. }
  164. ### The main function starts here
  165. time0 = Sys.time()
  166. log_file = paste0(save_dir, '/chr', chr_num, '_log.txt')
  167. warning_file = paste0(save_dir, '/WARNING_chr', chr_num, '.txt')
  168. cor_log_file = paste0(save_dir, '/cor_with_ref.txt')
  169. cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
  170. cat('[', as.character(chr),'] Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
  171. processed_data = contact_mat_processing_v2(contact_tab_dump, contact_file_dump=contact_file_dump, contact_file_hic=contact_file_hic, chr=chr_num, bin_size_input=bin_size_input, bin_size2look=bin_size2look, black_list_bins=black_list_bins)
  172. mat_dense = processed_data$mat_dense
  173. ccmat_dense_compressed_log_atanh = processed_data$atanh_score
  174. cat('[', as.character(chr),'] Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
  175. cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
  176. p_thresh = ifelse(bin_size2look < 40000, 0.05, 1)
  177. window.sizes = 3
  178. compartments = vector("list", 2)
  179. cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
  180. cat('[', as.character(chr),'] Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n')
  181. compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size2look, window.sizes = window.sizes, ccmat_dense_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL)
  182. topDom_output = compartments[[2]]
  183. bin_names = rownames(mat_dense)
  184. mat_dense = as.matrix(mat_dense)
  185. initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2])
  186. if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) {
  187. stop(CELL_LINE, " initial_clusters error in topDom")
  188. }
  189. n_clusters = length(initial_clusters)
  190. mat_dense_cluster_mean = HighResolution2Low_k_rectangle(mat_dense, initial_clusters, initial_clusters, sum_or_mean = "mean")
  191. trend_mean_list = lapply( 1:4, function(v) 1*(mat_dense_cluster_mean[, -(1:v)] > mat_dense_cluster_mean[, - n_clusters - 1 + (v:1)]) )
  192. trend_mean = do.call(cbind, trend_mean_list)
  193. c_trend_mean = cor(t(trend_mean))
  194. atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7))
  195. # if(to_scale)
  196. {
  197. trend_mean = scale(trend_mean)
  198. c_trend_mean = scale(c_trend_mean)
  199. atanh_c_trend_mean= scale(atanh_c_trend_mean)
  200. }
  201. PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10)
  202. PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5 ## xx-xx-xxxx: compress PC2
  203. rownames(PC_12_atanh) = 1:nrow(PC_12_atanh)
  204. ############################################################
  205. PC_direction = 1
  206. ## switch PC direction based on correlation with "ground_truth"
  207. {
  208. initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v]))
  209. chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length)))
  210. chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size2look + 1
  211. chr_bin_pc$end = chr_bin_pc$bin*bin_size2look
  212. # chr_bin_pc_range = makeGRangesFromDataFrame(chr_bin_pc, keep.extra.columns=TRUE)
  213. # gene_info_chr = subset(gene_info, seqnames==chr_name)
  214. # refGR = chr_bin_pc_range
  215. # testGR = gene_info_chr
  216. # hits <- findOverlaps(refGR, testGR)
  217. # overlaps <- pintersect(refGR[queryHits(hits)], testGR[subjectHits(hits)])
  218. # overlaps_bins = unique(data.table::data.table(overlap_ratio=width(overlaps)/bin_size, bin=overlaps$bin))
  219. # bin_pc_gene_coverage = merge(chr_bin_pc, overlaps_bins, all.x=TRUE)
  220. # bin_pc_gene_coverage$overlap_ratio[is.na(bin_pc_gene_coverage$overlap_ratio)] = 0
  221. # gene_density_cor = cor(method='spearman', subset(bin_pc_gene_coverage, (PC1_val < quantile(PC1_val, 0.25)) | (PC1_val > quantile(PC1_val, 0.75)) , c('PC1_val', 'overlap_ratio')))[1,2]
  222. # if(abs(gene_density_cor) < 0.2) warning('correlation between gene density and PC1 is too weak')
  223. if(!is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, ref_compartment_file=ref_compartment_file)) ## get correlation with supplied "ground truth or reference"
  224. if(is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, feature_track=feature_track))
  225. if(class(cor_with_ref)=='try-error') cor_with_ref = 1 ## psudo cor
  226. if(!is.null(ref_compartment_file)) cat('[', as.character(chr),'] Correlation between PC1 and reference compartment is :', format(abs(cor_with_ref), digits=5), '\n')
  227. if(is.null(ref_compartment_file)) cat('[', as.character(chr),'] Correlation between PC1 and feature_track is :', format(abs(cor_with_ref), digits=5), '\n')
  228. PC_direction = PC_direction*sign(cor_with_ref)
  229. if(swap_AB==1) PC_direction = -PC_direction ## force swap PC direction if in some case the A/B direction is reverted
  230. PC_12_atanh = PC_12_atanh*PC_direction
  231. }
  232. project_info = project_to_major_axis(PC_12_atanh)
  233. x_pro = project_info$x_pro
  234. ############################################################
  235. hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE])
  236. hc_hybrid_PC12 = hc_disect_kmeans_PC12
  237. {
  238. reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean)
  239. hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names)
  240. hc_hybrid_x_pro = hc_disect_kmeans_PC12
  241. reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro)
  242. CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro)
  243. }
  244. ############################################################
  245. parameters = list(bin_size = bin_size2look, p_thresh = p_thresh)
  246. res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters)
  247. intermediate_data_file = paste0(save_dir, '/chr', chr, '_intermediate_data.Rds')
  248. hc = res$CALDER_hc
  249. hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
  250. bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
  251. rownames(bin_comp) = NULL
  252. res$comp = bin_comp
  253. res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v])
  254. res$mat = mat_dense
  255. res$chr = chr
  256. generate_hierachy_bed(chr=chr, res=res, save_dir=save_dir, bin_size=bin_size2look)
  257. cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
  258. cat('[', as.character(chr),'] Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n')
  259. if(!is.null(ref_compartment_file)) cat('Correlation between PC1 and reference compartment domain rank on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)
  260. if(is.null(ref_compartment_file)) cat('Correlation between PC1 and feature_track on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)
  261. # cat(sprintf("chr%s: %s\n", chr, format(abs(cor_with_ref), 1, 5)), file=cor_log_file, append=TRUE)
  262. # if(abs(cor_with_ref) < 0.3) cat('WARNING: correlation between PC1 and reference compartment domain rank on this chr is: ', format(cor_with_ref, digits=5), ', which is a bit low. Possible reason could be that this chromosome has some big structural variations (translocation, inversion for example). We suggest to overlay the compartment track with the hic map together with histone modification or gene expression track to double check the reliability of compartment calling on this chr.', '\n', file=warning_file)
  263. time1 = Sys.time()
  264. # delta_time = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
  265. delta_time <- time1 - time0
  266. timediff <- format(round(delta_time, 2), nsmall = 2)
  267. cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE)
  268. # if(abs(gene_density_cor) > 0.2) cat('The gene density and PC1 correlation on this chr is: ', substr(gene_density_cor, 1, 4), '\n', file=log_file, append=TRUE)
  269. ############################################################
  270. intermediate_data = res
  271. if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file)
  272. # cat(intermediate_data_file)
  273. return(intermediate_data_file)
  274. }
  275. project_to_major_axis <- function(PC_12_atanh)
  276. {
  277. Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2])
  278. Data = Data[order(Data$x),]
  279. loess_fit <- loess(y ~ x, Data)
  280. more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1]))
  281. major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x))
  282. new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis
  283. dis = fields::rdist(PC_12_atanh[, 1:2], major_axis)
  284. projected_x = new_x_axis[apply(dis, 1, which.min)]
  285. names(projected_x) = rownames(PC_12_atanh)
  286. # projected_x = major_axis[apply(dis, 1, which.min)]
  287. project_info = list(x_pro=projected_x, major_axis=major_axis)
  288. return(project_info)
  289. }
  290. get_best_reorder <- function(hc_hybrid_x_pro, x_pro)
  291. {
  292. n = length(x_pro)
  293. reordered_names_x_pro_list = list()
  294. reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2
  295. best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v]))))
  296. return(reordered_names_x_pro_list[[1]])
  297. }
  298. generate_hierachy_bed = function(chr, res, save_dir, bin_size)
  299. {
  300. chr_name = paste0('chr', chr)
  301. # res = reses[[chr_name]][[CELL_LINE]]
  302. hc = res$CALDER_hc
  303. hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
  304. bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
  305. chr_bin_domain = bin_comp
  306. chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr)
  307. # chr_bin_domain = chr_bin_domain[order(bin_index)]
  308. compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size)
  309. boundaries = unname(sapply(res$initial_clusters, max))
  310. boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size
  311. compartment_info_tab$is_boundary = 'gap'
  312. compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary'
  313. colnames(compartment_info_tab)[4] = 'compartment_label'
  314. compartments_tsv_file = paste0(save_dir, '/chr', chr, '_domain_hierachy.tsv')
  315. compartments_bed_file = paste0(save_dir, '/chr', chr, '_sub_compartments.bed')
  316. boundary_bed_file = paste0(save_dir, '/chr', chr, '_domain_boundaries.bed')
  317. options(scipen=999)
  318. write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE )
  319. comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF")
  320. names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2')
  321. comp_val = (8:1)/8
  322. names(comp_val) = names(comp_cols)
  323. comp_8 = substr(compartment_info_tab$compartment_label, 1, 5)
  324. compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8])
  325. write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
  326. bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary')
  327. bounday_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[,3], compartment_info_tab[,3], '', '.', compartment_info_tab[,3], compartment_info_tab[,3], '#000000')
  328. write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
  329. }
  330. create_compartment_bed_v4 = function(chr_bin_domain, bin_size)
  331. {
  332. # for( chr in chrs )
  333. {
  334. v = chr_bin_domain
  335. # v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling
  336. # v$intra_domain = names(cols)[(as.numeric(v$intra_domain))]
  337. v = v[order(v$bin_index), ]
  338. borders_non_consecutive = which(diff(v$bin_index)!=1)
  339. borders_domain = cumsum(rle(v$comp)$lengths)
  340. borders = sort(union(borders_non_consecutive, borders_domain))
  341. bins = v$bin_index
  342. to_id = as.numeric(bins[borders])
  343. from_id = as.numeric(bins[c(1, head(borders, length(borders)-1)+1)])
  344. pos_start = (from_id-1)*bin_size + 1
  345. pos_end = to_id*bin_size
  346. # chr = as.numeric( gsub('chr', '', v$chr) )
  347. chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX
  348. compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders])
  349. }
  350. return(compartment_info_tab)
  351. }
  352. CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, save_dir, bin_size)
  353. {
  354. time0 = Sys.time()
  355. log_file = paste0(save_dir, '/chr', chr, '_sub_domains_log.txt')
  356. cat('[', as.character(chr),']Begin compute sub-domains at:', as.character(Sys.time()))
  357. cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
  358. if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file)
  359. {
  360. if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n')
  361. if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n')
  362. compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters )
  363. cat('[', as.character(chr),'] Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n')
  364. cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
  365. sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2)
  366. no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=save_dir, bin_size=bin_size)
  367. cat('>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
  368. cat('[', as.character(chr),'] Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n')
  369. time1 = Sys.time()
  370. # delta_time = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
  371. delta_time <- time1 - time0
  372. timediff <- format(round(delta_time, 2), nsmall = 2)
  373. cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE)
  374. }
  375. # return(NULL)
  376. }