Browse Source

updated package contents

Yuanlong LIU 2 years ago
parent
commit
d80898012e
8 changed files with 191 additions and 430 deletions
  1. 0 4
      R/CALDER.R
  2. 0 308
      R/CALDER_hierarchy.R
  3. 159 13
      R/CALDER_hierarchy_v2.R
  4. 10 6
      R/CALDER_main.R
  5. 0 1
      R/bisecting_kmeans.R
  6. 2 3
      R/build_comp_table_opt.R
  7. 0 95
      R/zigzag_nested_domain.R
  8. 20 0
      total_execution.time

+ 0 - 4
R/CALDER.R

@@ -1,4 +0,0 @@
-	
-
-		
-	

+ 0 - 308
R/CALDER_hierarchy.R

@@ -1,308 +0,0 @@
-############################################################
-
-
-get_gene_info <- function(genome) {
-
-	if(genome == 'hg19') {
-		gene_info_file = system.file("extdata", "TxDb.Hsapiens.UCSC.hg19.knownGene.rds", package = 'CALDER')
-	} else if(genome == 'mm9') {
-		gene_info_file = system.file("extdata", "TxDb.Mmusculus.UCSC.mm9.knownGene.rds", package = 'CALDER')
-	} else {
-		stop(paste0("Unknown genome (", genome, ")"))
-	}
-
-	
-	gene_info = readRDS(gene_info_file)
-}
-
-
-CALDER_CD_hierarchy = function(contact_mat_file, 
-							   chr, 
-							   bin_size, 
-							   save_dir, 
-							   save_intermediate_data=FALSE,
-							   genome = 'hg19')
-{
-    time0 = Sys.time()
-    log_file = paste0(save_dir, '/chr', chr, '_log.txt')
-
-    cat('\n')
-
-    cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
-    cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
-    processed_data = contact_mat_processing(contact_mat_file, bin_size=bin_size)
-   
-    A_oe = processed_data$A_oe
-    ccA_oe_compressed_log_atanh = processed_data$atanh_score
-
-    cat('\r', '>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()))
-    cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
-
-    p_thresh = ifelse(bin_size < 40000, 0.05, 1)
-    window.sizes = 3
-    compartments = vector("list", 2)
-    chr_name = paste0("chr", chr)
-    
-    cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
-    cat('\r', '>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()))
-
-    compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size, window.sizes = window.sizes, ccA_oe_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL)
-    topDom_output = compartments[[2]]
-    bin_names = rownames(A_oe)
-    A_oe = as.matrix(A_oe)
-    initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2])
-
-    if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) {
-        stop(CELL_LINE, " initial_clusters error in topDom")
-    }
-
-    n_clusters = length(initial_clusters)
-		A_oe_cluster_mean = HighResolution2Low_k_rectangle(A_oe, initial_clusters, initial_clusters, sum_or_mean = "mean")
-
-	trend_mean_list = lapply( 1:4, function(v) 1*(A_oe_cluster_mean[, -(1:v)] > A_oe_cluster_mean[, - n_clusters - 1 + (v:1)]) )
-	trend_mean = do.call(cbind, trend_mean_list)
-	c_trend_mean = cor(t(trend_mean))
-	atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7))
-
-
-	# if(to_scale)
-	{
-		trend_mean = scale(trend_mean)
-		c_trend_mean = scale(c_trend_mean)
-		atanh_c_trend_mean= scale(atanh_c_trend_mean)
-	}
-
-
-	PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10)
-	PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5 ## xx-xx-xxxx: compress PC2
-	rownames(PC_12_atanh) = 1:nrow(PC_12_atanh)
-
-	############################################################
-	PC_direction = 1
-
-	gene_info <- get_gene_info(genome)
-
-	## switch PC direction based on gene density
-	{
-		initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v]))
-		chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length)))
-		chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size + 1
-		chr_bin_pc$end = chr_bin_pc$bin*bin_size
-		chr_bin_pc_range = makeGRangesFromDataFrame(chr_bin_pc, keep.extra.columns=TRUE)
-		gene_info_chr = subset(gene_info, seqnames==chr_name)
-
-		refGR = chr_bin_pc_range
-		testGR = gene_info_chr
-		hits <- findOverlaps(refGR, testGR)
-	    overlaps <- pintersect(refGR[queryHits(hits)], testGR[subjectHits(hits)])
-	    overlaps_bins = unique(data.table::data.table(overlap_ratio=width(overlaps)/bin_size, bin=overlaps$bin))
-	    bin_pc_gene_coverage = merge(chr_bin_pc, overlaps_bins, all.x=TRUE)
-	    bin_pc_gene_coverage$overlap_ratio[is.na(bin_pc_gene_coverage$overlap_ratio)] = 0
-		
-	    gene_density_cor = cor(method='spearman', subset(bin_pc_gene_coverage, (PC1_val < quantile(PC1_val, 0.25)) | (PC1_val > quantile(PC1_val, 0.75)) , c('PC1_val', 'overlap_ratio')))[1,2]
-	    if(abs(gene_density_cor) < 0.2) warning('correlation between gene density and PC1 is too weak')
-	    PC_direction = PC_direction*sign(gene_density_cor)
-
-	    PC_12_atanh = PC_12_atanh*PC_direction
-	}
-
-
-	project_info = project_to_major_axis(PC_12_atanh)
-	x_pro = project_info$x_pro
-	
-	############################################################
-	hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE])
-
-	hc_hybrid_PC12 = hc_disect_kmeans_PC12
-
-	{
-		reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean)
-		hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names)
-		hc_hybrid_x_pro = hc_disect_kmeans_PC12
-		reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro)
-		CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro)	
-	}
-
-	############################################################
-	parameters = list(bin_size = bin_size, p_thresh = p_thresh)
-	res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters)
-	intermediate_data_file = paste0(save_dir, '/chr', chr, '_intermediate_data.Rds')
-	
-	hc = res$CALDER_hc
-	hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
-	bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
-
-	rownames(bin_comp) = NULL
-	res$comp = bin_comp
-	res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v])
-	res$mat = A_oe
-	res$chr = chr
-	generate_hierachy_bed(chr=chr, res=res, save_dir=save_dir, bin_size=bin_size)
-
-
-    cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
-    cat('\r', '>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()))
-
-   	if(abs(gene_density_cor) < 0.2) cat('WARNING: correlation between gene density and PC1 on this chr is: ', substr(gene_density_cor, 1, 4), ', which is a bit low', '\n', file=log_file, append=TRUE)
-
-    time1 = Sys.time()
-    # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
-
-    delta_time <- time1 - time0
-	timediff <- format(round(delta_time, 2), nsmall = 2)
-
-    cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE)
-   	# if(abs(gene_density_cor) > 0.2) cat('The gene density and PC1 correlation on this chr is: ', substr(gene_density_cor, 1, 4), '\n', file=log_file, append=TRUE)
-
-	############################################################
-	intermediate_data = res
-	if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file)
-	# cat(intermediate_data_file)
-	return(intermediate_data)
-}
-
-CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, save_dir, bin_size)
-{	
-    time0 = Sys.time()
-    log_file = paste0(save_dir, '/chr', chr, '_sub_domains_log.txt')
-
-   	cat('\r', '>>>> Begin compute sub-domains at:', as.character(Sys.time()))
-   	cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
-
-	if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file)
-	{
-
-	    if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n') 
-	    if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n')     
-	    compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters )
-
-		cat('\r', '>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()))   			
-		cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
-
-		sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2)   
-
-	    no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=save_dir, bin_size=bin_size)
-
-	    cat('>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
-	    cat('\r', '>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n')
-
-	   	time1 = Sys.time()
-        # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
-        delta_time <- time1 - time0
-		timediff <- format(round(delta_time, 2), nsmall = 2)
-
-        cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE)
-	}
-	# return(NULL)
-}
-
-
-
-############################################################
-
-create_compartment_bed_v4 = function(chr_bin_domain, bin_size)
-{
-	# for( chr in chrs )
-	{
-		v = chr_bin_domain
-		# v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling
-		# v$intra_domain = names(cols)[(as.numeric(v$intra_domain))]
-		v = v[order(v$bin_index), ]
-
-
-		borders_non_consecutive = which(diff(v$bin_index)!=1)
-		borders_domain = cumsum(rle(v$comp)$lengths)
-		borders = sort(union(borders_non_consecutive, borders_domain))
-		bins = v$bin_index
-		to_id = as.numeric(bins[borders])
-		from_id = as.numeric(bins[c(1, head(borders, length(borders)-1)+1)])
-
-		pos_start = (from_id-1)*bin_size + 1
-		pos_end = to_id*bin_size
-		# chr = as.numeric( gsub('chr', '', v$chr) )
-		chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX
-
-		compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders])
-	}
-	return(compartment_info_tab)
-}
-
-############################################################
-
-generate_hierachy_bed = function(chr, res, save_dir, bin_size)
-{
-	chr_name = paste0('chr', chr)
-	# res = reses[[chr_name]][[CELL_LINE]]
-	hc = res$CALDER_hc
-
-	hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
-	bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
-	chr_bin_domain = bin_comp
-	chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr)
-
-	# chr_bin_domain = chr_bin_domain[order(bin_index)]
-
-	compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size)
-
-	boundaries = unname(sapply(res$initial_clusters, max))
-	boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size
-
-	compartment_info_tab$is_boundary = 'gap'
-	compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary'
-	
-	colnames(compartment_info_tab)[4] = 'compartment_label'
-	compartments_tsv_file = paste0(save_dir, '/chr', chr, '_domain_hierachy.tsv')
-	compartments_bed_file = paste0(save_dir, '/chr', chr, '_sub_compartments.bed')
-	boundary_bed_file = paste0(save_dir, '/chr', chr, '_domain_boundaries.bed')
-
-	options(scipen=999)
-	write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE )
-
-
-	comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF") 	
-	names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2')
-	comp_val = (8:1)/8
-	names(comp_val) = names(comp_cols)
-
-	comp_8 = substr(compartment_info_tab$compartment_label, 1, 5)
-
-	compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8])
-	write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
-
-	bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary')
-	bounday_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[,3], compartment_info_tab[,3], '', '.', compartment_info_tab[,3], compartment_info_tab[,3], '#000000')
-	write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
-}
-
-
-
-project_to_major_axis <- function(PC_12_atanh)
-{
-	Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2])
-	Data = Data[order(Data$x),]
-	loess_fit <- loess(y ~ x, Data)
-
-	more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1]))
-	major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x))
-	new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis
-
-	dis = fields::rdist(PC_12_atanh[, 1:2], major_axis)
-	projected_x = new_x_axis[apply(dis, 1, which.min)]
-	names(projected_x) = rownames(PC_12_atanh)
-	# projected_x = major_axis[apply(dis, 1, which.min)]
-	project_info = list(x_pro=projected_x, major_axis=major_axis)
-	return(project_info)
-}
-
-
-get_best_reorder <- function(hc_hybrid_x_pro, x_pro)
-{
-	n = length(x_pro)
-	reordered_names_x_pro_list = list()
-
-	reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2
-
-	best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v]))))
-	return(reordered_names_x_pro_list[[1]])
-}
-

+ 159 - 13
R/CALDER_hierarchy_v2.R

@@ -119,12 +119,12 @@
 		# }
 
 
-		CALDER_CD_hierarchy_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr, bin_size_input, bin_size2look, save_dir, save_intermediate_data=FALSE, swap_AB, ref_compartment_file, black_list_bins=NULL, annotation_track=NULL)
+		CALDER_CD_hierarchy_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr, bin_size_input, bin_size2look, save_dir, save_intermediate_data=FALSE, swap_AB, ref_compartment_file, black_list_bins=NULL, feature_track=NULL)
 		{
 			chr_num = gsub('chr', '', chr, ignore.case=TRUE)
 			chr_name = paste0('chr', chr_num)
 
-		    get_cor_with_ref = function(chr_bin_pc, bin_size, ref_compartment_file=NULL, annotation_track=NULL) ## correlation of PC1 with comp. domain rank of ref_genome
+		    get_cor_with_ref = function(chr_bin_pc, bin_size, ref_compartment_file=NULL, feature_track=NULL) ## correlation of PC1 with comp. domain rank of genome
 		    {
 		        ext_chr_bin_pc = function(chr_bin_pc) ## spand chr_bin_pc using 5kb bin
 		        {
@@ -138,10 +138,10 @@
 		        }
 
 		        ## function to generate binning scores // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/
-			 	# annotation_track = data.table::data.table(chr=as.vector(GenomicRanges::seqnames(bw_val)), start=start(bw_val), end=end(bw_val), score=bw_val$score)
+			 	# feature_track = data.table::data.table(chr=as.vector(GenomicRanges::seqnames(bw_val)), start=start(bw_val), end=end(bw_val), score=bw_val$score)
 
 
-			 	get_binned_vals = function(annotation_track_chr, bin_size=5E3)
+			 	get_binned_vals = function(feature_track_chr, bin_size=5E3)
 			 	{
 			 		## helper to compute binned average // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/
 			  		binnedMean <- function(bins, numvar, mcolname)
@@ -162,7 +162,7 @@
 					}
 
 
-			 		GR = GenomicRanges::makeGRangesFromDataFrame(annotation_track_chr, keep.extra.columns=TRUE)
+			 		GR = GenomicRanges::makeGRangesFromDataFrame(feature_track_chr, keep.extra.columns=TRUE)
 			 		GR_chrs = split(GR, GenomicRanges::seqnames(GR))
 			 		seq_lens = sapply(GR_chrs, function(v) max(end(v)))
 
@@ -191,12 +191,12 @@
 			        ref_tab = ref_tab[, c(1,2,5)]
 		        }
 
-		        if(!is.null(annotation_track)) 
+		        if(!is.null(feature_track)) 
 		        {
-		        	colnames(annotation_track) = c('chr', 'start', 'end', 'score')
-		        	annotation_track$chr = gsub('chr', '', annotation_track$chr)
-		        	annotation_track_chr = annotation_track[annotation_track$chr==chr_num, ]
-		        	ref_tab = get_binned_vals(annotation_track_chr)
+		        	colnames(feature_track) = c('chr', 'start', 'end', 'score')
+		        	feature_track$chr = gsub('chr', '', feature_track$chr)
+		        	feature_track_chr = feature_track[feature_track$chr==chr_num, ]
+		        	ref_tab = get_binned_vals(feature_track_chr)
 		        }
 
 
@@ -300,13 +300,13 @@
 				
 
 				if(!is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, ref_compartment_file=ref_compartment_file)) ## get correlation with supplied "ground truth or reference"
-				if(is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, annotation_track=annotation_track))
+				if(is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, feature_track=feature_track))
 
 
 
 				if(class(cor_with_ref)=='try-error') cor_with_ref = 1 ## psudo cor
 	        	if(!is.null(ref_compartment_file)) cat('\r', ">>>> Correlation between PC1 and reference compartment is :", format(abs(cor_with_ref), digits=5), '\n')
-	        	if(is.null(ref_compartment_file)) cat('\r', ">>>> Correlation between PC1 and annotation_track is :", format(abs(cor_with_ref), digits=5), '\n')
+	        	if(is.null(ref_compartment_file)) cat('\r', ">>>> Correlation between PC1 and feature_track is :", format(abs(cor_with_ref), digits=5), '\n')
 
 				PC_direction = PC_direction*sign(cor_with_ref)
 			    if(swap_AB==1) PC_direction = -PC_direction ## force swap PC direction if in some case the A/B direction is reverted
@@ -352,7 +352,7 @@
 	        cat('\r', '>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()))
 
 	       	if(!is.null(ref_compartment_file)) cat('Correlation between PC1 and reference compartment domain rank on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)
-	       	if(is.null(ref_compartment_file)) cat('Correlation between PC1 and annotation_track on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)	       	
+	       	if(is.null(ref_compartment_file)) cat('Correlation between PC1 and feature_track on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)	       	
 
 	       	# cat(sprintf("chr%s: %s\n", chr, format(abs(cor_with_ref), 1, 5)), file=cor_log_file, append=TRUE)
 	       	# if(abs(cor_with_ref) < 0.3) cat('WARNING: correlation between PC1 and reference compartment domain rank on this chr is: ', format(cor_with_ref, digits=5), ', which is a bit low. Possible reason could be that this chromosome has some big structural variations (translocation, inversion for example). We suggest to overlay the compartment track with the hic map together with histone modification or gene expression track to double check the reliability of compartment calling on this chr.', '\n', file=warning_file)
@@ -372,3 +372,149 @@
 			# cat(intermediate_data_file)
 			return(intermediate_data_file)
 		}
+
+
+
+
+
+		project_to_major_axis <- function(PC_12_atanh)
+		{
+			Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2])
+			Data = Data[order(Data$x),]
+			loess_fit <- loess(y ~ x, Data)
+
+			more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1]))
+			major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x))
+			new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis
+
+			dis = fields::rdist(PC_12_atanh[, 1:2], major_axis)
+			projected_x = new_x_axis[apply(dis, 1, which.min)]
+			names(projected_x) = rownames(PC_12_atanh)
+			# projected_x = major_axis[apply(dis, 1, which.min)]
+			project_info = list(x_pro=projected_x, major_axis=major_axis)
+			return(project_info)
+		}
+
+
+		get_best_reorder <- function(hc_hybrid_x_pro, x_pro)
+		{
+			n = length(x_pro)
+			reordered_names_x_pro_list = list()
+
+			reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2
+
+			best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v]))))
+			return(reordered_names_x_pro_list[[1]])
+		}
+
+
+
+		generate_hierachy_bed = function(chr, res, save_dir, bin_size)
+		{
+			chr_name = paste0('chr', chr)
+			# res = reses[[chr_name]][[CELL_LINE]]
+			hc = res$CALDER_hc
+
+			hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
+			bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
+			chr_bin_domain = bin_comp
+			chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr)
+
+			# chr_bin_domain = chr_bin_domain[order(bin_index)]
+
+			compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size)
+
+			boundaries = unname(sapply(res$initial_clusters, max))
+			boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size
+
+			compartment_info_tab$is_boundary = 'gap'
+			compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary'
+			
+			colnames(compartment_info_tab)[4] = 'compartment_label'
+			compartments_tsv_file = paste0(save_dir, '/chr', chr, '_domain_hierachy.tsv')
+			compartments_bed_file = paste0(save_dir, '/chr', chr, '_sub_compartments.bed')
+			boundary_bed_file = paste0(save_dir, '/chr', chr, '_domain_boundaries.bed')
+
+			options(scipen=999)
+			write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE )
+
+
+			comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF") 	
+			names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2')
+			comp_val = (8:1)/8
+			names(comp_val) = names(comp_cols)
+
+			comp_8 = substr(compartment_info_tab$compartment_label, 1, 5)
+
+			compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8])
+			write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
+
+			bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary')
+			bounday_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[,3], compartment_info_tab[,3], '', '.', compartment_info_tab[,3], compartment_info_tab[,3], '#000000')
+			write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
+		}
+
+
+		create_compartment_bed_v4 = function(chr_bin_domain, bin_size)
+		{
+			# for( chr in chrs )
+			{
+				v = chr_bin_domain
+				# v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling
+				# v$intra_domain = names(cols)[(as.numeric(v$intra_domain))]
+				v = v[order(v$bin_index), ]
+
+
+				borders_non_consecutive = which(diff(v$bin_index)!=1)
+				borders_domain = cumsum(rle(v$comp)$lengths)
+				borders = sort(union(borders_non_consecutive, borders_domain))
+				bins = v$bin_index
+				to_id = as.numeric(bins[borders])
+				from_id = as.numeric(bins[c(1, head(borders, length(borders)-1)+1)])
+
+				pos_start = (from_id-1)*bin_size + 1
+				pos_end = to_id*bin_size
+				# chr = as.numeric( gsub('chr', '', v$chr) )
+				chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX
+
+				compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders])
+			}
+			return(compartment_info_tab)
+		}
+
+
+		CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, save_dir, bin_size)
+		{	
+		    time0 = Sys.time()
+		    log_file = paste0(save_dir, '/chr', chr, '_sub_domains_log.txt')
+
+		   	cat('\r', '>>>> Begin compute sub-domains at:', as.character(Sys.time()))
+		   	cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
+
+			if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file)
+			{
+
+			    if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n') 
+			    if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n')     
+			    compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters )
+
+				cat('\r', '>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()))   			
+				cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
+
+				sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2)   
+
+			    no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=save_dir, bin_size=bin_size)
+
+			    cat('>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
+			    cat('\r', '>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n')
+
+			   	time1 = Sys.time()
+		        # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
+		        delta_time <- time1 - time0
+				timediff <- format(round(delta_time, 2), nsmall = 2)
+
+		        cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE)
+			}
+			# return(NULL)
+		}
+

+ 10 - 6
R/CALDER_main.R

@@ -1,5 +1,5 @@
 
-    CALDER = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chrs, bin_size, save_dir, save_intermediate_data=FALSE, swap_AB=0, ref_genome=c("hg19", 'hg38', "mm10", 'dm6')[1], annotation_track=NULL, black_list_bins=NULL, n_cores=1, sub_domains=FALSE, single_binsize_only=FALSE)
+    CALDER = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chrs, bin_size, save_dir, save_intermediate_data=FALSE, swap_AB=0, genome='others', feature_track=NULL, black_list_bins=NULL, n_cores=1, sub_domains=FALSE, single_binsize_only=FALSE)
     {
     		########################### https://stackoverflow.com/questions/30216613/how-to-use-dopar-when-only-import-foreach-in-description-of-a-package
 
@@ -36,12 +36,16 @@
             if(bin_size==50E3) bin_sizes = c(50E3, 100E3)
             if(!(bin_size %in% c(5E3, 10E3, 20E3, 25E3, 40E3, 50E3))) bin_sizes = bin_size
 
-            if(is.null(ref_genome)) single_binsize_only = TRUE
+            if(genome=='others') single_binsize_only = TRUE
             if(single_binsize_only) bin_sizes = bin_size ## do not try multiple bin_sizes
 
             ## choose the already computed good compartment as reference to decide correctly the A/B compartment direction
-            if(!is.null(ref_genome)) ref_compartment_file = system.file("extdata", sprintf('%s_all_sub_compartments.bed', ref_genome), package = 'CALDER')
-            if(is.null(ref_genome)) ref_compartment_file = NULL
+            if(genome!='others') ref_compartment_file = system.file("extdata", sprintf('%s_all_sub_compartments.bed', genome), package = 'CALDER')
+            if(genome=='others') ref_compartment_file = NULL
+
+            ###########################
+
+            if(is.null(ref_compartment_file) & is.null(feature_track)) stop('Should either specify genome in one of hg19, hg38, mm9, mm10, or provide a feature_track\n')
 
             ###########################
 
@@ -69,12 +73,12 @@
 
             	save_intermediate_data_tmp = save_intermediate_data*(bin_size2look==bin_size)
 
-	            CALDER_CD_hierarchy_v2(contact_tab_dump=contact_tab_dump[[chr]], contact_file_dump=contact_file_dump[[chr]], contact_file_hic=contact_file_hic, chr=chr, bin_size_input=bin_size, bin_size2look=bin_size2look, save_dir=save_dir_binsize, save_intermediate_data=save_intermediate_data_tmp, swap_AB=swap_AB, ref_compartment_file=ref_compartment_file, annotation_track=annotation_track, black_list_bins=black_list_bins)
+	            CALDER_CD_hierarchy_v2(contact_tab_dump=contact_tab_dump[[chr]], contact_file_dump=contact_file_dump[[chr]], contact_file_hic=contact_file_hic, chr=chr, bin_size_input=bin_size, bin_size2look=bin_size2look, save_dir=save_dir_binsize, save_intermediate_data=save_intermediate_data_tmp, swap_AB=swap_AB, ref_compartment_file=ref_compartment_file, feature_track=feature_track, black_list_bins=black_list_bins)
 	        }
 
             ########################### build optimal compartment from multiple resolutions
 
-            try(build_comp_table_opt(save_dir=save_dir, chrs=chrs, bin_sizes=bin_sizes, with_ref=!is.null(ref_genome)))
+            try(build_comp_table_opt(save_dir=save_dir, chrs=chrs, bin_sizes=bin_sizes, with_ref=!is.null(genome)))
 
             ########################### if computing sub-domains
 

+ 0 - 1
R/bisecting_kmeans.R

@@ -1,5 +1,4 @@
 
-	
  	## k-means with replicatable seeds
  	my_kmeans = function(iter.max=1E3, nstart=50, ...)
  	{

+ 2 - 3
R/build_comp_table_opt.R

@@ -1,8 +1,7 @@
 	
+	## function to obtain the optimal compartment calling from various resolutions (bin_sizes)
 
-	
-	
-	build_comp_table_opt = function(save_dir, chrs, bin_sizes, with_ref) ## function to obtain the optimal compartment calling from various resolutions (= bin_sizes)
+	build_comp_table_opt = function(save_dir, chrs, bin_sizes, with_ref) 
 	{
 
     	`%dopar%` <- foreach::`%dopar%`

+ 0 - 95
R/zigzag_nested_domain.R

@@ -1,95 +0,0 @@
-	## Yuanlong LIU
-	## 12/04/2018
-	## 06/06/2018
-	## 16/06/2018
-	## This code runs zigzag search on each compartment domain as a whole without dividing into 400
-
-	## 09-10-2018
-	
-	## A should be already named
-	## "zigzag" resembles computational path of finding the optimum  
-	# HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2)
-	HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, min_n_bins=2)
-	{
-		arg_list = as.list(environment())
-		
-		res_folder = file.path(res_dir)
-		dir.create(res_folder, recursive=TRUE, showWarnings = FALSE)
-		
-		total_execution_time_file = file.path(res_dir, 'total_execution.time')
-		time_begin = Sys.time()
-		cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
-	
-		## check whether your input matrix is symmetric
-		# A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') )
-		# A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11
-		A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11
-
-		tol = 100 * .Machine$double.eps
-		max_diff = max(abs(A_sym - A))
-		notSym_flag = max_diff > tol
-		if( notSym_flag ) warning('Your input contact matrix is not symmetric. The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first')
-		
-		if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
-		
-		# pA_sym = rm_zeros(A_sym) ## pA_sym: positive A
-		pA_sym = remove_blank_cols(A_sym, sparse=TRUE, ratio=0)		
-		n_zero_rows = nrow(A_sym) - nrow(pA_sym)
-		zero_rows_flag = n_zero_rows > 0
-		if( zero_rows_flag )
-		{
-			warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis')
-			original_row_names = rownames(A_sym)
-			kept_row_names = rownames(pA_sym)
-		}
-
-		# res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain
-		# for( i in 1:nrow(compartment_segs) )
-		# {
-		# 	seg = compartment_segs[i,1]:compartment_segs[i,2]
-		# 	cat('Compute seg:', i, 'of length:', length(seg), '\n')
-
-		# 	A_seg = as.matrix(pA_sym[seg, seg])
-		# 	res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg))
-		# 	res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors)
-		# 	res_inner[[i]] = res_outer
-		# 	cat('finished', '\n')
-		# }
-
-		## changed to paralell, 2018-11-11
-		cat('\n')
-
-		res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do%
-		{
-			seg = compartment_segs[i,1]:compartment_segs[i,2]
-			cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins', i, nrow(compartment_segs), length(seg)))
-
-			A_seg = as.matrix(pA_sym[seg, seg])
-			res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins)
-			res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins)
-			res_outer
-			# res_inner[[i]] = res_outer
-		}
-
-		cat('\n')
-		
-		segmentss = compartment_segs
-		res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner )
-		# res_folder_final = file.path(res_dir, 'final')
-		# dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
-		# save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
-	
-		time_finish = Sys.time()
-		cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
-		cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
-	
-		return( res_info )
-	}
-	
-	
-	
-	
-	
-	
-	
-	

+ 20 - 0
total_execution.time

@@ -0,0 +1,20 @@
+Execution begins: 2022-05-25 16:42:08 
+Execution finishes: 2022-05-25 16:42:15 
+
+Total execution time: Time difference of 7.081351 secs 
+
+Execution begins: 2022-05-25 16:42:31 
+Execution finishes: 2022-05-25 16:42:36 
+
+Total execution time: Time difference of 5.262403 secs 
+
+Execution begins: 2022-05-31 14:43:13 
+Execution finishes: 2022-05-31 14:43:19 
+
+Total execution time: Time difference of 5.916845 secs 
+
+Execution begins: 2022-05-31 14:43:33 
+Execution finishes: 2022-05-31 14:43:38 
+
+Total execution time: Time difference of 4.761235 secs 
+