first commit

zhangxudong 5 months ago
commit
426a09c776

+ 487 - 0
Snakefile

@@ -0,0 +1,487 @@
+import os
+import yaml
+import pandas as pd
+
+configfile: "config.yaml"
+##########################################################
+# Global variables and sample information
+##########################################################
+HICSNAKE_HOME = config.get("HICSNAKE_HOME") or os.getcwd()
+GENOME = config["genome"]
+RESOLUTIONS = config["resolutions"]
+BASE_RESOLUTION = min(RESOLUTIONS)
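+# Sanity check (a small addition; config.yaml states that every resolution
+# should be an integer multiple of the base resolution, since coarser bins
+# are built by aggregating the base-resolution matrix):
+assert all(r % BASE_RESOLUTION == 0 for r in RESOLUTIONS), \
+    "all resolutions must be integer multiples of the base resolution"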
+
+with open('chromosomes.txt', 'r') as file:
+    CHROMOSOMES = [line.strip() for line in file]
+
+
+meta_data = pd.read_csv('samples.txt', sep='\t')
+REPLICATES = sorted(meta_data['replicate'].tolist())
+SAMPLES2REPLICATES = meta_data.groupby('sample')['replicate'].apply(list).to_dict()
+SAMPLES = list(SAMPLES2REPLICATES.keys())
+
+contrasts_data = pd.read_csv('contrasts.txt', sep='\t', header=None)
+CONTRASTS = [f"{row[0]}_vs_{row[1]}" for row in contrasts_data.itertuples(index=False)]
+
+##########################################################
+# rule all: the final files we want to generate
+##########################################################
+rule all:
+    input:
+        #Step 1: read filtering and QC
+        #"clean_data/fastp_summary.tsv",
+        #Step 2: alignment to the reference genome
+        #expand("aligned/{replicate}_R{i}.bam", replicate=REPLICATES, i=[1,2]),
+        #Step 3: contact matrix construction
+        #expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES),
+        #expand("Matrix/Merged/{sample}.cool", sample=SAMPLES),
+        #expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES),
+        #Step 4: format conversion
+        expand("Matrix/Final/{sample}.{format}", sample=SAMPLES, format=["mcool"]),
+        #Step 5: contact matrix QC
+        "Matrix/Raw/hicQC.html",
+        "Matrix/Raw/SCC.csv",
+        expand("Matrix/Final/{sample}_res.csv", sample=SAMPLES),
+        #Step 6: A/B compartments, TADs, loops
+        expand("Matrix/Diff/{resolution}/{contrast}.cool", resolution=config["TAD_resolution"], contrast=CONTRASTS),
+        expand("Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed", resolution=config["Compartment_resolution"], sample=SAMPLES),
+        expand("TADs/HiCExplorer/{resolution}/{sample}_domains.bed", resolution=config["TAD_resolution"], sample=SAMPLES),
+        expand("TADs/HiCExplorer/{resolution}/{contrast}_accepted.diff_tad", resolution=config["TAD_resolution"], contrast=CONTRASTS),
+        expand("Loops/Mustache/{resolution}/{sample}.tsv", resolution=config["Loop_resolution"], sample=SAMPLES),
+        expand("Loops/Mustache/{resolution}/{contrast}.diffloop1", resolution=config["Loop_resolution"], contrast=CONTRASTS),
+        # Visualization
+        expand("plots/{resolution}/{sample}_plotmatrix_allchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES),
+        expand("plots/{resolution}/{sample}_plotmatrix_perchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES)
+
+ 
+#********************************************************#
+#            Step 1: read filtering and QC               #
+#********************************************************#
+
+# QC and filter the raw paired-end reads with fastp
+rule fastp_quality_control:
+    input:
+        read1="raw_data/{replicate}_R1.fastq.gz", 
+        read2="raw_data/{replicate}_R2.fastq.gz"
+    output:
+        read1="clean_data/{replicate}_R1.fastq.gz", 
+        read2="clean_data/{replicate}_R2.fastq.gz", 
+        html_report="clean_data/{replicate}_fastp.html",
+        json_report="clean_data/{replicate}_fastp.json"
+    log: "logs/{replicate}_fastp_quality_control.log"
+    threads: 3
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    params:
+        fastp=config["fastp"]
+    shell:
+        """
+        fastp -i {input.read1} -o {output.read1} -I {input.read2} -O {output.read2} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
+        """
+
+# Combine the fastp reports into a summary table
+rule combine_fastp_reports:
+    input:
+        expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES)
+    output:
+        "clean_data/fastp_summary.tsv"
+    log: "logs/combine_fastp_reports.log"
+    params:
+        hicsnake_home=HICSNAKE_HOME
+    shell:
+        """
+        Rscript {params.hicsnake_home}/scripts/combine_fastp_report.R --input {input} --output clean_data/ 1>{log} 2>&1
+        """
+
+#********************************************************#
+#        Step 2: alignment to the reference genome       #
+#********************************************************#
+# Build the bwa index from the genome
+rule bwa_index:
+    input:
+        genome = "ref/" + GENOME + ".fa"
+    output:
+        genome_index = "ref/genome.bwt"
+    log: "logs/bwa_index.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    threads: 6
+    shell:
+        "bwa index -p ref/genome {input.genome} 1>{log} 2>&1"
+
+# Align each mate separately with bwa
+rule bwa:
+    input:
+        genome_index="ref/genome.bwt",
+        read="clean_data/{replicate}_R{i}.fastq.gz", 
+    output:
+        sam="aligned/{replicate}_R{i}.sam"
+    threads: 20
+    log: "logs/{replicate}_R{i}_bwa.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    shell:
+        """
+        bwa mem -A1 -B4 -E50 -L0 -t {threads} -o {output.sam} ref/genome {input.read} 2>{log}
+        """
+
+rule sam2bam:
+    input:
+        sam="aligned/{replicate}_R{i}.sam"
+    output:
+        bam="aligned/{replicate}_R{i}.bam"
+    threads: 6
+    log: "logs/{replicate}_R{i}_sam2bam.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    shell:
+        """
+        samtools view -hb --threads {threads} -o {output.bam} {input.sam} 2>{log}
+        """
+
+#********************************************************#
+#          Step 3: contact matrix construction           #
+#********************************************************#
+
+# Find restriction sites
+rule hicFindRestSite:
+    input:
+        genome="ref/" + GENOME + ".fa"
+    output:
+        rest_site="ref/rest_site.bed"
+    log: "logs/hicFindRestSite.log"
+    params:
+        restriction_sequence=config["restriction_sequence"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    shell:
+        """
+        hicFindRestSite --fasta {input.genome} --searchPattern {params.restriction_sequence} -o {output.rest_site} 1>{log} 2>&1
+        """
+
+# Build the cool-format matrix with hicBuildMatrix
+rule hicBuildMatrix:
+    input:
+        read1_bam="aligned/{replicate}_R1.bam",
+        read2_bam="aligned/{replicate}_R2.bam",
+        rest_site="ref/rest_site.bed"
+    output:
+        bam="Matrix/Raw/{replicate}.bam",
+        cool="Matrix/Raw/{replicate}.cool",
+        log="Matrix/Raw/{replicate}/QC.log"
+    log: "logs/{replicate}_hicBuildMatrix.log"
+    params:
+        genome="ref/" + GENOME + ".fa",
+        base_resolution=BASE_RESOLUTION,
+        hicBuildMatrix=config["hicBuildMatrix"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 6
+    shell:
+        """
+        hicBuildMatrix --samFiles {input.read1_bam} {input.read2_bam} \
+            --genomeAssembly {params.genome} \
+            --outBam {output.bam} \
+            --outFileName {output.cool} \
+            --QCfolder Matrix/Raw/{wildcards.replicate} \
+            --restrictionCutFile {input.rest_site} \
+            --binSize {params.base_resolution} \
+            --threads {threads} {params.hicBuildMatrix} 1>{log} 2>&1
+        """
+
+# Merge the matrices of biological replicates
+rule hicSumMatrices:
+    input:
+        cools=lambda wildcards: expand("Matrix/Raw/{replicate}.cool", replicate=SAMPLES2REPLICATES[wildcards.sample])
+    output:
+        cool="Matrix/Merged/{sample}.cool"
+    log: "logs/{sample}_hicSumMatrices.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 3
+    shell:
+        """
+        if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
+            hicSumMatrices --matrices {input.cools} --outFileName {output.cool} 1>{log} 2>&1
+        else
+            cp {input.cools[0]} {output.cool} 1>{log} 2>&1
+        fi
+        """
+
+# Normalize the contact matrices across samples
+rule hicNormalize:
+    input:
+        cools=expand("Matrix/Merged/{sample}.cool", sample=SAMPLES, allow_missing=True)
+    output:
+        cools=expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES, allow_missing=True)
+    params:
+        hicNormalize=config["hicNormalize"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 1
+    shell:
+        """
+        if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
+            hicNormalize --matrices {input.cools} --outFileName {output.cools} {params.hicNormalize}
+        else
+            cp {input.cools[0]} {output.cools[0]}
+        fi
+        """
+
+#********************************************************#
+#               Step 4: format conversion                #
+#********************************************************#
+
+# Convert to mcool format
+rule cooler_zoomify:
+    input:
+        cool="Matrix/Normalized/{sample}.cool"
+    output:
+        mcool="Matrix/Final/{sample}.mcool"
+    log: "logs/{sample}_cooler_zoomify.log"
+    params:
+        resolutions=",".join(map(str, RESOLUTIONS))
+    singularity:
+        HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
+    threads: 4
+    shell:
+        """
+        cooler zoomify {input.cool} -n {threads} --resolutions {params.resolutions} -o Matrix/Final/{wildcards.sample}_nobalanced.mcool 1>{log} 2>&1
+        cooler zoomify {input.cool} -n {threads} --balance --resolutions {params.resolutions} -o {output.mcool} 1>>{log} 2>&1
+        """
+
+# Convert to ginteractions format
+rule convert_to_ginteractions:
+    input:
+        cool="Matrix/Normalized/{sample}.cool"
+    output:
+        ginteractions="Matrix/Final/{sample}.ginteractions.tsv"
+    log: "logs/{sample}_hicConvertFormat_ginteractions.log"
+    params:
+        resolutions=RESOLUTIONS,
+        prefix="Matrix/Final/{sample}.ginteractions"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 1
+    shell:
+        """
+        hicConvertFormat --matrices {input.cool} --outFileName {params.prefix} --inputFormat cool --outputFormat ginteractions 1>{log} 2>&1
+        """
+
+# Convert to hic format
+rule convert_to_hic:
+    input:
+        ginteractions="Matrix/Final/{sample}.ginteractions.tsv",
+        chrom_sizes = "ref/" + GENOME + ".chrom.sizes"
+    output:
+        hic="Matrix/Final/{sample}.hic"
+    log: "logs/{sample}_convert_to_hic.log"
+    params:
+        resolutions=",".join(str(r) for r in RESOLUTIONS)
+    singularity:
+        HICSNAKE_HOME + "/sifs/juicer_20240401"
+    threads: 1
+    shell:
+        """
+        awk -F "\\t" '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {input.ginteractions} > Matrix/Final/{wildcards.sample}.short
+        sort -k2,2d -k6,6d Matrix/Final/{wildcards.sample}.short > Matrix/Final/{wildcards.sample}.short.sorted
+        java -Xms6g -Xmx40g -jar /opt/juicer_tools.jar pre -r {params.resolutions} Matrix/Final/{wildcards.sample}.short.sorted {output.hic} {input.chrom_sizes} 1>{log} 2>&1
+        """
+
+#********************************************************#
+#               Step 5: contact matrix QC                #
+#********************************************************#
+
+# Replicate-level QC with hicQC
+rule hicQC:
+    input:
+        logs=expand("Matrix/Raw/{replicate}/QC.log", replicate=REPLICATES)
+    output:
+        html="Matrix/Raw/hicQC.html"
+    params:
+        REPLICATES=REPLICATES
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 20
+    shell:
+        """
+        hicQC --logfiles {input.logs} --labels {params.REPLICATES} --outputFolder Matrix/Raw
+        """
+
+# Compute SCC (stratum-adjusted correlation coefficient) with hicrep
+rule hicrep:
+    input:
+        cools=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
+    output:
+        scc="Matrix/Raw/SCC.csv"
+    params:
+        replicates=REPLICATES,
+        hicrep=config["hicrep"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicrep_20240423.sif"
+    threads: 4
+    shell:
+        """
+        python scripts/scc.py --workers {threads} {params.hicrep} --output {output.scc} --sample_names {params.replicates} --files {input.cools}
+        """
+
+# Estimate the usable resolution of the data
+rule estimate_resolution:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        res="Matrix/Final/{sample}_res.csv"
+    params:
+        hicres=config["hicres"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
+    threads: 4
+    shell:
+        """
+        python scripts/hicres.py {input.mcool} --threads {threads} --output {output.res} {params.hicres}
+        """
+
+# hicPlotDistVsCounts: distance-decay curves, to compare samples/replicates
+rule hicPlotDistVsCounts:
+    input:
+        cool=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
+    output:
+        pdf="Matrix/Raw/DistVsCounts.pdf"
+    params:
+        replicates=REPLICATES,
+        hicPlotDistVsCounts=config["hicPlotDistVsCounts"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 2
+    shell:
+        """
+        hicPlotDistVsCounts --matrices {input.cool} --labels {params.replicates} --plotFile {output.pdf} {params.hicPlotDistVsCounts}
+        """ 
+#********************************************************#
+#          Step 6: compartments, TADs and loops          #
+#********************************************************#
+rule calder2:
+    input:
+        mcool="Matrix/Final/{sample}.mcool",
+    output:
+        "Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed"
+    params:
+        calder=config["calder"],
+        resolution="{resolution}"
+    threads: 4
+    log: "logs/{sample}_{resolution}_calder2.log"
+    shell:
+        """
+        Rscript scripts/calder -i {input.mcool} -t mcool -b {params.resolution} -p {threads} -o Compartments/CALDER2/{wildcards.resolution}/{wildcards.sample} {params.calder} 1>{log} 2>&1
+        """
+
+# Detect TADs with HiCExplorer
+rule hicFindTADs:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        boundaries="TADs/HiCExplorer/{resolution}/{sample}_boundaries.bed",
+        domains="TADs/HiCExplorer/{resolution}/{sample}_domains.bed",
+        score="TADs/HiCExplorer/{resolution}/{sample}_score.bedgraph"
+    params:
+        hicFindTADs=config["hicFindTADs"],
+        prefix="TADs/HiCExplorer/{resolution}/{sample}",
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 4
+    shell:
+        """
+        hicFindTADs -p {threads} --matrix {input.mcool}::/resolutions/{params.resolution} --outPrefix {params.prefix} {params.hicFindTADs}
+        """
+
+# Detect differential TADs with HiCExplorer
+rule hicDifferentialTAD:
+    input:
+        treatment_mcool="Matrix/Final/{treatment}.mcool",
+        control_mcool="Matrix/Final/{control}.mcool",
+        treatment_tads="TADs/HiCExplorer/{resolution}/{treatment}_domains.bed"
+    output:
+        accepted_diff_tad="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}_accepted.diff_tad"
+    params:
+        hicDifferentialTAD=config["hicDifferentialTAD"],
+        prefix="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}",
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 4
+    shell:
+        """
+        hicDifferentialTAD --targetMatrix {input.treatment_mcool}::/resolutions/{params.resolution} --controlMatrix {input.control_mcool}::/resolutions/{params.resolution} --tadDomains {input.treatment_tads} --outFileNamePrefix {params.prefix} {params.hicDifferentialTAD} --threads {threads}
+        """
+
+# Detect loops with Mustache
+rule mustache:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        loops="Loops/Mustache/{resolution}/{sample}.tsv"
+    params:
+        mustache=config["mustache"],
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
+    threads: 6
+    shell:
+        """
+        mustache --file {input.mcool} --resolution {params.resolution} --outfile {output.loops} --processes {threads} {params.mustache}
+        """
+
+# Detect differential loops with diff_mustache
+rule diff_mustache:
+    input:
+        treatment_mcool="Matrix/Final/{treatment}.mcool",
+        control_mcool="Matrix/Final/{control}.mcool",
+    output:
+        diffloop1="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop1",
+        diffloop2="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop2"
+    params:
+        diff_mustache=config["diff_mustache"],
+        prefix="Loops/Mustache/{resolution}/{treatment}_vs_{control}",
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
+    threads: 6
+    shell:
+        """
+        python /opt/conda/lib/python3.12/site-packages/mustache/diff_mustache.py -f1 {input.treatment_mcool} -f2 {input.control_mcool} --resolution {params.resolution} --outfile {params.prefix} --processes {threads} {params.diff_mustache}
+        """
+
+#********************************************************#
+#                     Visualization                      #
+#********************************************************#
+rule hicPlotMatrix:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        allchr_png="plots/{resolution}/{sample}_plotmatrix_allchr.png",
+        perchr_png="plots/{resolution}/{sample}_plotmatrix_perchr.png"
+    params:
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    shell:
+        """
+        hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.allchr_png} --log1p --colorMap YlOrRd --clearMaskedBins
+        hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.perchr_png} --log1p --colorMap YlOrRd --clearMaskedBins --perChromosome
+        """
+
+# Compare matrices with hicCompareMatrices (log2 ratio)
+rule hicCompareMatrices:
+    input:
+        treatment_mcool="Matrix/Final/{treatment}.mcool",
+        control_mcool="Matrix/Final/{control}.mcool",
+    output:
+        cool="Matrix/Diff/{resolution}/{treatment}_vs_{control}.cool"
+    params:
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 6
+    shell:
+        """
+        hicCompareMatrices --matrices {input.treatment_mcool}::/resolutions/{params.resolution} {input.control_mcool}::/resolutions/{params.resolution} --operation log2ratio -o {output.cool}
+        """

+ 3 - 0
chromosomes.txt

@@ -0,0 +1,3 @@
+Chr03
+Chr04
+Chr06

+ 65 - 0
config.yaml

@@ -0,0 +1,65 @@
+#######################
+# global
+#######################
+# Location of the HiCSnake pipeline
+HICSNAKE_HOME: /home/zhxd/workspace/20240527_HiC/HicSnake
+# Genome prefix, i.e. the FASTA is ref/$genome.fa
+genome: genome
+# Restriction enzyme recognition site
+restriction_sequence: GATC
+
+#######################
+# resolutions
+#######################    
+# All resolutions; the smallest is the base resolution, and the others should be integer multiples of it
+resolutions:
+  - 1000
+  - 2000
+  - 5000
+  - 10000
+  - 15000
+  - 20000
+  - 25000
+  - 40000
+  - 100000
+  - 250000
+  - 500000
+  - 1000000
+# Resolutions for CALDER2, 10 kb to 100 kb; multiple values allowed
+Compartment_resolution:
+  - 40000
+# Resolutions for hicFindTADs, 40 kb to 100 kb; multiple values allowed
+TAD_resolution:
+  - 40000
+# Resolutions for Mustache, 1 kb to 20 kb; multiple values allowed
+Loop_resolution:
+  - 10000
+Plot_matrix_resolution:
+  - 40000
+
+#######################
+# parameters
+#######################
+# fastp filtering
+fastp: "--adapter_sequence AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT --qualified_quality_phred 20 --length_required 50"
+# Matrix construction
+hicBuildMatrix: "--minDistance 500 --maxLibraryInsertSize 1500 --inputBufferSize 1000000 --restrictionSequence GATC --danglingSequence GATC"
+# Normalization across samples; usually no need to change
+hicNormalize: "--normalize smallest"
+# See https://github.com/TaoYang-dev/hicrep
+hicrep: "--h 20 --dBPMax 500000"
+# Resolution estimation
+hicres: "--resolutions 1000,2000,5000,10000"
+# Detect subcompartments with CALDER2; -f sets the feature track, --chr_prefix the original chromosome prefix (must be consistent)
+calder: "-g others -f ref/gene_density.bed --chr_prefix Chr"
+# TAD calling with hicFindTADs
+hicFindTADs: " --correctForMultipleTesting fdr --thresholdComparisons 0.01 --delta 0.01"
+hicDifferentialTAD: "--pValue 0.05 --mode all --modeReject one"
+hicDetectLoops: "--maxLoopDistance 2000000 --windowSize 10 --peakWidth 6 --pValuePreselection 0.05 --pValue 0.05"
+#mustache: "-pt 0.05 -st 0.88"
+mustache: ""
+#diff_mustache: "-pt 0.05 -pt2 0.05 -st 0.88"
+diff_mustache: ""
+hicPlotDistVsCounts: ""
+
+stripenn: "" 

+ 1 - 0
contrasts.txt

@@ -0,0 +1 @@
+leaf	bud

+ 0 - 0
raw_data/bud_Rep1_R1.fastq.gz


+ 0 - 0
raw_data/bud_Rep1_R2.fastq.gz


+ 0 - 0
raw_data/bud_Rep2_R1.fastq.gz


+ 0 - 0
raw_data/bud_Rep2_R2.fastq.gz


+ 0 - 0
raw_data/leaf_Rep1_R1.fastq.gz


+ 0 - 0
raw_data/leaf_Rep1_R2.fastq.gz


+ 0 - 0
raw_data/leaf_Rep2_R1.fastq.gz


+ 0 - 0
raw_data/leaf_Rep2_R2.fastq.gz


+ 0 - 0
ref/gene_density.bed


+ 0 - 0
ref/genome.chrom.sizes


+ 0 - 0
ref/genome.fa


+ 6 - 0
run.sh

@@ -0,0 +1,6 @@
+# Generate the workflow DAG
+snakemake --dag | dot -Tpdf > workflow.pdf
+# Dry run: print the commands without executing them
+snakemake --use-singularity --keep-going --cores 40 --rerun-triggers mtime -np
+# Run the pipeline
+snakemake --use-singularity --keep-going --cores 40 --rerun-triggers mtime 
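+# Build a single target (example): name an output file and Snakemake runs only
+# the rules needed to produce it, e.g.
+# snakemake --use-singularity --cores 10 Matrix/Final/leaf.mcool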

+ 5 - 0
samples.txt

@@ -0,0 +1,5 @@
+replicate	sample
+leaf_Rep1	leaf
+leaf_Rep2	leaf
+bud_Rep1	bud
+bud_Rep2	bud

+ 75 - 0
scripts/LongestTranscript.py

@@ -0,0 +1,75 @@
+import argparse
+import pandas as pd
+
+def parse_attributes(attributes_str):
+    """ 解析 GTF 文件中的属性字段 """
+    attributes = {}
+    for attr in attributes_str.split(';'):
+        attr = attr.strip()
+        if attr:
+            parts = attr.split(' ', 1)
+            if len(parts) == 2:
+                key, value = parts
+                attributes[key] = value.strip('"')
+    return attributes
+
+def read_gtf(gtf_file):
+    """ 从 GTF 文件读取数据 """
+    data = []
+    with open(gtf_file, 'r') as file:
+        for line in file:
+            if line.startswith('#'):
+                continue
+            fields = line.strip().split('\t')
+            attributes = parse_attributes(fields[8])
+            data.append({
+                'chrom': fields[0],
+                'source': fields[1],
+                'feature': fields[2],
+                'start': int(fields[3]),
+                'end': int(fields[4]),
+                'score': fields[5],
+                'strand': fields[6],
+                'frame': fields[7],
+                'attributes': fields[8],
+                'transcript_id': attributes.get('transcript_id', ''),
+                'gene_id': attributes.get('gene_id', ''),
+                'line': line.strip()  # keep the original line
+            })
+    return pd.DataFrame(data)
+
+def longest_transcripts(df):
+    """Return the longest transcript of each gene."""
+    transcripts = df[df['feature'] == 'transcript'].copy()
+    # length, not end coordinate, determines the longest transcript
+    transcripts['length'] = transcripts['end'] - transcripts['start']
+    return transcripts.sort_values(['gene_id', 'length'], ascending=[True, False]).drop_duplicates('gene_id')
+
+def to_bed6(df):
+    """Convert to BED6 format (0-based, half-open coordinates)."""
+    df = df.copy()
+    df['start'] = df['start'] - 1  # GTF is 1-based; BED is 0-based
+    df['score'] = "."
+    df['name'] = df['gene_id']
+    return df[['chrom', 'start', 'end', 'name', 'score', 'strand']]
+
+def write_filtered_gtf(df, longest_transcripts, gtf_out_file):
+    """ 将最长转录本的相关行和对应基因的行写入新的 GTF 文件 """
+    longest_transcript_ids = set(longest_transcripts['transcript_id'])
+    with open(gtf_out_file, 'w') as f:
+        for _, row in df.iterrows():
+            if row['feature'] == 'gene' or row['transcript_id'] in longest_transcript_ids:
+                f.write(row['line'] + '\n')
+
+def main():
+    parser = argparse.ArgumentParser(description="从 GTF 文件提取最长转录本并保存为 BED6 和 GTF 格式")
+    parser.add_argument("gtf_file", help="输入的 GTF 文件路径")
+    parser.add_argument("bed_file", help="输出的 BED6 格式文件路径")
+    parser.add_argument("gtf_out_file", help="输出的 GTF 格式文件路径")
+    args = parser.parse_args()
+
+    df = read_gtf(args.gtf_file)
+    longest_df = longest_transcripts(df)
+    bed6_df = to_bed6(longest_df)
+    bed6_df.to_csv(args.bed_file, sep='\t', index=False, header=False)
+
+    write_filtered_gtf(df, longest_df, args.gtf_out_file)
+
+if __name__ == "__main__":
+    main()
+
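+# Example usage (hypothetical file names):
+#   python LongestTranscript.py annotation.gtf longest_transcripts.bed longest_transcripts.gtf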

+ 297 - 0
scripts/calder

@@ -0,0 +1,297 @@
+#!/usr/bin/env Rscript
+###########################################################################
+# 该程序修改自 https://github.com/CSOgroup/CALDER2/blob/main/scripts/calder
+# 1. 源程序无法正确处理染色体名称不以 chr 为前缀的情况
+# 2. 只修改了 cool 格式作为输入文件,hic 格式预计仍然有 bug 未测试
+# 3. 如果染色体前缀不一致,无法处理
+# 4. 增加了对 mcool 格式的兼容
+# 修改人:张旭东 zhangxudong@genek.cn
+###########################################################################
+
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(CALDER))
+
+AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
+INPUT_TYPES <- c("hic", "cool", "mcool")
+CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY")
+
+
+parse_arguments <- function(){
+	# Creating the argument parsing options
+	option_list = list(
+	  make_option(c("-i", "--input"), action="store", default=NA, type='character',
+	              help="Input Hi-C contacts"),
+	  # Added mcool as an input type
+	  make_option(c("-t", "--type"), action="store", default='hic', type='character',
+	              help="The type of input: hic or cool or mcool [default %default]"),
+	  make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
+	              help="Bin size to use for the analysis [default %default]"),
+	  make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
+	              help="Genome assembly to use [default %default]"),
+	  make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
+	              help="Genomic feature track to be used to determine A/B compartment direction
+	              when genome == 'others'. The track should presumably have higher values
+	              in A than in B compartments. [default %default]"),
+	  # Added an option for the original chromosome prefix
+	  make_option(c("--chr_prefix"), action="store", default="chr", type='character',
+	              help="old chr prefix [default %default]"),
+	  make_option(c("-c", "--chromosomes"), action='store', default='all', type='character',
+	  			  help="Chromosomes to analyze, separated by comma. [default %default]"),
+	  make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
+	              help="Number of cores to use [default %default]"),
+	  make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
+	              help="Path to the output folder"),
+	  make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
+	              help="Keep intermediate data after done [default %default]"),
+	  make_option(c("-a", "--adaptive"), action="store_true", default=FALSE, type='logical',
+	              help="Use adaptive resolution choice [default %default]")
+	)
+	parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
+	opt <- parse_args(parser)
+
+	# Checking that the input path was provided and exists
+	if(is.na(opt$input) || !file.exists(opt$input)){
+		print_help(parser)
+		stop(paste0("Input path (", opt$input, ") does not exist"))
+	}
+
+	# Checking if output path is provided
+	if(is.na(opt$outpath)){
+		stop("Output path was not provided")
+	}
+
+	# Check that the input type is one of the possible ones
+	if(!(opt$type %in% INPUT_TYPES)){
+		stop(paste0("Input type ", opt$type, " not available"))
+	}
+
+	# Check if the provided genome is in the list of available reference genomes 
+	# or if a feature track is provided
+	if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
+		# in this case, we just assign it the name 'others'
+		opt$genome = "others"
+	}
+
+	writeLines(c(
+		"*******************************",
+		"*            CALDER           *",
+		"*******************************",
+		paste0("[Parameters] Input: ", opt$input),
+		paste0("[Parameters] Input type: ", opt$type),
+		paste0("[Parameters] Bin size: ", opt$bin_size),
+		paste0("[Parameters] Genome: ", opt$genome),
+		paste0("[Parameters] Feature Track: ", opt$feature_track),
+		paste0("[Parameters] Chr Prefix: ", opt$chr_prefix),
+		paste0("[Parameters] Chromosomes: ", opt$chromosomes),
+		paste0("[Parameters] N. cores: ", opt$nproc),
+		paste0("[Parameters] Output: ", opt$outpath),
+		paste0("[Parameters] Keep Intermediate data: ", opt$keep_intermediate),
+		paste0("[Parameters] Use adaptive resolution: ", opt$adaptive)
+	))
+
+	if(file.exists(opt$feature_track)){
+		opt$feature_track <- read.table(opt$feature_track, header = F)
+	}
+
+	return(opt)
+}
+
+sanitize_chroms <- function(chroms){
+	res <- lapply(chroms, function(x){
+		if(startsWith(x, opt$chr_prefix)){
+			# strip the prefix by its actual length (the original hardcoded 3 characters)
+			return(substring(x, nchar(opt$chr_prefix) + 1))
+		} else{
+			return(x)
+		}
+	})
+	return(res)
+}
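+# Example: with --chr_prefix Chr, "Chr03" is sanitized to "03"; the original
+# prefix is restored in the output files by replace_chr_in_files() at the end.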
+
+handle_input_hic <- function(opt){
+	suppressPackageStartupMessages(library(strawr))
+	chromsizes <- readHicChroms(opt$input)
+	if(opt$chromosomes == "all"){
+		chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"]
+	}
+	else{
+		chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
+		chroms <- chromsizes[chromsizes$name %in% chrom_list, "name"]
+	}
+	chroms <- sanitize_chroms(chroms)
+	CALDER(contact_file_hic = opt$input,
+		   chrs = chroms,
+		   bin_size = opt$bin_size,
+		   genome = opt$genome,
+		   save_dir=opt$outpath,
+		   save_intermediate_data=TRUE,
+		   feature_track=opt$feature_track,
+		   single_binsize_only=!opt$adaptive,
+		   n_cores = opt$nproc,
+		   sub_domains=T)
+}
+
+# Added support for mcool input
+handle_input_mcool <- function(opt){
+	intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+	dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
+
+	opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1])
+
+	system(paste0("cooler dump --table chroms --out ", 
+				  file.path(intermediate_data_dir, "chroms.txt"), 
+				  " --header ", 
+				  opt$input,
+				  "::/resolutions/",
+				  opt$bin_size
+				  ))
+	chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
+	if(opt$chromosomes == "all"){
+		chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
+	}
+	else{
+		chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
+		chroms <- chroms[chroms$name %in% chrom_list, "name"]
+	}
+	
+	dump_paths <- list()
+	for(chrom in chroms){
+		cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
+		chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
+		dump_paths <- c(dump_paths, chrom_dump_path)
+		if(! file.exists(chrom_dump_path)){
+			system(paste0("cooler dump --table pixels --range ", 
+						  chrom, 
+						  " --join --balanced ",
+						  opt$input,
+						  "::/resolutions/",
+						  opt$bin_size,						  
+						  " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
+						  chrom_dump_path))
+		}
+	}
+
+	chroms <- sanitize_chroms(chroms)
+	names(dump_paths) <- chroms
+
+	CALDER(contact_file_dump=dump_paths, 
+		   chrs=chroms, 
+		   bin_size=opt$bin_size,
+		   genome=opt$genome,
+		   save_dir=opt$outpath,
+		   feature_track=opt$feature_track,
+		   single_binsize_only=!opt$adaptive,
+		   save_intermediate_data=TRUE,
+		   n_cores=opt$nproc,
+		   sub_domains=T)
+
+}
+
+handle_input_cool <- function(opt){
+  intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+  dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
+  # Map the original chromosome prefix to "chr" in the feature track
+  opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1])
+  
+  system(paste0("cooler dump --table chroms --out ", 
+                file.path(intermediate_data_dir, "chroms.txt"), 
+                " --header ", 
+                opt$input
+  ))
+  chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
+  if(opt$chromosomes == "all"){
+    chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
+  }
+  else{
+    chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
+    chroms <- chroms[chroms$name %in% chrom_list, "name"]
+  }
+  
+  dump_paths <- list()
+  for(chrom in chroms){
+    cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
+    chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
+    dump_paths <- c(dump_paths, chrom_dump_path)
+    if(! file.exists(chrom_dump_path)){
+      system(paste0("cooler dump --table pixels --range ", 
+                    chrom, 
+                    " --join --balanced ",
+                    opt$input,
+                    " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
+                    chrom_dump_path))
+    }
+  }
+  
+  chroms <- sanitize_chroms(chroms)
+  names(dump_paths) <- chroms
+  
+  CALDER(contact_file_dump=dump_paths, 
+         chrs=chroms, 
+         bin_size=opt$bin_size,
+         genome=opt$genome,
+         save_dir=opt$outpath,
+         feature_track=opt$feature_track,
+         single_binsize_only=!opt$adaptive,
+         save_intermediate_data=TRUE,
+         n_cores=opt$nproc,
+         sub_domains=T)
+  
+}
+
+opt <- parse_arguments()
+
+if(opt$type == "hic"){
+	handle_input_hic(opt)
+} else if(opt$type == "mcool"){
+	handle_input_mcool(opt)
+} else if(opt$type == "cool"){
+  handle_input_cool(opt)
+} else {
+	stop("Unknown input type")
+}
+
+# Cleaning the output
+intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
+	cat('[Post-processing] Removing intermediate data\n')
+	unlink(intermediate_data_dir, recursive=TRUE)
+}
+exec_time_file = "./total_execution.time"
+if(file.exists(exec_time_file)){
+	cat("[Post-processing] Removing total_execution.time\n")
+	file.remove(exec_time_file)
+}
+
+# Replace "chr" with the original chromosome prefix in the output files
+replace_chr_in_files <- function(dir_path, old_prefix, new_prefix) {
+  # Common text file extensions
+  text_file_extensions <- c("txt", "tsv", "csv", "bed")
+  
+  # All files and subdirectories in the output folder
+  files <- list.files(dir_path, recursive = TRUE, full.names = TRUE)
+  
+  for (file in files) {
+    # Get the file extension
+    file_ext <- tools::file_ext(file)
+    
+    # Only process regular text files
+    if (file.info(file)$isdir == FALSE && file_ext %in% text_file_extensions) {
+      tryCatch({
+        # Read the file
+        content <- readLines(file)
+        
+        # Replace the old prefix at the start of each line with the new one
+        content <- gsub(paste0("^", old_prefix), new_prefix, content)
+        
+        # Write back
+        writeLines(content, file)
+        
+        cat("Processed:", file, "\n")
+      }, error = function(e) {
+        cat("Failed to process:", file, "\n", conditionMessage(e), "\n")
+      })
+    }
+  }
+}
+
+# Restore the original prefix. Pass "chr" without the "^" anchor: the function
+# prepends "^" itself, so passing "^chr" would build the pattern "^^chr" and never match.
+replace_chr_in_files(opt$outpath, "chr", opt$chr_prefix)

+ 73 - 0
scripts/combine_fastp_report.R

@@ -0,0 +1,73 @@
+suppressPackageStartupMessages({
+  library(jsonlite)
+  library(dplyr)
+  library(tidyr)
+  library(purrr)
+  library(argparse)
+})
+
+# Build the command-line argument parser
+parser <- ArgumentParser()
+parser$add_argument("--input", type="character", nargs='+', help="input fastp JSON reports")
+parser$add_argument("--output", type="character", default="report", help="output folder to store fastp summary TSV and RData files")
+
+# Parse the command-line arguments
+args <- parser$parse_args()
+
+# Input files and output folder
+json_files <- args$input
+output_folder <- args$output
+
+# Make sure input files were provided
+if (length(json_files) == 0) {
+  stop("No input files provided.")
+}
+
+# Create the output folder
+dir.create(output_folder, showWarnings = FALSE, recursive = TRUE)
+
+# Output file path
+output_file <- file.path(output_folder, "fastp_summary.tsv")
+
+# Fields to extract
+fields <- c(
+  "total_reads",
+  "total_bases",
+  "q30_rate",
+  "gc_content"
+)
+
+# Initialize an empty data.frame to collect the results
+fastp_summary <- data.frame()
+
+# Iterate over all JSON files
+for (json_file in json_files) {
+  # Read the JSON file
+  data <- fromJSON(json_file)
+  
+  # Derive the sample name from the file name
+  sample_name <- gsub("_fastp.json", "", basename(json_file))
+  
+  # Extract the desired fields before and after filtering
+  before_filtering <- data$summary$before_filtering[fields]
+  after_filtering <- data$summary$after_filtering[fields]
+  
+  # Append the result to the data.frame
+  result <- data.frame(sample = sample_name)
+  result <- cbind(result, before_filtering, after_filtering)
+  fastp_summary <- rbind(fastp_summary, result)
+}
+
+# Rename the columns
+colnames(fastp_summary) <- c("sample",
+                             paste0("before_", fields),
+                             paste0("after_", fields))
+
+# Sort by sample name
+fastp_summary <- fastp_summary %>% arrange(sample)
+
+# Write the results to the TSV file
+write.table(fastp_summary, output_file, sep = "\t", row.names = FALSE, quote = FALSE)
+
+# Save fastp_summary to an RData file
+save(fastp_summary, file = file.path(output_folder, "fastp_summary.RData"))
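+
+# Example usage (as invoked by the Snakefile's combine_fastp_reports rule):
+#   Rscript combine_fastp_report.R --input clean_data/*_fastp.json --output clean_data/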

BIN
scripts/faCount


+ 126 - 0
scripts/hicres.py

@@ -0,0 +1,126 @@
+import cooler
+import pandas as pd
+import numpy as np
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def calculate_valid_bins_percentage(file_path, threshold=1000, num_threads=1, chunk_size=100000, resolutions=None):
+    if file_path.endswith('.mcool'):
+        return calculate_mcool_valid_bins_percentage(file_path, threshold, num_threads, chunk_size, resolutions)
+    else:
+        return calculate_cool_valid_bins_percentage(file_path, threshold, chunk_size)
+
+def calculate_cool_valid_bins_percentage(file_path, threshold=1000, chunk_size=100000):
+    clr = cooler.Cooler(file_path)
+    bins = clr.bins()[:]
+    total_valid_bins = 0
+    total_bins = 0
+
+    for start in range(0, len(bins), chunk_size):
+        end = min(start + chunk_size, len(bins))
+        matrix = clr.matrix(balance=False)[start:end, start:end]
+
+        bin_sums = np.sum(matrix, axis=0) + np.sum(matrix, axis=1) - np.diag(matrix)
+        valid_bins = np.sum(bin_sums > threshold)
+        total_valid_bins += valid_bins
+        total_bins += bin_sums.size
+
+    valid_bin_percentage = (total_valid_bins / total_bins) * 100 if total_bins > 0 else 0
+
+    return {
+        'File': file_path,
+        'Valid Bin Percentage': valid_bin_percentage,
+        'Total Valid Bins': total_valid_bins,
+        'Total Bins': total_bins
+    }
+
+def calculate_mcool_valid_bins_percentage(file_path, threshold=1000, num_threads=1, chunk_size=100000, resolutions=None):
+    available_resolutions = cooler.fileops.list_coolers(file_path)
+
+    print(f"Available resolutions in {file_path}: {available_resolutions}")
+    if resolutions:
+        resolutions = [f'/resolutions/{res}' for res in resolutions.split(',') if f'/resolutions/{res}' in available_resolutions]
+        print(resolutions)
+        if not resolutions:
+            raise ValueError(f"No matching resolutions found in the .mcool file for specified resolutions.")
+    else:
+        resolutions = available_resolutions
+
+    results = []
+
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        future_to_resolution = {
+            executor.submit(process_resolution, file_path, resolution_path, threshold, chunk_size): resolution_path
+            for resolution_path in resolutions
+        }
+
+        for future in as_completed(future_to_resolution):
+            resolution_path = future_to_resolution[future]
+            try:
+                result = future.result()
+                if result is not None:
+                    results.append(result)
+            except Exception as e:
+                print(f"Error processing resolution {resolution_path}: {e}")
+
+    return {
+        'File': file_path,
+        'Results': results
+    }
+
+def process_resolution(file_path, resolution_path, threshold, chunk_size):
+    clr_res = cooler.Cooler(f"{file_path}::{resolution_path}")
+    bins = clr_res.bins()[:]
+
+    print(f"Processing resolution: {resolution_path}")
+
+    total_valid_bins = 0
+    total_bins = 0
+
+    for start in range(0, len(bins), chunk_size):
+        end = min(start + chunk_size, len(bins))
+        matrix = clr_res.matrix(balance=False)[start:end, start:end]
+
+        bin_sums = np.sum(matrix, axis=0) + np.sum(matrix, axis=1) - np.diag(matrix)
+        valid_bins = np.sum(bin_sums > threshold)
+        total_valid_bins += valid_bins
+        total_bins += bin_sums.size
+
+    valid_bin_percentage = (total_valid_bins / total_bins) * 100 if total_bins > 0 else 0
+
+    return {
+        'Resolution': resolution_path.split('/')[-1],
+        'Valid Bin Percentage': valid_bin_percentage,
+        'Total Valid Bins': total_valid_bins,
+        'Total Bins': total_bins
+    }
+
+def save_results_to_csv(results, output_path):
+    if 'Results' in results:
+        df = pd.DataFrame(results['Results'])
+        df.insert(0, 'File', results['File'])
+    else:
+        df = pd.DataFrame([results])
+
+    df.to_csv(output_path, index=False)
+    print(f"Results saved to {output_path}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Calculate valid bins percentage from .cool or .mcool files.")
+    parser.add_argument("file_path", type=str, help="Path to the .cool or .mcool file")
+    parser.add_argument("--output", type=str, required=True, help="Path to save the results as CSV file")
+    parser.add_argument("--threads", type=int, default=1, help="Number of threads to use for processing")
+    parser.add_argument("--chunk_size", type=int, default=100000, help="Chunk size for processing large matrices")
+    parser.add_argument("--resolutions", type=str, help="Comma-separated list of resolutions to process in the .mcool file")
+
+    args = parser.parse_args()
+
+    try:
+        results = calculate_valid_bins_percentage(args.file_path, num_threads=args.threads, chunk_size=args.chunk_size, resolutions=args.resolutions)
+        save_results_to_csv(results, args.output)
+    except Exception as e:
+        print(f"Error processing file {args.file_path}: {e}")
+
+if __name__ == "__main__":
+    main()
+
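+# Example usage (as invoked by the Snakefile's estimate_resolution rule):
+#   python hicres.py Matrix/Final/leaf.mcool --threads 4 --output leaf_res.csv \
+#       --resolutions 1000,2000,5000,10000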

+ 69 - 0
scripts/scc.py

@@ -0,0 +1,69 @@
+import argparse
+import cooler
+import logging
+import pandas as pd
+from hicrep import hicrepSCC
+from itertools import combinations
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def calculate_pair_scc(pair, h, dBPMax, bDownSample):
+    (file1, name1), (file2, name2) = pair  # unpack the (file, sample name) tuples
+    logging.info(f"Starting SCC calculation between {name1} and {name2}")
+
+    cool1 = cooler.Cooler(file1)
+    cool2 = cooler.Cooler(file2)
+    binSize1 = cool1.info['bin-size']
+    binSize2 = cool2.info['bin-size']
+    chromosomes = cool1.chromnames
+
+    if binSize1 != binSize2:
+        logging.error("Bin sizes of the two cool files must be equal.")
+        raise ValueError("Bin sizes of the two cool files must be equal.")
+
+    start_time = time.time()
+    scc_scores = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
+    elapsed_time = time.time() - start_time
+    logging.info(f"Finished SCC calculation between {name1} and {name2} in {elapsed_time:.2f} seconds")
+
+    results = [{'SampleA': name1, 'SampleB': name2, 'Chromosome': chrom, 'SCC': scc}
+               for chrom, scc in zip(chromosomes, scc_scores)]
+    return results
+
+def compute_scc(files, sample_names, h, dBPMax, bDownSample, num_workers=4):
+    pairs = list(combinations(zip(files, sample_names), 2))
+    results = []
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [executor.submit(calculate_pair_scc, pair, h, dBPMax, bDownSample) for pair in pairs]
+        for future in futures:
+            results.extend(future.result())
+    return results
+
+def main():
+    parser = argparse.ArgumentParser(description="Compute SCC for pairs of .cool files.")
+    parser.add_argument('--files', nargs='+', required=True, help="List of .cool files to process.")
+    parser.add_argument('--sample_names', nargs='+', required=True, help="List of sample names corresponding to the .cool files.")
+    parser.add_argument('--output', type=str, required=True, help="Output file path to store the results.")
+    parser.add_argument('--h', type=int, default=1, help="Smoothing window radius.")
+    parser.add_argument('--dBPMax', type=int, default=500000, help="Maximum genomic distance.")
+    parser.add_argument('--bDownSample', action='store_true', help="Whether to perform downsampling.")
+    parser.add_argument('--workers', type=int, default=4, help="Number of workers for parallel computation.")
+
+    args = parser.parse_args()
+
+    if len(args.files) != len(args.sample_names):
+        logging.error("The number of files must match the number of sample names provided.")
+        return
+
+    logging.info("Starting SCC computations for all combinations of provided .cool files.")
+    scc_results = compute_scc(args.files, args.sample_names, args.h, args.dBPMax, args.bDownSample, args.workers)
+    
+    df = pd.DataFrame(scc_results)
+    df.to_csv(args.output, index=False)
+    logging.info(f"Results have been saved to {args.output}")
+
+if __name__ == "__main__":
+    main()
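+
+# Example usage (as invoked by the Snakefile's hicrep rule; file names illustrative):
+#   python scc.py --workers 4 --h 20 --dBPMax 500000 --output SCC.csv \
+#       --sample_names leaf_Rep1 leaf_Rep2 --files Matrix/Raw/leaf_Rep1.cool Matrix/Raw/leaf_Rep2.cool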

BIN
workflow.pdf