first commit

zhangxudong 5 months ago
commit
426a09c776

+ 487 - 0
Snakefile

@@ -0,0 +1,487 @@
+import os
+import yaml
+import pandas as pd
+
+configfile: "config.yaml"
+##########################################################
+# Global variables and sample information
+##########################################################
+HICSNAKE_HOME = config.get("HICSNAKE_HOME") or os.getcwd()
+GENOME = config["genome"]
+RESOLUTIONS = config["resolutions"]
+BASE_RESOLUTION = min(RESOLUTIONS)
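+# Sanity check (a small addition; config.yaml states that every resolution
+# should be an integer multiple of the base resolution, since coarser bins
+# are built by aggregating the base-resolution matrix):
+assert all(r % BASE_RESOLUTION == 0 for r in RESOLUTIONS), \
+    "all resolutions must be integer multiples of the base resolution"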
+
+with open('chromosomes.txt', 'r') as file:
+    CHROMOSOMES = [line.strip() for line in file]
+
+
+meta_data = pd.read_csv('samples.txt', sep='\t')
+REPLICATES = sorted(meta_data['replicate'].tolist())
+SAMPLES2REPLICATES = meta_data.groupby('sample')['replicate'].apply(list).to_dict()
+SAMPLES = list(SAMPLES2REPLICATES.keys())
+
+contrasts_data = pd.read_csv('contrasts.txt', sep='\t', header=None)
+CONTRASTS = [f"{row[0]}_vs_{row[1]}" for row in contrasts_data.itertuples(index=False)]
+
+##########################################################
+# rule all: the final files we want to generate
+##########################################################
+rule all:
+    input:
+        #Step 1: read filtering and QC
+        #"clean_data/fastp_summary.tsv",
+        #Step 2: alignment to the reference genome
+        #expand("aligned/{replicate}_R{i}.bam", replicate=REPLICATES, i=[1,2]),
+        #Step 3: contact matrix construction
+        #expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES),
+        #expand("Matrix/Merged/{sample}.cool", sample=SAMPLES),
+        #expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES),
+        #Step 4: format conversion
+        expand("Matrix/Final/{sample}.{format}", sample=SAMPLES, format=["mcool"]),
+        #Step 5: contact matrix QC
+        "Matrix/Raw/hicQC.html",
+        "Matrix/Raw/SCC.csv",
+        expand("Matrix/Final/{sample}_res.csv", sample=SAMPLES),
+        #Step 6: A/B compartments, TADs, loops
+        expand("Matrix/Diff/{resolution}/{contrast}.cool", resolution=config["TAD_resolution"], contrast=CONTRASTS),
+        expand("Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed", resolution=config["Compartment_resolution"], sample=SAMPLES),
+        expand("TADs/HiCExplorer/{resolution}/{sample}_domains.bed", resolution=config["TAD_resolution"], sample=SAMPLES),
+        expand("TADs/HiCExplorer/{resolution}/{contrast}_accepted.diff_tad", resolution=config["TAD_resolution"], contrast=CONTRASTS),
+        expand("Loops/Mustache/{resolution}/{sample}.tsv", resolution=config["Loop_resolution"], sample=SAMPLES),
+        expand("Loops/Mustache/{resolution}/{contrast}.diffloop1", resolution=config["Loop_resolution"], contrast=CONTRASTS),
+        # Visualization
+        expand("plots/{resolution}/{sample}_plotmatrix_allchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES),
+        expand("plots/{resolution}/{sample}_plotmatrix_perchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES)
+
+ 
+#********************************************************#
+#            Step 1: read filtering and QC               #
+#********************************************************#
+
+# QC and filter the raw paired-end reads with fastp
+rule fastp_quality_control:
+    input:
+        read1="raw_data/{replicate}_R1.fastq.gz", 
+        read2="raw_data/{replicate}_R2.fastq.gz"
+    output:
+        read1="clean_data/{replicate}_R1.fastq.gz", 
+        read2="clean_data/{replicate}_R2.fastq.gz", 
+        html_report="clean_data/{replicate}_fastp.html",
+        json_report="clean_data/{replicate}_fastp.json"
+    log: "logs/{replicate}_fastp_quality_control.log"
+    threads: 3
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    params:
+        fastp=config["fastp"]
+    shell:
+        """
+        fastp -i {input.read1} -o {output.read1} -I {input.read2} -O {output.read2} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
+        """
+
+# Combine the fastp reports into a summary table
+rule combine_fastp_reports:
+    input:
+        expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES)
+    output:
+        "clean_data/fastp_summary.tsv"
+    log: "logs/combine_fastp_reports.log"
+    params:
+        hicsnake_home=HICSNAKE_HOME
+    shell:
+        """
+        Rscript {params.hicsnake_home}/scripts/combine_fastp_report.R --input {input} --output clean_data/ 1>{log} 2>&1
+        """
+
+#********************************************************#
+#        Step 2: alignment to the reference genome       #
+#********************************************************#
+# Build the bwa index from the genome
+rule bwa_index:
+    input:
+        genome = "ref/" + GENOME + ".fa"
+    output:
+        genome_index = "ref/genome.bwt"
+    log: "logs/bwa_index.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    threads: 6
+    shell:
+        "bwa index -p ref/genome {input.genome} 1>{log} 2>&1"
+
+# Align each mate separately with bwa
+rule bwa:
+    input:
+        genome_index="ref/genome.bwt",
+        read="clean_data/{replicate}_R{i}.fastq.gz", 
+    output:
+        sam="aligned/{replicate}_R{i}.sam"
+    threads: 20
+    log: "logs/{replicate}_R{i}_bwa.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    shell:
+        """
+        bwa mem -A1 -B4 -E50 -L0 -t {threads} -o {output.sam} ref/genome {input.read} 2>{log}
+        """
+
+rule sam2bam:
+    input:
+        sam="aligned/{replicate}_R{i}.sam"
+    output:
+        bam="aligned/{replicate}_R{i}.bam"
+    threads: 6
+    log: "logs/{replicate}_R{i}_sam2bam.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
+    shell:
+        """
+        samtools view -hb --threads {threads} -o {output.bam} {input.sam} 2>{log}
+        """
+
+#********************************************************#
+#          Step 3: contact matrix construction           #
+#********************************************************#
+
+# Find restriction sites
+rule hicFindRestSite:
+    input:
+        genome="ref/" + GENOME + ".fa"
+    output:
+        rest_site="ref/rest_site.bed"
+    log: "logs/hicFindRestSite.log"
+    params:
+        restriction_sequence=config["restriction_sequence"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    shell:
+        """
+        hicFindRestSite --fasta {input.genome} --searchPattern {params.restriction_sequence} -o {output.rest_site} 1>{log} 2>&1
+        """
+
+# Build the cool-format matrix with hicBuildMatrix
+rule hicBuildMatrix:
+    input:
+        read1_bam="aligned/{replicate}_R1.bam",
+        read2_bam="aligned/{replicate}_R2.bam",
+        rest_site="ref/rest_site.bed"
+    output:
+        bam="Matrix/Raw/{replicate}.bam",
+        cool="Matrix/Raw/{replicate}.cool",
+        log="Matrix/Raw/{replicate}/QC.log"
+    log: "logs/{replicate}_hicBuildMatrix.log"
+    params:
+        genome="ref/" + GENOME + ".fa",
+        base_resolution=BASE_RESOLUTION,
+        hicBuildMatrix=config["hicBuildMatrix"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 6
+    shell:
+        """
+        hicBuildMatrix --samFiles {input.read1_bam} {input.read2_bam} \
+            --genomeAssembly {params.genome} \
+            --outBam {output.bam} \
+            --outFileName {output.cool} \
+            --QCfolder Matrix/Raw/{wildcards.replicate} \
+            --restrictionCutFile {input.rest_site} \
+            --binSize {params.base_resolution} \
+            --threads {threads} {params.hicBuildMatrix} 1>{log} 2>&1
+        """
+
+# Merge the matrices of biological replicates
+rule hicSumMatrices:
+    input:
+        cools=lambda wildcards: expand("Matrix/Raw/{replicate}.cool", replicate=SAMPLES2REPLICATES[wildcards.sample])
+    output:
+        cool="Matrix/Merged/{sample}.cool"
+    log: "logs/{sample}_hicSumMatrices.log"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 3
+    shell:
+        """
+        if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
+            hicSumMatrices --matrices {input.cools} --outFileName {output.cool} 1>{log} 2>&1
+        else
+            cp {input.cools[0]} {output.cool} 1>{log} 2>&1
+        fi
+        """
+
+# Normalize the contact matrices across samples
+rule hicNormalize:
+    input:
+        cools=expand("Matrix/Merged/{sample}.cool", sample=SAMPLES, allow_missing=True)
+    output:
+        cools=expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES, allow_missing=True)
+    params:
+        hicNormalize=config["hicNormalize"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 1
+    shell:
+        """
+        if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
+            hicNormalize --matrices {input.cools} --outFileName {output.cools} {params.hicNormalize}
+        else
+            cp {input.cools[0]} {output.cools[0]}
+        fi
+        """
+
+#********************************************************#
+#               Step 4: format conversion                #
+#********************************************************#
+
+# Convert to mcool format
+rule cooler_zoomify:
+    input:
+        cool="Matrix/Normalized/{sample}.cool"
+    output:
+        mcool="Matrix/Final/{sample}.mcool"
+    log: "logs/{sample}_cooler_zoomify.log"
+    params:
+        resolutions=",".join(map(str, RESOLUTIONS))
+    singularity:
+        HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
+    threads: 4
+    shell:
+        """
+        cooler zoomify {input.cool} -n {threads} --resolutions {params.resolutions} -o Matrix/Final/{wildcards.sample}_nobalanced.mcool 1>{log} 2>&1
+        cooler zoomify {input.cool} -n {threads} --balance --resolutions {params.resolutions} -o {output.mcool} 1>>{log} 2>&1
+        """
+
+# Convert to ginteractions format
+rule convert_to_ginteractions:
+    input:
+        cool="Matrix/Normalized/{sample}.cool"
+    output:
+        ginteractions="Matrix/Final/{sample}.ginteractions.tsv"
+    log: "logs/{sample}_hicConvertFormat_ginteractions.log"
+    params:
+        resolutions=RESOLUTIONS,
+        prefix="Matrix/Final/{sample}.ginteractions"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 1
+    shell:
+        """
+        hicConvertFormat --matrices {input.cool} --outFileName {params.prefix} --inputFormat cool --outputFormat ginteractions 1>{log} 2>&1
+        """
+
+# Convert to hic format
+rule convert_to_hic:
+    input:
+        ginteractions="Matrix/Final/{sample}.ginteractions.tsv",
+        chrom_sizes = "ref/" + GENOME + ".chrom.sizes"
+    output:
+        hic="Matrix/Final/{sample}.hic"
+    log: "logs/{sample}_convert_to_hic.log"
+    params:
+        resolutions=",".join(str(r) for r in RESOLUTIONS)
+    singularity:
+        HICSNAKE_HOME + "/sifs/juicer_20240401"
+    threads: 1
+    shell:
+        """
+        awk -F "\\t" '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {input.ginteractions} > Matrix/Final/{wildcards.sample}.short
+        sort -k2,2d -k6,6d Matrix/Final/{wildcards.sample}.short > Matrix/Final/{wildcards.sample}.short.sorted
+        java -Xms6g -Xmx40g -jar /opt/juicer_tools.jar pre -r {params.resolutions} Matrix/Final/{wildcards.sample}.short.sorted {output.hic} {input.chrom_sizes} 1>{log} 2>&1
+        """
+
+#********************************************************#
+#               Step 5: contact matrix QC                #
+#********************************************************#
+
+# Replicate-level QC with hicQC
+rule hicQC:
+    input:
+        logs=expand("Matrix/Raw/{replicate}/QC.log", replicate=REPLICATES)
+    output:
+        html="Matrix/Raw/hicQC.html"
+    params:
+        REPLICATES=REPLICATES
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 20
+    shell:
+        """
+        hicQC --logfiles {input.logs} --labels {params.REPLICATES} --outputFolder Matrix/Raw
+        """
+
+# Compute SCC (stratum-adjusted correlation coefficient) with hicrep
+rule hicrep:
+    input:
+        cools=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
+    output:
+        scc="Matrix/Raw/SCC.csv"
+    params:
+        replicates=REPLICATES,
+        hicrep=config["hicrep"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicrep_20240423.sif"
+    threads: 4
+    shell:
+        """
+        python scripts/scc.py --workers {threads} {params.hicrep} --output {output.scc} --sample_names {params.replicates} --files {input.cools}
+        """
+
+# Estimate the usable resolution of the data
+rule estimate_resolution:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        res="Matrix/Final/{sample}_res.csv"
+    params:
+        hicres=config["hicres"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
+    threads: 4
+    shell:
+        """
+        python scripts/hicres.py {input.mcool} --threads {threads} --output {output.res} {params.hicres}
+        """
+
+# hicPlotDistVsCounts: distance-decay curves, to compare samples/replicates
+rule hicPlotDistVsCounts:
+    input:
+        cool=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
+    output:
+        pdf="Matrix/Raw/DistVsCounts.pdf"
+    params:
+        replicates=REPLICATES,
+        hicPlotDistVsCounts=config["hicPlotDistVsCounts"]
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 2
+    shell:
+        """
+        hicPlotDistVsCounts --matrices {input.cool} --labels {params.replicates} --plotFile {output.pdf} {params.hicPlotDistVsCounts}
+        """ 
+#********************************************************#
+#          Step 6: compartments, TADs and loops          #
+#********************************************************#
+rule calder2:
+    input:
+        mcool="Matrix/Final/{sample}.mcool",
+    output:
+        "Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed"
+    params:
+        calder=config["calder"],
+        resolution="{resolution}"
+    threads: 4
+    log: "logs/{sample}_{resolution}_calder2.log"
+    shell:
+        """
+        Rscript scripts/calder -i {input.mcool} -t mcool -b {params.resolution} -p {threads} -o Compartments/CALDER2/{wildcards.resolution}/{wildcards.sample} {params.calder} 1>{log} 2>&1
+        """
+
+# Detect TADs with HiCExplorer
+rule hicFindTADs:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        boundaries="TADs/HiCExplorer/{resolution}/{sample}_boundaries.bed",
+        domains="TADs/HiCExplorer/{resolution}/{sample}_domains.bed",
+        score="TADs/HiCExplorer/{resolution}/{sample}_score.bedgraph"
+    params:
+        hicFindTADs=config["hicFindTADs"],
+        prefix="TADs/HiCExplorer/{resolution}/{sample}",
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 4
+    shell:
+        """
+        hicFindTADs -p {threads} --matrix {input.mcool}::/resolutions/{params.resolution} --outPrefix {params.prefix} {params.hicFindTADs}
+        """
+
+# Detect differential TADs with HiCExplorer
+rule hicDifferentialTAD:
+    input:
+        treatment_mcool="Matrix/Final/{treatment}.mcool",
+        control_mcool="Matrix/Final/{control}.mcool",
+        treatment_tads="TADs/HiCExplorer/{resolution}/{treatment}_domains.bed"
+    output:
+        accepted_diff_tad="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}_accepted.diff_tad"
+    params:
+        hicDifferentialTAD=config["hicDifferentialTAD"],
+        prefix="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}",
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 4
+    shell:
+        """
+        hicDifferentialTAD --targetMatrix {input.treatment_mcool}::/resolutions/{params.resolution} --controlMatrix {input.control_mcool}::/resolutions/{params.resolution} --tadDomains {input.treatment_tads} --outFileNamePrefix {params.prefix} {params.hicDifferentialTAD} --threads {threads}
+        """
+
+# Detect loops with Mustache
+rule mustache:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        loops="Loops/Mustache/{resolution}/{sample}.tsv"
+    params:
+        mustache=config["mustache"],
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
+    threads: 6
+    shell:
+        """
+        mustache --file {input.mcool} --resolution {params.resolution} --outfile {output.loops} --processes {threads} {params.mustache}
+        """
+
+# Detect differential loops with diff_mustache
+rule diff_mustache:
+    input:
+        treatment_mcool="Matrix/Final/{treatment}.mcool",
+        control_mcool="Matrix/Final/{control}.mcool",
+    output:
+        diffloop1="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop1",
+        diffloop2="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop2"
+    params:
+        diff_mustache=config["diff_mustache"],
+        prefix="Loops/Mustache/{resolution}/{treatment}_vs_{control}",
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
+    threads: 6
+    shell:
+        """
+        python /opt/conda/lib/python3.12/site-packages/mustache/diff_mustache.py -f1 {input.treatment_mcool} -f2 {input.control_mcool} --resolution {params.resolution} --outfile {params.prefix} --processes {threads} {params.diff_mustache}
+        """
+
+#********************************************************#
+#                     Visualization                      #
+#********************************************************#
+rule hicPlotMatrix:
+    input:
+        mcool="Matrix/Final/{sample}.mcool"
+    output:
+        allchr_png="plots/{resolution}/{sample}_plotmatrix_allchr.png",
+        perchr_png="plots/{resolution}/{sample}_plotmatrix_perchr.png"
+    params:
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    shell:
+        """
+        hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.allchr_png} --log1p --colorMap YlOrRd --clearMaskedBins
+        hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.perchr_png} --log1p --colorMap YlOrRd --clearMaskedBins --perChromosome
+        """
+
+# Compare matrices with hicCompareMatrices (log2 ratio)
+rule hicCompareMatrices:
+    input:
+        treatment_mcool="Matrix/Final/{treatment}.mcool",
+        control_mcool="Matrix/Final/{control}.mcool",
+    output:
+        cool="Matrix/Diff/{resolution}/{treatment}_vs_{control}.cool"
+    params:
+        resolution="{resolution}"
+    singularity:
+        HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
+    threads: 6
+    shell:
+        """
+        hicCompareMatrices --matrices {input.treatment_mcool}::/resolutions/{params.resolution} {input.control_mcool}::/resolutions/{params.resolution} --operation log2ratio -o {output.cool}
+        """

+ 3 - 0
chromosomes.txt

@@ -0,0 +1,3 @@
+Chr03
+Chr04
+Chr06

+ 65 - 0
config.yaml

@@ -0,0 +1,65 @@
+#######################
+# global
+#######################
+# Location of the HiCSnake pipeline
+HICSNAKE_HOME: /home/zhxd/workspace/20240527_HiC/HicSnake
+# Genome prefix, i.e. the FASTA is ref/$genome.fa
+genome: genome
+# Restriction enzyme recognition site
+restriction_sequence: GATC
+
+#######################
+# resolutions
+#######################    
+# All resolutions; the smallest is the base resolution, and the others should be integer multiples of it
+resolutions:
+  - 1000
+  - 2000
+  - 5000
+  - 10000
+  - 15000
+  - 20000
+  - 25000
+  - 40000
+  - 100000
+  - 250000
+  - 500000
+  - 1000000
+# Resolutions for CALDER2, 10 kb to 100 kb; multiple values allowed
+Compartment_resolution:
+  - 40000
+# Resolutions for hicFindTADs, 40 kb to 100 kb; multiple values allowed
+TAD_resolution:
+  - 40000
+# Resolutions for Mustache, 1 kb to 20 kb; multiple values allowed
+Loop_resolution:
+  - 10000
+Plot_matrix_resolution:
+  - 40000
+
+#######################
+# parameters
+#######################
+# fastp filtering
+fastp: "--adapter_sequence AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT --qualified_quality_phred 20 --length_required 50"
+# Matrix construction
+hicBuildMatrix: "--minDistance 500 --maxLibraryInsertSize 1500 --inputBufferSize 1000000 --restrictionSequence GATC --danglingSequence GATC"
+# Normalization across samples; usually no need to change
+hicNormalize: "--normalize smallest"
+# See https://github.com/TaoYang-dev/hicrep
+hicrep: "--h 20 --dBPMax 500000"
+# Resolution estimation
+hicres: "--resolutions 1000,2000,5000,10000"
+# Detect subcompartments with CALDER2; -f sets the feature track, --chr_prefix the original chromosome prefix (must be consistent)
+calder: "-g others -f ref/gene_density.bed --chr_prefix Chr"
+# TAD calling with hicFindTADs
+hicFindTADs: " --correctForMultipleTesting fdr --thresholdComparisons 0.01 --delta 0.01"
+hicDifferentialTAD: "--pValue 0.05 --mode all --modeReject one"
+hicDetectLoops: "--maxLoopDistance 2000000 --windowSize 10 --peakWidth 6 --pValuePreselection 0.05 --pValue 0.05"
+#mustache: "-pt 0.05 -st 0.88"
+mustache: ""
+#diff_mustache: "-pt 0.05 -pt2 0.05 -st 0.88"
+diff_mustache: ""
+hicPlotDistVsCounts: ""
+
+stripenn: "" 

+ 1 - 0
contrasts.txt

@@ -0,0 +1 @@
+leaf	bud

+ 0 - 0
raw_data/bud_Rep1_R1.fastq.gz


+ 0 - 0
raw_data/bud_Rep1_R2.fastq.gz


+ 0 - 0
raw_data/bud_Rep2_R1.fastq.gz


+ 0 - 0
raw_data/bud_Rep2_R2.fastq.gz


+ 0 - 0
raw_data/leaf_Rep1_R1.fastq.gz


+ 0 - 0
raw_data/leaf_Rep1_R2.fastq.gz


+ 0 - 0
raw_data/leaf_Rep2_R1.fastq.gz


+ 0 - 0
raw_data/leaf_Rep2_R2.fastq.gz


+ 0 - 0
ref/gene_density.bed


+ 0 - 0
ref/genome.chrom.sizes


+ 0 - 0
ref/genome.fa


+ 6 - 0
run.sh

@@ -0,0 +1,6 @@
+# Generate the workflow DAG
+snakemake --dag | dot -Tpdf > workflow.pdf
+# Dry run: print the commands without executing them
+snakemake --use-singularity --keep-going --cores 40 --rerun-triggers mtime -np
+# Run the pipeline
+snakemake --use-singularity --keep-going --cores 40 --rerun-triggers mtime 
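+# Build a single target (example): name an output file and Snakemake runs only
+# the rules needed to produce it, e.g.
+# snakemake --use-singularity --cores 10 Matrix/Final/leaf.mcool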

+ 5 - 0
samples.txt

@@ -0,0 +1,5 @@
+replicate	sample
+leaf_Rep1	leaf
+leaf_Rep2	leaf
+bud_Rep1	bud
+bud_Rep2	bud

+ 75 - 0
scripts/LongestTranscript.py

@@ -0,0 +1,75 @@
+import argparse
+import pandas as pd
+
+def parse_attributes(attributes_str):
+    """ 解析 GTF 文件中的属性字段 """
+    attributes = {}
+    for attr in attributes_str.split(';'):
+        attr = attr.strip()
+        if attr:
+            parts = attr.split(' ', 1)
+            if len(parts) == 2:
+                key, value = parts
+                attributes[key] = value.strip('"')
+    return attributes
+
+def read_gtf(gtf_file):
+    """ 从 GTF 文件读取数据 """
+    data = []
+    with open(gtf_file, 'r') as file:
+        for line in file:
+            if line.startswith('#'):
+                continue
+            fields = line.strip().split('\t')
+            attributes = parse_attributes(fields[8])
+            data.append({
+                'chrom': fields[0],
+                'source': fields[1],
+                'feature': fields[2],
+                'start': int(fields[3]),
+                'end': int(fields[4]),
+                'score': fields[5],
+                'strand': fields[6],
+                'frame': fields[7],
+                'attributes': fields[8],
+                'transcript_id': attributes.get('transcript_id', ''),
+                'gene_id': attributes.get('gene_id', ''),
+                'line': line.strip()  # keep the original line
+            })
+    return pd.DataFrame(data)
+
+def longest_transcripts(df):
+    """Return the longest transcript of each gene."""
+    transcripts = df[df['feature'] == 'transcript'].copy()
+    # length, not end coordinate, determines the longest transcript
+    transcripts['length'] = transcripts['end'] - transcripts['start']
+    return transcripts.sort_values(['gene_id', 'length'], ascending=[True, False]).drop_duplicates('gene_id')
+
+def to_bed6(df):
+    """Convert to BED6 format (0-based, half-open coordinates)."""
+    df = df.copy()
+    df['start'] = df['start'] - 1  # GTF is 1-based; BED is 0-based
+    df['score'] = "."
+    df['name'] = df['gene_id']
+    return df[['chrom', 'start', 'end', 'name', 'score', 'strand']]
+
+def write_filtered_gtf(df, longest_transcripts, gtf_out_file):
+    """ 将最长转录本的相关行和对应基因的行写入新的 GTF 文件 """
+    longest_transcript_ids = set(longest_transcripts['transcript_id'])
+    with open(gtf_out_file, 'w') as f:
+        for _, row in df.iterrows():
+            if row['feature'] == 'gene' or row['transcript_id'] in longest_transcript_ids:
+                f.write(row['line'] + '\n')
+
+def main():
+    parser = argparse.ArgumentParser(description="从 GTF 文件提取最长转录本并保存为 BED6 和 GTF 格式")
+    parser.add_argument("gtf_file", help="输入的 GTF 文件路径")
+    parser.add_argument("bed_file", help="输出的 BED6 格式文件路径")
+    parser.add_argument("gtf_out_file", help="输出的 GTF 格式文件路径")
+    args = parser.parse_args()
+
+    df = read_gtf(args.gtf_file)
+    longest_df = longest_transcripts(df)
+    bed6_df = to_bed6(longest_df)
+    bed6_df.to_csv(args.bed_file, sep='\t', index=False, header=False)
+
+    write_filtered_gtf(df, longest_df, args.gtf_out_file)
+
+if __name__ == "__main__":
+    main()
+
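+# Example usage (hypothetical file names):
+#   python LongestTranscript.py annotation.gtf longest_transcripts.bed longest_transcripts.gtf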

+ 297 - 0
scripts/calder

@@ -0,0 +1,297 @@
+#!/usr/bin/env Rscript
+###########################################################################
+# 该程序修改自 https://github.com/CSOgroup/CALDER2/blob/main/scripts/calder
+# 1. 源程序无法正确处理染色体名称不以 chr 为前缀的情况
+# 2. 只修改了 cool 格式作为输入文件,hic 格式预计仍然有 bug 未测试
+# 3. 如果染色体前缀不一致,无法处理
+# 4. 增加了对 mcool 格式的兼容
+# 修改人:张旭东 zhangxudong@genek.cn
+###########################################################################
+
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(CALDER))
+
+AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
+INPUT_TYPES <- c("hic", "cool", "mcool")
+CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY")
+
+
+parse_arguments <- function(){
+	# Creating the argument parsing options
+	option_list = list(
+	  make_option(c("-i", "--input"), action="store", default=NA, type='character',
+	              help="Input Hi-C contacts"),
+	  # Added mcool as an input type
+	  make_option(c("-t", "--type"), action="store", default='hic', type='character',
+	              help="The type of input: hic or cool or mcool [default %default]"),
+	  make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
+	              help="Bin size to use for the analysis [default %default]"),
+	  make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
+	              help="Genome assembly to use [default %default]"),
+	  make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
+	              help="Genomic feature track to be used to determine A/B compartment direction
+	              when genome == 'others'. The track should presumably have higher values
+	              in A than in B compartments. [default %default]"),
+	  # Added an option for the original chromosome prefix
+	  make_option(c("--chr_prefix"), action="store", default="chr", type='character',
+	              help="old chr prefix [default %default]"),
+	  make_option(c("-c", "--chromosomes"), action='store', default='all', type='character',
+	  			  help="Chromosomes to analyze, separated by comma. [default %default]"),
+	  make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
+	              help="Number of cores to use [default %default]"),
+	  make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
+	              help="Path to the output folder"),
+	  make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
+	              help="Keep intermediate data after done [default %default]"),
+	  make_option(c("-a", "--adaptive"), action="store_true", default=FALSE, type='logical',
+	              help="Use adaptive resolution choice [default %default]")
+	)
+	parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
+	opt <- parse_args(parser)
+
+	# Checking that the input path was provided and exists
+	if(is.na(opt$input) || !file.exists(opt$input)){
+		print_help(parser)
+		stop(paste0("Input path (", opt$input, ") does not exist"))
+	}
+
+	# Checking if output path is provided
+	if(is.na(opt$outpath)){
+		stop("Output path was not provided")
+	}
+
+	# Check that the input type is one of the possible ones
+	if(!(opt$type %in% INPUT_TYPES)){
+		stop(paste0("Input type ", opt$type, " not available"))
+	}
+
+	# Check if the provided genome is in the list of available reference genomes 
+	# or if a feature track is provided
+	if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
+		# in this case, we just assign it the name 'others'
+		opt$genome = "others"
+	}
+
+	writeLines(c(
+		"*******************************",
+		"*            CALDER           *",
+		"*******************************",
+		paste0("[Parameters] Input: ", opt$input),
+		paste0("[Parameters] Input type: ", opt$type),
+		paste0("[Parameters] Bin size: ", opt$bin_size),
+		paste0("[Parameters] Genome: ", opt$genome),
+		paste0("[Parameters] Feature Track: ", opt$feature_track),
+		paste0("[Parameters] Chr Prefix: ", opt$chr_prefix),
+		paste0("[Parameters] Chromosomes: ", opt$chromosomes),
+		paste0("[Parameters] N. cores: ", opt$nproc),
+		paste0("[Parameters] Output: ", opt$outpath),
+		paste0("[Parameters] Keep Intermediate data: ", opt$keep_intermediate),
+		paste0("[Parameters] Use adaptive resolution: ", opt$adaptive)
+	))
+
+	if(file.exists(opt$feature_track)){
+		opt$feature_track <- read.table(opt$feature_track, header = F)
+	}
+
+	return(opt)
+}
+
+sanitize_chroms <- function(chroms){
+	res <- lapply(chroms, function(x){
+		if(startsWith(x, opt$chr_prefix)){
+			# strip the prefix by its actual length (the original hardcoded 3 characters)
+			return(substring(x, nchar(opt$chr_prefix) + 1))
+		} else{
+			return(x)
+		}
+	})
+	return(res)
+}
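+# Example: with --chr_prefix Chr, "Chr03" is sanitized to "03"; the original
+# prefix is restored in the output files by replace_chr_in_files() at the end.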
+
+handle_input_hic <- function(opt){
+	suppressPackageStartupMessages(library(strawr))
+	chromsizes <- readHicChroms(opt$input)
+	if(opt$chromosomes == "all"){
+		chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"]
+	}
+	else{
+		chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
+		chroms <- chromsizes[chromsizes$name %in% chrom_list, "name"]
+	}
+	chroms <- sanitize_chroms(chroms)
+	CALDER(contact_file_hic = opt$input,
+		   chrs = chroms,
+		   bin_size = opt$bin_size,
+		   genome = opt$genome,
+		   save_dir=opt$outpath,
+		   save_intermediate_data=TRUE,
+		   feature_track=opt$feature_track,
+		   single_binsize_only=!opt$adaptive,
+		   n_cores = opt$nproc,
+		   sub_domains=T)
+}
+
+# Added support for mcool input
+handle_input_mcool <- function(opt){
+	intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+	dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
+
+	opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1])
+
+	system(paste0("cooler dump --table chroms --out ", 
+				  file.path(intermediate_data_dir, "chroms.txt"), 
+				  " --header ", 
+				  opt$input,
+				  "::/resolutions/",
+				  opt$bin_size
+				  ))
+	chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
+	if(opt$chromosomes == "all"){
+		chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
+	}
+	else{
+		chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
+		chroms <- chroms[chroms$name %in% chrom_list, "name"]
+	}
+	
+	dump_paths <- list()
+	for(chrom in chroms){
+		cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
+		chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
+		dump_paths <- c(dump_paths, chrom_dump_path)
+		if(! file.exists(chrom_dump_path)){
+			system(paste0("cooler dump --table pixels --range ", 
+						  chrom, 
+						  " --join --balanced ",
+						  opt$input,
+						  "::/resolutions/",
+						  opt$bin_size,						  
+						  " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
+						  chrom_dump_path))
+		}
+	}
+
+	chroms <- sanitize_chroms(chroms)
+	names(dump_paths) <- chroms
+
+	CALDER(contact_file_dump=dump_paths, 
+		   chrs=chroms, 
+		   bin_size=opt$bin_size,
+		   genome=opt$genome,
+		   save_dir=opt$outpath,
+		   feature_track=opt$feature_track,
+		   single_binsize_only=!opt$adaptive,
+		   save_intermediate_data=TRUE,
+		   n_cores=opt$nproc,
+		   sub_domains=T)
+
+}
+
+handle_input_cool <- function(opt){
+  intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+  dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
+  # Map the original chromosome prefix to "chr" in the feature track
+  opt$feature_track[, 1] = gsub(paste0("^", opt$chr_prefix), "chr", opt$feature_track[, 1])
+  
+  system(paste0("cooler dump --table chroms --out ", 
+                file.path(intermediate_data_dir, "chroms.txt"), 
+                " --header ", 
+                opt$input
+  ))
+  chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
+  if(opt$chromosomes == "all"){
+    chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
+  }
+  else{
+    chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
+    chroms <- chroms[chroms$name %in% chrom_list, "name"]
+  }
+  
+  dump_paths <- list()
+  for(chrom in chroms){
+    cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
+    chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
+    dump_paths <- c(dump_paths, chrom_dump_path)
+    if(! file.exists(chrom_dump_path)){
+      system(paste0("cooler dump --table pixels --range ", 
+                    chrom, 
+                    " --join --balanced ",
+                    opt$input,
+                    " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
+                    chrom_dump_path))
+    }
+  }
+  
+  chroms <- sanitize_chroms(chroms)
+  names(dump_paths) <- chroms
+  
+  CALDER(contact_file_dump=dump_paths, 
+         chrs=chroms, 
+         bin_size=opt$bin_size,
+         genome=opt$genome,
+         save_dir=opt$outpath,
+         feature_track=opt$feature_track,
+         single_binsize_only=!opt$adaptive,
+         save_intermediate_data=TRUE,
+         n_cores=opt$nproc,
+         sub_domains=T)
+  
+}
+
+opt <- parse_arguments()
+
+if(opt$type == "hic"){
+	handle_input_hic(opt)
+} else if(opt$type == "mcool"){
+	handle_input_mcool(opt)
+} else if(opt$type == "cool"){
+  handle_input_cool(opt)
+} else {
+	stop("Unknown input type")
+}
+
+# Cleaning the output
+intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
+if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
+	cat('[Post-processing] Removing intermediate data\n')
+	unlink(intermediate_data_dir, recursive=TRUE)
+}
+exec_time_file = "./total_execution.time"
+if(file.exists(exec_time_file)){
+	cat("[Post-processing] Removing total_execution.time\n")
+	file.remove(exec_time_file)
+}
+
+# Replace "chr" with the original chromosome prefix in the output files
+replace_chr_in_files <- function(dir_path, old_prefix, new_prefix) {
+  # Common text file extensions
+  text_file_extensions <- c("txt", "tsv", "csv", "bed")
+  
+  # All files and subdirectories in the output folder
+  files <- list.files(dir_path, recursive = TRUE, full.names = TRUE)
+  
+  for (file in files) {
+    # Get the file extension
+    file_ext <- tools::file_ext(file)
+    
+    # Only process regular text files
+    if (file.info(file)$isdir == FALSE && file_ext %in% text_file_extensions) {
+      tryCatch({
+        # Read the file
+        content <- readLines(file)
+        
+        # Replace the old prefix at the start of each line with the new one
+        content <- gsub(paste0("^", old_prefix), new_prefix, content)
+        
+        # Write back
+        writeLines(content, file)
+        
+        cat("Processed:", file, "\n")
+      }, error = function(e) {
+        cat("Failed to process:", file, "\n", conditionMessage(e), "\n")
+      })
+    }
+  }
+}
+
+# Restore the original prefix. Pass "chr" without the "^" anchor: the function
+# prepends "^" itself, so passing "^chr" would build the pattern "^^chr" and never match.
+replace_chr_in_files(opt$outpath, "chr", opt$chr_prefix)

+ 73 - 0
scripts/combine_fastp_report.R

@@ -0,0 +1,73 @@
+suppressPackageStartupMessages({
+  library(jsonlite)
+  library(dplyr)
+  library(tidyr)
+  library(purrr)
+  library(argparse)
+})
+
+# Build the command-line argument parser
+parser <- ArgumentParser()
+parser$add_argument("--input", type="character", nargs='+', help="input fastp JSON reports")
+parser$add_argument("--output", type="character", default="report", help="output folder to store fastp summary TSV and RData files")
+
+# Parse the command-line arguments
+args <- parser$parse_args()
+
+# Input files and output folder
+json_files <- args$input
+output_folder <- args$output
+
+# Make sure input files were provided
+if (length(json_files) == 0) {
+  stop("No input files provided.")
+}
+
+# Create the output folder
+dir.create(output_folder, showWarnings = FALSE, recursive = TRUE)
+
+# Output file path
+output_file <- file.path(output_folder, "fastp_summary.tsv")
+
+# Fields to extract
+fields <- c(
+  "total_reads",
+  "total_bases",
+  "q30_rate",
+  "gc_content"
+)
+
+# Initialize an empty data.frame to collect the results
+fastp_summary <- data.frame()
+
+# Iterate over all JSON files
+for (json_file in json_files) {
+  # Read the JSON file
+  data <- fromJSON(json_file)
+  
+  # Derive the sample name from the file name
+  sample_name <- gsub("_fastp.json", "", basename(json_file))
+  
+  # Extract the desired fields before and after filtering
+  before_filtering <- data$summary$before_filtering[fields]
+  after_filtering <- data$summary$after_filtering[fields]
+  
+  # Append the result to the data.frame
+  result <- data.frame(sample = sample_name)
+  result <- cbind(result, before_filtering, after_filtering)
+  fastp_summary <- rbind(fastp_summary, result)
+}
+
+# Rename the columns
+colnames(fastp_summary) <- c("sample",
+                             paste0("before_", fields),
+                             paste0("after_", fields))
+
+# Sort by sample name
+fastp_summary <- fastp_summary %>% arrange(sample)
+
+# Write the results to the TSV file
+write.table(fastp_summary, output_file, sep = "\t", row.names = FALSE, quote = FALSE)
+
+# Save fastp_summary to an RData file
+save(fastp_summary, file = file.path(output_folder, "fastp_summary.RData"))
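+
+# Example usage (as invoked by the Snakefile's combine_fastp_reports rule):
+#   Rscript combine_fastp_report.R --input clean_data/*_fastp.json --output clean_data/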

BIN
scripts/faCount


+ 126 - 0
scripts/hicres.py

@@ -0,0 +1,126 @@
+import cooler
+import pandas as pd
+import numpy as np
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def calculate_valid_bins_percentage(file_path, threshold=1000, num_threads=1, chunk_size=100000, resolutions=None):
+    if file_path.endswith('.mcool'):
+        return calculate_mcool_valid_bins_percentage(file_path, threshold, num_threads, chunk_size, resolutions)
+    else:
+        return calculate_cool_valid_bins_percentage(file_path, threshold, chunk_size)
+
+def calculate_cool_valid_bins_percentage(file_path, threshold=1000, chunk_size=100000):
+    clr = cooler.Cooler(file_path)
+    bins = clr.bins()[:]
+    total_valid_bins = 0
+    total_bins = 0
+
+    for start in range(0, len(bins), chunk_size):
+        end = min(start + chunk_size, len(bins))
+        matrix = clr.matrix(balance=False)[start:end, start:end]
+
+        bin_sums = np.sum(matrix, axis=0) + np.sum(matrix, axis=1) - np.diag(matrix)
+        valid_bins = np.sum(bin_sums > threshold)
+        total_valid_bins += valid_bins
+        total_bins += bin_sums.size
+
+    valid_bin_percentage = (total_valid_bins / total_bins) * 100 if total_bins > 0 else 0
+
+    return {
+        'File': file_path,
+        'Valid Bin Percentage': valid_bin_percentage,
+        'Total Valid Bins': total_valid_bins,
+        'Total Bins': total_bins
+    }
+
+def calculate_mcool_valid_bins_percentage(file_path, threshold=1000, num_threads=1, chunk_size=100000, resolutions=None):
+    available_resolutions = cooler.fileops.list_coolers(file_path)
+
+    print(f"Available resolutions in {file_path}: {available_resolutions}")
+    if resolutions:
+        resolutions = [f'/resolutions/{res}' for res in resolutions.split(',') if f'/resolutions/{res}' in available_resolutions]
+        print(resolutions)
+        if not resolutions:
+            raise ValueError(f"No matching resolutions found in the .mcool file for specified resolutions.")
+    else:
+        resolutions = available_resolutions
+
+    results = []
+
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        future_to_resolution = {
+            executor.submit(process_resolution, file_path, resolution_path, threshold, chunk_size): resolution_path
+            for resolution_path in resolutions
+        }
+
+        for future in as_completed(future_to_resolution):
+            resolution_path = future_to_resolution[future]
+            try:
+                result = future.result()
+                if result is not None:
+                    results.append(result)
+            except Exception as e:
+                print(f"Error processing resolution {resolution_path}: {e}")
+
+    return {
+        'File': file_path,
+        'Results': results
+    }
+
+def process_resolution(file_path, resolution_path, threshold, chunk_size):
+    clr_res = cooler.Cooler(f"{file_path}::{resolution_path}")
+    bins = clr_res.bins()[:]
+
+    print(f"Processing resolution: {resolution_path}")
+
+    total_valid_bins = 0
+    total_bins = 0
+
+    for start in range(0, len(bins), chunk_size):
+        end = min(start + chunk_size, len(bins))
+        matrix = clr_res.matrix(balance=False)[start:end, start:end]
+
+        bin_sums = np.sum(matrix, axis=0) + np.sum(matrix, axis=1) - np.diag(matrix)
+        valid_bins = np.sum(bin_sums > threshold)
+        total_valid_bins += valid_bins
+        total_bins += bin_sums.size
+
+    valid_bin_percentage = (total_valid_bins / total_bins) * 100 if total_bins > 0 else 0
+
+    return {
+        'Resolution': resolution_path.split('/')[-1],
+        'Valid Bin Percentage': valid_bin_percentage,
+        'Total Valid Bins': total_valid_bins,
+        'Total Bins': total_bins
+    }
+
+def save_results_to_csv(results, output_path):
+    if 'Results' in results:
+        df = pd.DataFrame(results['Results'])
+        df.insert(0, 'File', results['File'])
+    else:
+        df = pd.DataFrame([results])
+
+    df.to_csv(output_path, index=False)
+    print(f"Results saved to {output_path}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Calculate valid bins percentage from .cool or .mcool files.")
+    parser.add_argument("file_path", type=str, help="Path to the .cool or .mcool file")
+    parser.add_argument("--output", type=str, required=True, help="Path to save the results as CSV file")
+    parser.add_argument("--threads", type=int, default=1, help="Number of threads to use for processing")
+    parser.add_argument("--chunk_size", type=int, default=100000, help="Chunk size for processing large matrices")
+    parser.add_argument("--resolutions", type=str, help="Comma-separated list of resolutions to process in the .mcool file")
+
+    args = parser.parse_args()
+
+    try:
+        results = calculate_valid_bins_percentage(args.file_path, num_threads=args.threads, chunk_size=args.chunk_size, resolutions=args.resolutions)
+        save_results_to_csv(results, args.output)
+    except Exception as e:
+        print(f"Error processing file {args.file_path}: {e}")
+
+if __name__ == "__main__":
+    main()
+
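+# Example usage (as invoked by the Snakefile's estimate_resolution rule):
+#   python hicres.py Matrix/Final/leaf.mcool --threads 4 --output leaf_res.csv \
+#       --resolutions 1000,2000,5000,10000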

+ 69 - 0
scripts/scc.py

@@ -0,0 +1,69 @@
+import argparse
+import cooler
+import logging
+import pandas as pd
+from hicrep import hicrepSCC
+from itertools import combinations
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def calculate_pair_scc(pair, h, dBPMax, bDownSample):
+    (file1, name1), (file2, name2) = pair  # unpack the (file, sample name) tuples
+    logging.info(f"Starting SCC calculation between {name1} and {name2}")
+
+    cool1 = cooler.Cooler(file1)
+    cool2 = cooler.Cooler(file2)
+    binSize1 = cool1.info['bin-size']
+    binSize2 = cool2.info['bin-size']
+    chromosomes = cool1.chromnames
+
+    if binSize1 != binSize2:
+        logging.error("Bin sizes of the two cool files must be equal.")
+        raise ValueError("Bin sizes of the two cool files must be equal.")
+
+    start_time = time.time()
+    scc_scores = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
+    elapsed_time = time.time() - start_time
+    logging.info(f"Finished SCC calculation between {name1} and {name2} in {elapsed_time:.2f} seconds")
+
+    results = [{'SampleA': name1, 'SampleB': name2, 'Chromosome': chrom, 'SCC': scc}
+               for chrom, scc in zip(chromosomes, scc_scores)]
+    return results
+
+def compute_scc(files, sample_names, h, dBPMax, bDownSample, num_workers=4):
+    pairs = list(combinations(zip(files, sample_names), 2))
+    results = []
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [executor.submit(calculate_pair_scc, pair, h, dBPMax, bDownSample) for pair in pairs]
+        for future in futures:
+            results.extend(future.result())
+    return results
+
+def main():
+    parser = argparse.ArgumentParser(description="Compute SCC for pairs of .cool files.")
+    parser.add_argument('--files', nargs='+', required=True, help="List of .cool files to process.")
+    parser.add_argument('--sample_names', nargs='+', required=True, help="List of sample names corresponding to the .cool files.")
+    parser.add_argument('--output', type=str, required=True, help="Output file path to store the results.")
+    parser.add_argument('--h', type=int, default=1, help="Smoothing window radius.")
+    parser.add_argument('--dBPMax', type=int, default=500000, help="Maximum genomic distance.")
+    parser.add_argument('--bDownSample', action='store_true', help="Whether to perform downsampling.")
+    parser.add_argument('--workers', type=int, default=4, help="Number of workers for parallel computation.")
+
+    args = parser.parse_args()
+
+    if len(args.files) != len(args.sample_names):
+        logging.error("The number of files must match the number of sample names provided.")
+        return
+
+    logging.info("Starting SCC computations for all combinations of provided .cool files.")
+    scc_results = compute_scc(args.files, args.sample_names, args.h, args.dBPMax, args.bDownSample, args.workers)
+    
+    df = pd.DataFrame(scc_results)
+    df.to_csv(args.output, index=False)
+    logging.info(f"Results have been saved to {args.output}")
+
+if __name__ == "__main__":
+    main()
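+
+# Example usage (as invoked by the Snakefile's hicrep rule; file names illustrative):
+#   python scc.py --workers 4 --h 20 --dBPMax 500000 --output SCC.csv \
+#       --sample_names leaf_Rep1 leaf_Rep2 --files Matrix/Raw/leaf_Rep1.cool Matrix/Raw/leaf_Rep2.cool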

BIN
workflow.pdf