123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487 |
- import os
- import yaml
- import pandas as pd
- configfile: "config.yaml"
- ##########################################################
- # 全局变量和样本信息
- ##########################################################
- HICSNAKE_HOME = config["HICSNAKE_HOME"] if config["HICSNAKE_HOME"] else os.getcwd()
- GENOME = config["genome"]
- RESOLUTIONS = config["resolutions"]
- BASE_RESOLUTION = min(RESOLUTIONS)
- with open('chromosomes.txt', 'r') as file:
- CHROMOSOMES = [line.strip() for line in file]
- meta_data = pd.read_csv('samples.txt', sep='\t')
- REPLICATES = sorted(meta_data['replicate'].tolist())
- SAMPLES2REPLICATES = meta_data.groupby('sample')['replicate'].apply(list).to_dict()
- SAMPLES = SAMPLES2REPLICATES.keys()
- contrasts_data = pd.read_csv('contrasts.txt', sep='\t', header=None)
- CONTRASTS = [f"{row[0]}_vs_{row[1]}" for row in contrasts_data.itertuples(index=False)]
- ##########################################################
- # rule all: 最终想要生成的文件
- ##########################################################
- rule all:
- input:
- #第一步: 测序数据的过滤和质控
- #"clean_data/fastp_summary.tsv",
- #第二步:比对到参考基因组
- #expand("aligned/{replicate}_R{i}.bam", replicate=REPLICATES, i=[1,2]),
- #第三步: 构建接触矩阵
- #expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES),
- #expand("Matrix/Merged/{sample}.cool", sample=SAMPLES),
- #expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES),
- #第四步: 格式转换
- expand("Matrix/Final/{sample}.{format}", sample=SAMPLES, format=["mcool"]),
- #第五步: 接触矩阵质控
- "Matrix/Raw/hicQC.html",
- "Matrix/Raw/SCC.csv",
- expand("Matrix/Final/{sample}_res.csv", sample=SAMPLES),
- #第六步: AB compartment, TADs, Loops
- expand("Matrix/Diff/{resolution}/{contrast}.cool", resolution=config["TAD_resolution"], contrast=CONTRASTS),
- expand("Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed", resolution=config["Compartment_resolution"], sample=SAMPLES),
- expand("TADs/HiCExplorer/{resolution}/{sample}_domains.bed", resolution=config["TAD_resolution"], sample=SAMPLES),
- expand("TADs/HiCExplorer/{resolution}/{contrast}_accepted.diff_tad", resolution=config["TAD_resolution"],contrast=CONTRASTS),
- expand("Loops/Mustache/{resolution}/{sample}.tsv", resolution=config["Loop_resolution"], sample=SAMPLES),
- expand("Loops/Mustache/{resolution}/{contrast}.diffloop1", resolution=config["Loop_resolution"], contrast=CONTRASTS),
- # 可视化
- expand("plots/{resolution}/{sample}_plotmatrix_allchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES),
- expand("plots/{resolution}/{sample}_plotmatrix_perchr.png", resolution=config["Plot_matrix_resolution"], sample=SAMPLES)
-
- #********************************************************#
- # 第一步: 测序数据的过滤和质控 #
- #********************************************************#
- # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
- rule fastp_quality_control:
- input:
- read1="raw_data/{replicate}_R1.fastq.gz",
- read2="raw_data/{replicate}_R2.fastq.gz"
- output:
- read1="clean_data/{replicate}_R1.fastq.gz",
- read2="clean_data/{replicate}_R2.fastq.gz",
- html_report="clean_data/{replicate}_fastp.html",
- json_report="clean_data/{replicate}_fastp.json"
- log: "logs/{replicate}_fastp_quality_control.log"
- threads: 3
- singularity:
- HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
- params:
- fastp=config["fastp"]
- shell:
- """
- fastp -i {input.read1} -o {output.read1} -I {input.read2} -O {output.read2} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
- """
- # 合并fastp质控结果生成表格
- rule combine_fastp_reports:
- input:
- expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES)
- output:
- "clean_data/fastp_summary.tsv"
- log: "logs/combine_fastp_reports.log"
- params:
- hicsnake_home=HICSNAKE_HOME
- shell:
- """
- Rscript {params.hicsnake_home}/scripts/combine_fastp_report.R --input {input} --output clean_data/ 1>{log} 2>&1
- """
- #********************************************************#
- # 第二步:比对到参考基因组 #
- #********************************************************#
- # 根据基因组构建 bwa index
- rule bwa_index:
- input:
- genome = "ref/" + GENOME + ".fa"
- output:
- genome_index = "ref/genome.bwt"
- log: "logs/bwa_index.log"
- singularity:
- HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
- threads: 6
- shell:
- "bwa index -p ref/genome {input.genome} 1>{log} 2>&1"
- # 使用 bwa 比对
- rule bwa:
- input:
- genome_index="ref/genome.bwt",
- read="clean_data/{replicate}_R{i}.fastq.gz",
- output:
- sam="aligned/{replicate}_R{i}.sam"
- threads: 20
- log: "logs/{replicate}_R{i}_bwa.log"
- singularity:
- HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
- shell:
- """
- bwa mem -A1 -B4 -E50 -L0 -t {threads} -o {output.sam} ref/genome {input.read} 2>{log}
- """
- rule sam2bam:
- input:
- sam="aligned/{replicate}_R{i}.sam"
- output:
- bam="aligned/{replicate}_R{i}.bam"
- threads: 6
- log: "logs/{replicate}_R{i}_sam2bam.log"
- singularity:
- HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
- shell:
- """
- samtools view -hb --threads {threads} -o {output.bam} {input.sam}
- """
- #********************************************************#
- # 第三步: 构建接触矩阵 #
- #********************************************************#
- # 生成酶切位点
- rule hicFindRestSite:
- input:
- genome="ref/" + GENOME + ".fa"
- output:
- rest_site="ref/rest_site.bed"
- log: "logs/hicFindRestSite.log"
- params:
- restriction_sequence=config["restriction_sequence"]
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- shell:
- """
- hicFindRestSite --fasta {input.genome} --searchPattern {params.restriction_sequence} -o {output.rest_site} 1>{log} 2>&1
- """
- # 生成 cool 格式矩阵 hicBuildMatrix
- rule hicBuildMatrix:
- input:
- read1_bam="aligned/{replicate}_R1.bam",
- read2_bam="aligned/{replicate}_R2.bam",
- rest_site="ref/rest_site.bed"
- output:
- bam="Matrix/Raw/{replicate}.bam",
- cool="Matrix/Raw/{replicate}.cool",
- log="Matrix/Raw/{replicate}/QC.log"
- log: "logs/{replicate}_hicBuildMatrix.log"
- params:
- genome="ref/" + GENOME + ".fa",
- base_resolution=BASE_RESOLUTION,
- hicBuildMatrix=config["hicBuildMatrix"]
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 6
- shell:
- """
- hicBuildMatrix --samFiles {input.read1_bam} {input.read2_bam} \
- --genomeAssembly {params.genome} \
- --outBam {output.bam} \
- --outFileName {output.cool} \
- --QCfolder Matrix/Raw/{wildcards.replicate} \
- --restrictionCutFile {input.rest_site} \
- --binSize {params.base_resolution} \
- --threads {threads} {params.hicBuildMatrix} 1>{log} 2>&1
- """
- # 合并生物学重复的矩阵
- rule hicSumMatrices:
- input:
- cools=lambda wildcards: expand("Matrix/Raw/{replicate}.cool", replicate=SAMPLES2REPLICATES[wildcards.sample])
- output:
- cool="Matrix/Merged/{sample}.cool"
- log: "logs/{sample}_hicSumMatrices.log"
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 3
- shell:
- """
- if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
- hicSumMatrices --matrices {input.cools} --outFileName {output.cool} 1>{log} 2>&1
- else
- cp {input.cools[0]} {output.cool} 1>{log} 2>&1
- fi
- """
- # 归一化接触矩阵
- rule hicNormalize:
- input:
- cools=expand("Matrix/Merged/{sample}.cool", sample=SAMPLES, allow_missing=True)
- output:
- cools=expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES, allow_missing=True)
- params:
- hicNormalize=config["hicNormalize"]
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 1
- shell:
- """
- if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
- hicNormalize --matrices {input.cools} --outFileName {output.cools} {params.hicNormalize}
- else
- cp {input.cools[0]} {output.cools[0]}
- fi
- """
- #********************************************************#
- # 第四步: 格式转换 #
- #********************************************************#
- # 转换为 mcool 格式
- rule cooler_zoomify:
- input:
- cool="Matrix/Normalized/{sample}.cool"
- output:
- mcool="Matrix/Final/{sample}.mcool"
- log: "logs/{sample}_cooler_zoomify.log"
- params:
- resolutions=",".join(map(str, RESOLUTIONS))
- singularity:
- HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
- threads: 4
- shell:
- """
- cooler zoomify {input.cool} -n {threads} --resolutions {params.resolutions} -o Matrix/Final/{wildcards.sample}_nobalanced.mcool
- cooler zoomify {input.cool} -n {threads} --balance --resolutions {params.resolutions} -o {output.mcool}
- """
- # 转换为 ginteractions 格式
- rule convert_to_ginteractions:
- input:
- cool="Matrix/Normalized/{sample}.cool"
- output:
- ginteractions="Matrix/Final/{sample}.ginteractions.tsv"
- log: "logs/{sample}_hicConvertFormat_ginteractions.log"
- params:
- resolutions=RESOLUTIONS,
- prefix="Matrix/Final/{sample}.ginteractions"
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 1
- shell:
- """
- hicConvertFormat --matrices {input.cool} --outFileName {params.prefix} --inputFormat cool --outputFormat ginteractions 1>{log} 2>&1
- """
- # 转换为 hic 格式
- rule convert_to_hic:
- input:
- ginteractions="Matrix/Final/{sample}.ginteractions.tsv",
- chrom_sizes = "ref/" + GENOME + ".chrom.sizes"
- output:
- hic="Matrix/Final/{sample}.hic"
- log: "logs/{sample}_convert_to_hic.log"
- params:
- resolutions=",".join(str(r) for r in RESOLUTIONS)
- singularity:
- HICSNAKE_HOME + "/sifs/juicer_20240401"
- threads: 1
- shell:
- """
- awk -F "\\t" '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {input.ginteractions} > Matrix/Final/{wildcards.sample}.short
- sort -k2,2d -k6,6d Matrix/Final/{wildcards.sample}.short > Matrix/Final/{wildcards.sample}.short.sorted
- java -Xms6g -Xmx40g -jar /opt/juicer_tools.jar pre -r {params.resolutions} Matrix/Final/{wildcards.sample}.short.sorted {output.hic} {input.chrom_sizes}
- """
- #********************************************************#
- # 第五步: 接触矩阵质控 #
- #********************************************************#
- # 使用 hicQC 进行REPLICATE 层次质控
- rule hicQC:
- input:
- logs=expand("Matrix/Raw/{replicate}/QC.log", replicate=REPLICATES)
- output:
- html="Matrix/Raw/hicQC.html"
- params:
- REPLICATES=REPLICATES
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 20
- shell:
- """
- hicQC --logfiles {input.logs} --labels {params.REPLICATES} --outputFolder Matrix/Raw
- """
- # 计算 SCC
- rule hicrep:
- input:
- cools=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
- output:
- scc="Matrix/Raw/SCC.csv"
- params:
- replicates=REPLICATES,
- hicrep=config["hicrep"]
- singularity:
- HICSNAKE_HOME + "/sifs/hicrep_20240423.sif"
- threads: 4
- shell:
- """
- python scripts/scc.py --workers {threads} {params.hicrep} --output {output.scc} --sample_names {params.replicates} --files {input.cools}
- """
- # 评估数据分辨率
- rule estimate_resolution:
- input:
- mcool="Matrix/Final/{sample}.mcool"
- output:
- res="Matrix/Final/{sample}_res.csv"
- params:
- hicres=config["hicres"]
- singularity:
- HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
- threads: 4
- shell:
- """
- python scripts/hicres.py {input.mcool} --threads {threads} --output {output.res} {params.hicres}
- """
- # hicPlotDistVsCounts: 距离衰减曲线, 查看样本/重复差异
- rule hicPlotDistVsCounts:
- input:
- cool=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
- output:
- pdf="Matrix/Raw/DistVsCounts.pdf"
- params:
- replicates=REPLICATES,
- hicPlotDistVsCounts=config["hicPlotDistVsCounts"]
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 2
- shell:
- """
- hicPlotDistVsCounts --matrices {input.cool} --labels {params.replicates} --plotFile {output.pdf} {params.hicPlotDistVsCounts}
- """
- #********************************************************#
- # 第七步: 三维结构预测 #
- #********************************************************#
- rule calder2:
- input:
- mcool="Matrix/Final/{sample}.mcool",
- output:
- "Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed"
- params:
- calder=config["calder"],
- resolution="{resolution}"
- threads: 4
- log: "logs/{sample}_{resolution}_calder2.log"
- shell:
- """
- Rscript scripts/calder -i {input.mcool} -t mcool -b {params.resolution} -p {threads} -o Compartments/CALDER2/{wildcards.resolution}/{wildcards.sample} {params.calder}
- """
- # 使用 HiCExplorer 检测 TADs
- rule hicFindTADs:
- input:
- mcool="Matrix/Final/{sample}.mcool"
- output:
- boundaries="TADs/HiCExplorer/{resolution}/{sample}_boundaries.bed",
- domains="TADs/HiCExplorer/{resolution}/{sample}_domains.bed",
- score="TADs/HiCExplorer/{resolution}/{sample}_score.bedgraph"
- params:
- hicFindTADs=config["hicFindTADs"],
- prefix="TADs/HiCExplorer/{resolution}/{sample}",
- resolution="{resolution}"
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 4
- shell:
- """
- hicFindTADs -p {threads} --matrix {input.mcool}::/resolutions/{params.resolution} --outPrefix {params.prefix} {params.hicFindTADs}
- """
- # 使用 HiCExplorer 检测差异 TADs
- rule hicDifferentialTAD:
- input:
- treatment_mcool="Matrix/Final/{treatment}.mcool",
- control_mcool="Matrix/Final/{control}.mcool",
- treatment_tads="TADs/HiCExplorer/{resolution}/{treatment}_domains.bed"
- output:
- accepted_diff_tad="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}_accepted.diff_tad"
- params:
- hicDifferentialTAD=config["hicDifferentialTAD"],
- prefix="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}",
- resolution="{resolution}"
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 4
- shell:
- """
- hicDifferentialTAD --targetMatrix {input.treatment_mcool}::/resolutions/{params.resolution} --controlMatrix {input.control_mcool}::/resolutions/{params.resolution} --tadDomains {input.treatment_tads} --outFileNamePrefix {params.prefix} {params.hicDifferentialTAD} --threads {threads}
- """
- # 使用 mustache 检测 Loops
- rule mustache:
- input:
- mcool="Matrix/Final/{sample}.mcool"
- output:
- loops="Loops/Mustache/{resolution}/{sample}.tsv"
- params:
- mustache=config["mustache"],
- resolution="{resolution}"
- singularity:
- HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
- threads: 6
- shell:
- """
- mustache --file {input.mcool} --resolution {params.resolution} --outfile {output.loops} --processes {threads} {params.mustache}
- """
- # 使用 mustache 检测差异 Loops
- rule diff_mustache:
- input:
- treatment_mcool="Matrix/Final/{treatment}.mcool",
- control_mcool="Matrix/Final/{control}.mcool",
- output:
- diffloop1="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop1",
- diffloop2="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop2"
- params:
- diff_mustache=config["diff_mustache"],
- prefix="Loops/Mustache/{resolution}/{treatment}_vs_{control}",
- resolution="{resolution}"
- singularity:
- HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
- threads: 6
- shell:
- """
- python /opt/conda/lib/python3.12/site-packages/mustache/diff_mustache.py -f1 {input.treatment_mcool} -f2 {input.control_mcool} --resolution {params.resolution} --outfile {params.prefix} --processes {threads} {params.diff_mustache}
- """
- #********************************************************#
- # 可视化 #
- #********************************************************#
- rule hicPlotMatrix:
- input:
- mcool="Matrix/Final/{sample}.mcool"
- output:
- allchr_png="plots/{resolution}/{sample}_plotmatrix_allchr.png",
- perchr_png="plots/{resolution}/{sample}_plotmatrix_perchr.png"
- params:
- resolution="{resolution}"
- shell:
- """
- hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.allchr_png} --log1p --colorMap YlOrRd --clearMaskedBins --log1p
- hicPlotMatrix -m {input.mcool}::/resolutions/{params.resolution} -o {output.perchr_png} --log1p --colorMap YlOrRd --clearMaskedBins --perChromosome --log1p
- """
- # 使用 hicCompareMatrices 比较矩阵
- rule hicCompareMatrices:
- input:
- treatment_mcool="Matrix/Final/{treatment}.mcool",
- control_mcool="Matrix/Final/{control}.mcool",
- output:
- cool="Matrix/Diff/{resolution}/{treatment}_vs_{control}.cool"
- params:
- resolution="{resolution}"
- singularity:
- HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
- threads: 6
- shell:
- """
- hicCompareMatrices --matrices {input.treatment_mcool}::resolutions/{params.resolution} {input.control_mcool}::resolutions/{params.resolution} --operation log2ratio -o {output.cool}
- """
|