import os import yaml import pandas as pd configfile: "config.yaml" ########################################################## # 全局变量和样本信息 ########################################################## HICSNAKE_HOME = config["HICSNAKE_HOME"] if config["HICSNAKE_HOME"] else os.getcwd() GENOME = config["genome"] RESOLUTIONS = config["resolutions"] BASE_RESOLUTION = min(RESOLUTIONS) #with open('chromosomes.txt', 'r') as file: # CHROMOSOMES = [line.strip() for line in file] meta_data = pd.read_csv('samples.txt', sep='\t') REPLICATES = sorted(meta_data['replicate'].tolist()) SAMPLES2REPLICATES = meta_data.groupby('sample')['replicate'].apply(list).to_dict() SAMPLES = SAMPLES2REPLICATES.keys() contrasts_data = pd.read_csv('contrasts.txt', sep='\t', header=None) CONTRASTS = [f"{row[0]}_vs_{row[1]}" for row in contrasts_data.itertuples(index=False)] ########################################################## # rule all: 最终想要生成的文件 ########################################################## rule all: input: #第一步: 测序数据的过滤和质控 #"clean_data/fastp_summary.tsv", #第二步:比对到参考基因组 #expand("aligned/{replicate}_R{i}.bam", replicate=REPLICATES, i=[1,2]), #第三步: 构建接触矩阵 #expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES), #expand("Matrix/Merged/{sample}.cool", sample=SAMPLES), #expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES), #第四步: 格式转换 expand("Matrix/Final/{sample}.{format}", sample=SAMPLES, format=["mcool"]), #第五步: 接触矩阵质控 #"Matrix/Raw/hicQC.html", #"Matrix/Raw/SCC.csv", expand("Matrix/Final/{sample}_res.csv", sample=SAMPLES), #第六步: AB compartment, TADs, Loops expand("Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed", resolution=config["Compartment_resolution"], sample=SAMPLES), expand("TADs/HiCExplorer/{resolution}/{sample}_domains.bed", resolution=config["TAD_resolution"], sample=SAMPLES), expand("TADs/HiCExplorer/{resolution}/{contrast}_accepted.diff_tad", resolution=config["TAD_resolution"],contrast=CONTRASTS), expand("Loops/Mustache/{resolution}/{sample}.tsv", resolution=config["Loop_resolution"], sample=SAMPLES), expand("Loops/Mustache/{resolution}/{contrast}.diffloop1", resolution=config["Loop_resolution"], contrast=CONTRASTS) #********************************************************# # 第一步: 测序数据的过滤和质控 # #********************************************************# # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE? rule fastp_quality_control: input: read1="raw_data/{replicate}_R1.fastq.gz", read2="raw_data/{replicate}_R2.fastq.gz" output: read1="clean_data/{replicate}_R1.fastq.gz", read2="clean_data/{replicate}_R2.fastq.gz", html_report="clean_data/{replicate}_fastp.html", json_report="clean_data/{replicate}_fastp.json" log: "logs/{replicate}_fastp_quality_control.log" threads: 3 singularity: HICSNAKE_HOME + "/sifs/commonTools_20240327.sif" params: fastp=config["fastp"] shell: """ fastp -i {input.read1} -o {output.read1} -I {input.read2} -O {output.read2} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1 """ # 合并fastp质控结果生成表格 rule combine_fastp_reports: input: expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES) output: "clean_data/fastp_summary.tsv" log: "logs/combine_fastp_reports.log" params: hicsnake_home=HICSNAKE_HOME shell: """ Rscript {params.hicsnake_home}/scripts/combine_fastp_report.R --input {input} --output clean_data/ 1>{log} 2>&1 """ #********************************************************# # 第二步:比对到参考基因组 # #********************************************************# # 根据基因组构建 bwa index rule bwa_index: input: genome = "ref/" + GENOME + ".fa" output: genome_index = "ref/genome.bwt" log: "logs/bwa_index.log" singularity: HICSNAKE_HOME + "/sifs/commonTools_20240327.sif" threads: 6 shell: "bwa index -p ref/genome {input.genome} 1>{log} 2>&1" # 使用 bwa 比对 rule bwa: input: genome_index="ref/genome.bwt", read="clean_data/{replicate}_R{i}.fastq.gz", output: sam="aligned/{replicate}_R{i}.sam" threads: 20 log: "logs/{replicate}_R{i}_bwa.log" singularity: HICSNAKE_HOME + "/sifs/commonTools_20240327.sif" shell: """ bwa mem -A1 -B4 -E50 -L0 -t {threads} -o {output.sam} ref/genome {input.read} 2>{log} """ rule sam2bam: input: sam="aligned/{replicate}_R{i}.sam" output: bam="aligned/{replicate}_R{i}.bam" threads: 6 log: "logs/{replicate}_R{i}_sam2bam.log" singularity: HICSNAKE_HOME + "/sifs/commonTools_20240327.sif" shell: """ samtools view -hb --threads {threads} -o {output.bam} {input.sam} """ #********************************************************# # 第三步: 构建接触矩阵 # #********************************************************# # 生成酶切位点 rule hicFindRestSite: input: genome="ref/" + GENOME + ".fa" output: rest_site="ref/rest_site.bed" log: "logs/hicFindRestSite.log" params: restriction_sequence=config["restriction_sequence"] singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" shell: """ hicFindRestSite --fasta {input.genome} --searchPattern {params.restriction_sequence} -o {output.rest_site} 1>{log} 2>&1 """ # 生成 cool 格式矩阵 hicBuildMatrix rule hicBuildMatrix: input: read1_bam="aligned/{replicate}_R1.bam", read2_bam="aligned/{replicate}_R2.bam", rest_site="ref/rest_site.bed" output: bam="Matrix/Raw/{replicate}.bam", cool="Matrix/Raw/{replicate}.cool", log="Matrix/Raw/{replicate}/QC.log" log: "logs/{replicate}_hicBuildMatrix.log" params: genome="ref/" + GENOME + ".fa", base_resolution=BASE_RESOLUTION, hicBuildMatrix=config["hicBuildMatrix"] singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 6 shell: """ hicBuildMatrix --samFiles {input.read1_bam} {input.read2_bam} \ --genomeAssembly {params.genome} \ --outBam {output.bam} \ --outFileName {output.cool} \ --QCfolder Matrix/Raw/{wildcards.replicate} \ --restrictionCutFile {input.rest_site} \ --binSize {params.base_resolution} \ --threads {threads} {params.hicBuildMatrix} 1>{log} 2>&1 """ # 合并生物学重复的矩阵 rule hicSumMatrices: input: cools=lambda wildcards: expand("Matrix/Raw/{replicate}.cool", replicate=SAMPLES2REPLICATES[wildcards.sample]) output: cool="Matrix/Merged/{sample}.cool" log: "logs/{sample}_hicSumMatrices.log" singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 3 shell: """ if [ $(echo {input.cools} | wc -w) -gt 1 ]; then hicSumMatrices --matrices {input.cools} --outFileName {output.cool} 1>{log} 2>&1 else cp {input.cools[0]} {output.cool} 1>{log} 2>&1 fi """ # 归一化接触矩阵 rule hicNormalize: input: cools=expand("Matrix/Merged/{sample}.cool", sample=SAMPLES, allow_missing=True) output: cools=expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES, allow_missing=True) params: hicNormalize=config["hicNormalize"] singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 1 shell: """ if [ $(echo {input.cools} | wc -w) -gt 1 ]; then hicNormalize --matrices {input.cools} --outFileName {output.cools} {params.hicNormalize} else cp {input.cools[0]} {output.cools[0]} fi """ #********************************************************# # 第四步: 格式转换 # #********************************************************# # 转换为 mcool 格式 rule cooler_zoomify: input: cool="Matrix/Normalized/{sample}.cool" output: mcool="Matrix/Final/{sample}.mcool" log: "logs/{sample}_cooler_zoomify.log" params: resolutions=",".join(map(str, RESOLUTIONS)) singularity: HICSNAKE_HOME + "/sifs/cooler_20240402.sif" threads: 4 shell: """ cooler zoomify {input.cool} -n {threads} --resolutions {params.resolutions} -o Matrix/Final/{wildcards.sample}_nobalanced.mcool cooler zoomify {input.cool} -n {threads} --balance --resolutions {params.resolutions} -o {output.mcool} """ # 转换为 ginteractions 格式 rule convert_to_ginteractions: input: cool="Matrix/Normalized/{sample}.cool" output: ginteractions="Matrix/Final/{sample}.ginteractions.tsv" log: "logs/{sample}_hicConvertFormat_ginteractions.log" params: resolutions=RESOLUTIONS, prefix="Matrix/Final/{sample}.ginteractions" singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 1 shell: """ hicConvertFormat --matrices {input.cool} --outFileName {params.prefix} --inputFormat cool --outputFormat ginteractions 1>{log} 2>&1 """ # 转换为 hic 格式 rule convert_to_hic: input: ginteractions="Matrix/Final/{sample}.ginteractions.tsv", chrom_sizes = "ref/" + GENOME + ".chrom.sizes" output: hic="Matrix/Final/{sample}.hic" log: "logs/{sample}_convert_to_hic.log" params: resolutions=",".join(str(r) for r in RESOLUTIONS) singularity: HICSNAKE_HOME + "/sifs/juicer_20240401" threads: 1 shell: """ awk -F "\\t" '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {input.ginteractions} > Matrix/Final/{wildcards.sample}.short sort -k2,2d -k6,6d Matrix/Final/{wildcards.sample}.short > Matrix/Final/{wildcards.sample}.short.sorted java -Xms6g -Xmx40g -jar /opt/juicer_tools.jar pre -r {params.resolutions} Matrix/Final/{wildcards.sample}.short.sorted {output.hic} {input.chrom_sizes} """ #********************************************************# # 第五步: 接触矩阵质控 # #********************************************************# # 使用 hicQC 进行REPLICATE 层次质控 rule hicQC: input: logs=expand("Matrix/Raw/{replicate}/QC.log", replicate=REPLICATES) output: html="Matrix/Raw/hicQC.html" params: REPLICATES=REPLICATES singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 20 shell: """ hicQC --logfiles {input.logs} --labels {params.REPLICATES} --outputFolder Matrix/Raw """ # 计算 SCC rule hicrep: input: cools=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES) output: scc="Matrix/Raw/SCC.csv" params: replicates=REPLICATES, hicrep=config["hicrep"] singularity: HICSNAKE_HOME + "/sifs/hicrep_20240423.sif" threads: 4 shell: """ python scripts/scc.py --workers {threads} {params.hicrep} --output {output.scc} --sample_names {params.replicates} --files {input.cools} """ # 评估数据分辨率 rule estimate_resolution: input: mcool="Matrix/Final/{sample}.mcool" output: res="Matrix/Final/{sample}_res.csv" params: hicres=config["hicres"] singularity: HICSNAKE_HOME + "/sifs/cooler_20240402.sif" threads: 4 shell: """ python scripts/hicres.py {input.mcool} --threads {threads} --output {output.res} {params.hicres} """ # hicPlotDistVsCounts: 距离衰减曲线, 查看样本/重复差异 rule hicPlotDistVsCounts: input: cool=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES) output: pdf="Matrix/Raw/DistVsCounts.pdf" params: replicates=REPLICATES, hicPlotDistVsCounts=config["hicPlotDistVsCounts"] singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 2 shell: """ hicPlotDistVsCounts --matrices {input.cool} --labels {params.replicates} --plotFile {output.pdf} {params.hicPlotDistVsCounts} """ #********************************************************# # 第七步: 三维结构预测 # #********************************************************# rule calder2: input: mcool="Matrix/Final/{sample}.mcool", output: "Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed" params: calder=config["calder"], resolution="{resolution}" threads: 4 log: "logs/{sample}_{resolution}_calder2.log" shell: """ Rscript scripts/calder -i {input.mcool} -t mcool -b {params.resolution} -p {threads} -o Compartments/CALDER2/{wildcards.resolution}/{wildcards.sample} {params.calder} """ # 使用 HiCExplorer 检测 TADs rule hicFindTADs: input: mcool="Matrix/Final/{sample}.mcool" output: boundaries="TADs/HiCExplorer/{resolution}/{sample}_boundaries.bed", domains="TADs/HiCExplorer/{resolution}/{sample}_domains.bed", score="TADs/HiCExplorer/{resolution}/{sample}_score.bedgraph" params: hicFindTADs=config["hicFindTADs"], prefix="TADs/HiCExplorer/{resolution}/{sample}", resolution="{resolution}" singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 4 shell: """ hicFindTADs -p {threads} --matrix {input.mcool}::/resolutions/{params.resolution} --outPrefix {params.prefix} {params.hicFindTADs} """ # 使用 HiCExplorer 检测差异 TADs rule hicDifferentialTAD: input: treatment_mcool="Matrix/Final/{treatment}.mcool", control_mcool="Matrix/Final/{control}.mcool", treatment_tads="TADs/HiCExplorer/{resolution}/{treatment}_domains.bed" output: accepted_diff_tad="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}_accepted.diff_tad" params: hicDifferentialTAD=config["hicDifferentialTAD"], prefix="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}", resolution="{resolution}" singularity: HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif" threads: 4 shell: """ hicDifferentialTAD --targetMatrix {input.treatment_mcool}::/resolutions/{params.resolution} --controlMatrix {input.control_mcool}::/resolutions/{params.resolution} --tadDomains {input.treatment_tads} --outFileNamePrefix {params.prefix} {params.hicDifferentialTAD} --threads {threads} """ # 使用 mustache 检测 Loops rule mustache: input: mcool="Matrix/Final/{sample}.mcool" output: loops="Loops/Mustache/{resolution}/{sample}.tsv" params: mustache=config["mustache"], resolution="{resolution}" singularity: HICSNAKE_HOME + "/sifs/mustache_20240410.sif" threads: 6 shell: """ mustache --file {input.mcool} --resolution {params.resolution} --outfile {output.loops} --processes {threads} {params.mustache} """ # 使用 mustache 检测差异 Loops rule diff_mustache: input: treatment_mcool="Matrix/Final/{treatment}.mcool", control_mcool="Matrix/Final/{control}.mcool", output: diffloop1="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop1", diffloop2="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop2" params: diff_mustache=config["diff_mustache"], prefix="Loops/Mustache/{resolution}/{treatment}_vs_{control}", resolution="{resolution}" singularity: HICSNAKE_HOME + "/sifs/mustache_20240410.sif" threads: 6 shell: """ python /opt/conda/lib/python3.12/site-packages/mustache/diff_mustache.py -f1 {input.treatment_mcool} -f2 {input.control_mcool} --resolution {params.resolution} --outfile {params.prefix} --processes {threads} {params.diff_mustache} """