Snakefile 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. import os
  2. import yaml
  3. import pandas as pd
  4. configfile: "config.yaml"
  5. ##########################################################
  6. # 全局变量和样本信息
  7. ##########################################################
  8. HICSNAKE_HOME = config["HICSNAKE_HOME"] if config["HICSNAKE_HOME"] else os.getcwd()
  9. GENOME = config["genome"]
  10. RESOLUTIONS = config["resolutions"]
  11. BASE_RESOLUTION = min(RESOLUTIONS)
  12. #with open('chromosomes.txt', 'r') as file:
  13. # CHROMOSOMES = [line.strip() for line in file]
  14. meta_data = pd.read_csv('samples.txt', sep='\t')
  15. REPLICATES = sorted(meta_data['replicate'].tolist())
  16. SAMPLES2REPLICATES = meta_data.groupby('sample')['replicate'].apply(list).to_dict()
  17. SAMPLES = SAMPLES2REPLICATES.keys()
  18. contrasts_data = pd.read_csv('contrasts.txt', sep='\t', header=None)
  19. CONTRASTS = [f"{row[0]}_vs_{row[1]}" for row in contrasts_data.itertuples(index=False)]
  20. ##########################################################
  21. # rule all: 最终想要生成的文件
  22. ##########################################################
  23. rule all:
  24. input:
  25. #第一步: 测序数据的过滤和质控
  26. #"clean_data/fastp_summary.tsv",
  27. #第二步:比对到参考基因组
  28. #expand("aligned/{replicate}_R{i}.bam", replicate=REPLICATES, i=[1,2]),
  29. #第三步: 构建接触矩阵
  30. #expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES),
  31. #expand("Matrix/Merged/{sample}.cool", sample=SAMPLES),
  32. #expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES),
  33. #第四步: 格式转换
  34. expand("Matrix/Final/{sample}.{format}", sample=SAMPLES, format=["mcool"]),
  35. #第五步: 接触矩阵质控
  36. #"Matrix/Raw/hicQC.html",
  37. #"Matrix/Raw/SCC.csv",
  38. expand("Matrix/Final/{sample}_res.csv", sample=SAMPLES),
  39. #第六步: AB compartment, TADs, Loops
  40. expand("Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed", resolution=config["Compartment_resolution"], sample=SAMPLES),
  41. expand("TADs/HiCExplorer/{resolution}/{sample}_domains.bed", resolution=config["TAD_resolution"], sample=SAMPLES),
  42. expand("TADs/HiCExplorer/{resolution}/{contrast}_accepted.diff_tad", resolution=config["TAD_resolution"],contrast=CONTRASTS),
  43. expand("Loops/Mustache/{resolution}/{sample}.tsv", resolution=config["Loop_resolution"], sample=SAMPLES),
  44. expand("Loops/Mustache/{resolution}/{contrast}.diffloop1", resolution=config["Loop_resolution"], contrast=CONTRASTS)
  45. #********************************************************#
  46. # 第一步: 测序数据的过滤和质控 #
  47. #********************************************************#
  48. # 使用fastp进行原始数据质控和过滤,自动判断 SE?PE?
  49. rule fastp_quality_control:
  50. input:
  51. read1="raw_data/{replicate}_R1.fastq.gz",
  52. read2="raw_data/{replicate}_R2.fastq.gz"
  53. output:
  54. read1="clean_data/{replicate}_R1.fastq.gz",
  55. read2="clean_data/{replicate}_R2.fastq.gz",
  56. html_report="clean_data/{replicate}_fastp.html",
  57. json_report="clean_data/{replicate}_fastp.json"
  58. log: "logs/{replicate}_fastp_quality_control.log"
  59. threads: 3
  60. singularity:
  61. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  62. params:
  63. fastp=config["fastp"]
  64. shell:
  65. """
  66. fastp -i {input.read1} -o {output.read1} -I {input.read2} -O {output.read2} --html {output.html_report} --json {output.json_report} --thread {threads} {params.fastp} 1>{log} 2>&1
  67. """
  68. # 合并fastp质控结果生成表格
  69. rule combine_fastp_reports:
  70. input:
  71. expand("clean_data/{replicate}_fastp.json", replicate=REPLICATES)
  72. output:
  73. "clean_data/fastp_summary.tsv"
  74. log: "logs/combine_fastp_reports.log"
  75. params:
  76. hicsnake_home=HICSNAKE_HOME
  77. shell:
  78. """
  79. Rscript {params.hicsnake_home}/scripts/combine_fastp_report.R --input {input} --output clean_data/ 1>{log} 2>&1
  80. """
  81. #********************************************************#
  82. # 第二步:比对到参考基因组 #
  83. #********************************************************#
  84. # 根据基因组构建 bwa index
  85. rule bwa_index:
  86. input:
  87. genome = "ref/" + GENOME + ".fa"
  88. output:
  89. genome_index = "ref/genome.bwt"
  90. log: "logs/bwa_index.log"
  91. singularity:
  92. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  93. threads: 6
  94. shell:
  95. "bwa index -p ref/genome {input.genome} 1>{log} 2>&1"
  96. # 使用 bwa 比对
  97. rule bwa:
  98. input:
  99. genome_index="ref/genome.bwt",
  100. read="clean_data/{replicate}_R{i}.fastq.gz",
  101. output:
  102. sam="aligned/{replicate}_R{i}.sam"
  103. threads: 20
  104. log: "logs/{replicate}_R{i}_bwa.log"
  105. singularity:
  106. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  107. shell:
  108. """
  109. bwa mem -A1 -B4 -E50 -L0 -t {threads} -o {output.sam} ref/genome {input.read} 2>{log}
  110. """
  111. rule sam2bam:
  112. input:
  113. sam="aligned/{replicate}_R{i}.sam"
  114. output:
  115. bam="aligned/{replicate}_R{i}.bam"
  116. threads: 6
  117. log: "logs/{replicate}_R{i}_sam2bam.log"
  118. singularity:
  119. HICSNAKE_HOME + "/sifs/commonTools_20240327.sif"
  120. shell:
  121. """
  122. samtools view -hb --threads {threads} -o {output.bam} {input.sam}
  123. """
  124. #********************************************************#
  125. # 第三步: 构建接触矩阵 #
  126. #********************************************************#
  127. # 生成酶切位点
  128. rule hicFindRestSite:
  129. input:
  130. genome="ref/" + GENOME + ".fa"
  131. output:
  132. rest_site="ref/rest_site.bed"
  133. log: "logs/hicFindRestSite.log"
  134. params:
  135. restriction_sequence=config["restriction_sequence"]
  136. singularity:
  137. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  138. shell:
  139. """
  140. hicFindRestSite --fasta {input.genome} --searchPattern {params.restriction_sequence} -o {output.rest_site} 1>{log} 2>&1
  141. """
  142. # 生成 cool 格式矩阵 hicBuildMatrix
  143. rule hicBuildMatrix:
  144. input:
  145. read1_bam="aligned/{replicate}_R1.bam",
  146. read2_bam="aligned/{replicate}_R2.bam",
  147. rest_site="ref/rest_site.bed"
  148. output:
  149. bam="Matrix/Raw/{replicate}.bam",
  150. cool="Matrix/Raw/{replicate}.cool",
  151. log="Matrix/Raw/{replicate}/QC.log"
  152. log: "logs/{replicate}_hicBuildMatrix.log"
  153. params:
  154. genome="ref/" + GENOME + ".fa",
  155. base_resolution=BASE_RESOLUTION,
  156. hicBuildMatrix=config["hicBuildMatrix"]
  157. singularity:
  158. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  159. threads: 6
  160. shell:
  161. """
  162. hicBuildMatrix --samFiles {input.read1_bam} {input.read2_bam} \
  163. --genomeAssembly {params.genome} \
  164. --outBam {output.bam} \
  165. --outFileName {output.cool} \
  166. --QCfolder Matrix/Raw/{wildcards.replicate} \
  167. --restrictionCutFile {input.rest_site} \
  168. --binSize {params.base_resolution} \
  169. --threads {threads} {params.hicBuildMatrix} 1>{log} 2>&1
  170. """
  171. # 合并生物学重复的矩阵
  172. rule hicSumMatrices:
  173. input:
  174. cools=lambda wildcards: expand("Matrix/Raw/{replicate}.cool", replicate=SAMPLES2REPLICATES[wildcards.sample])
  175. output:
  176. cool="Matrix/Merged/{sample}.cool"
  177. log: "logs/{sample}_hicSumMatrices.log"
  178. singularity:
  179. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  180. threads: 3
  181. shell:
  182. """
  183. if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
  184. hicSumMatrices --matrices {input.cools} --outFileName {output.cool} 1>{log} 2>&1
  185. else
  186. cp {input.cools[0]} {output.cool} 1>{log} 2>&1
  187. fi
  188. """
  189. # 归一化接触矩阵
  190. rule hicNormalize:
  191. input:
  192. cools=expand("Matrix/Merged/{sample}.cool", sample=SAMPLES, allow_missing=True)
  193. output:
  194. cools=expand("Matrix/Normalized/{sample}.cool", sample=SAMPLES, allow_missing=True)
  195. params:
  196. hicNormalize=config["hicNormalize"]
  197. singularity:
  198. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  199. threads: 1
  200. shell:
  201. """
  202. if [ $(echo {input.cools} | wc -w) -gt 1 ]; then
  203. hicNormalize --matrices {input.cools} --outFileName {output.cools} {params.hicNormalize}
  204. else
  205. cp {input.cools[0]} {output.cools[0]}
  206. fi
  207. """
  208. #********************************************************#
  209. # 第四步: 格式转换 #
  210. #********************************************************#
  211. # 转换为 mcool 格式
  212. rule cooler_zoomify:
  213. input:
  214. cool="Matrix/Normalized/{sample}.cool"
  215. output:
  216. mcool="Matrix/Final/{sample}.mcool"
  217. log: "logs/{sample}_cooler_zoomify.log"
  218. params:
  219. resolutions=",".join(map(str, RESOLUTIONS))
  220. singularity:
  221. HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
  222. threads: 4
  223. shell:
  224. """
  225. cooler zoomify {input.cool} -n {threads} --resolutions {params.resolutions} -o Matrix/Final/{wildcards.sample}_nobalanced.mcool
  226. cooler zoomify {input.cool} -n {threads} --balance --resolutions {params.resolutions} -o {output.mcool}
  227. """
  228. # 转换为 ginteractions 格式
  229. rule convert_to_ginteractions:
  230. input:
  231. cool="Matrix/Normalized/{sample}.cool"
  232. output:
  233. ginteractions="Matrix/Final/{sample}.ginteractions.tsv"
  234. log: "logs/{sample}_hicConvertFormat_ginteractions.log"
  235. params:
  236. resolutions=RESOLUTIONS,
  237. prefix="Matrix/Final/{sample}.ginteractions"
  238. singularity:
  239. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  240. threads: 1
  241. shell:
  242. """
  243. hicConvertFormat --matrices {input.cool} --outFileName {params.prefix} --inputFormat cool --outputFormat ginteractions 1>{log} 2>&1
  244. """
  245. # 转换为 hic 格式
  246. rule convert_to_hic:
  247. input:
  248. ginteractions="Matrix/Final/{sample}.ginteractions.tsv",
  249. chrom_sizes = "ref/" + GENOME + ".chrom.sizes"
  250. output:
  251. hic="Matrix/Final/{sample}.hic"
  252. log: "logs/{sample}_convert_to_hic.log"
  253. params:
  254. resolutions=",".join(str(r) for r in RESOLUTIONS)
  255. singularity:
  256. HICSNAKE_HOME + "/sifs/juicer_20240401"
  257. threads: 1
  258. shell:
  259. """
  260. awk -F "\\t" '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {input.ginteractions} > Matrix/Final/{wildcards.sample}.short
  261. sort -k2,2d -k6,6d Matrix/Final/{wildcards.sample}.short > Matrix/Final/{wildcards.sample}.short.sorted
  262. java -Xms6g -Xmx40g -jar /opt/juicer_tools.jar pre -r {params.resolutions} Matrix/Final/{wildcards.sample}.short.sorted {output.hic} {input.chrom_sizes}
  263. """
  264. #********************************************************#
  265. # 第五步: 接触矩阵质控 #
  266. #********************************************************#
  267. # 使用 hicQC 进行REPLICATE 层次质控
  268. rule hicQC:
  269. input:
  270. logs=expand("Matrix/Raw/{replicate}/QC.log", replicate=REPLICATES)
  271. output:
  272. html="Matrix/Raw/hicQC.html"
  273. params:
  274. REPLICATES=REPLICATES
  275. singularity:
  276. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  277. threads: 20
  278. shell:
  279. """
  280. hicQC --logfiles {input.logs} --labels {params.REPLICATES} --outputFolder Matrix/Raw
  281. """
  282. # 计算 SCC
  283. rule hicrep:
  284. input:
  285. cools=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
  286. output:
  287. scc="Matrix/Raw/SCC.csv"
  288. params:
  289. replicates=REPLICATES,
  290. hicrep=config["hicrep"]
  291. singularity:
  292. HICSNAKE_HOME + "/sifs/hicrep_20240423.sif"
  293. threads: 4
  294. shell:
  295. """
  296. python scripts/scc.py --workers {threads} {params.hicrep} --output {output.scc} --sample_names {params.replicates} --files {input.cools}
  297. """
  298. # 评估数据分辨率
  299. rule estimate_resolution:
  300. input:
  301. mcool="Matrix/Final/{sample}.mcool"
  302. output:
  303. res="Matrix/Final/{sample}_res.csv"
  304. params:
  305. hicres=config["hicres"]
  306. singularity:
  307. HICSNAKE_HOME + "/sifs/cooler_20240402.sif"
  308. threads: 4
  309. shell:
  310. """
  311. python scripts/hicres.py {input.mcool} --threads {threads} --output {output.res} {params.hicres}
  312. """
  313. # hicPlotDistVsCounts: 距离衰减曲线, 查看样本/重复差异
  314. rule hicPlotDistVsCounts:
  315. input:
  316. cool=expand("Matrix/Raw/{replicate}.cool", replicate=REPLICATES)
  317. output:
  318. pdf="Matrix/Raw/DistVsCounts.pdf"
  319. params:
  320. replicates=REPLICATES,
  321. hicPlotDistVsCounts=config["hicPlotDistVsCounts"]
  322. singularity:
  323. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  324. threads: 2
  325. shell:
  326. """
  327. hicPlotDistVsCounts --matrices {input.cool} --labels {params.replicates} --plotFile {output.pdf} {params.hicPlotDistVsCounts}
  328. """
  329. #********************************************************#
  330. # 第七步: 三维结构预测 #
  331. #********************************************************#
  332. rule calder2:
  333. input:
  334. mcool="Matrix/Final/{sample}.mcool",
  335. output:
  336. "Compartments/CALDER2/{resolution}/{sample}/sub_compartments/all_sub_compartments.bed"
  337. params:
  338. calder=config["calder"],
  339. resolution="{resolution}"
  340. threads: 4
  341. log: "logs/{sample}_{resolution}_calder2.log"
  342. shell:
  343. """
  344. Rscript scripts/calder -i {input.mcool} -t mcool -b {params.resolution} -p {threads} -o Compartments/CALDER2/{wildcards.resolution}/{wildcards.sample} {params.calder}
  345. """
  346. # 使用 HiCExplorer 检测 TADs
  347. rule hicFindTADs:
  348. input:
  349. mcool="Matrix/Final/{sample}.mcool"
  350. output:
  351. boundaries="TADs/HiCExplorer/{resolution}/{sample}_boundaries.bed",
  352. domains="TADs/HiCExplorer/{resolution}/{sample}_domains.bed",
  353. score="TADs/HiCExplorer/{resolution}/{sample}_score.bedgraph"
  354. params:
  355. hicFindTADs=config["hicFindTADs"],
  356. prefix="TADs/HiCExplorer/{resolution}/{sample}",
  357. resolution="{resolution}"
  358. singularity:
  359. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  360. threads: 4
  361. shell:
  362. """
  363. hicFindTADs -p {threads} --matrix {input.mcool}::/resolutions/{params.resolution} --outPrefix {params.prefix} {params.hicFindTADs}
  364. """
  365. # 使用 HiCExplorer 检测差异 TADs
  366. rule hicDifferentialTAD:
  367. input:
  368. treatment_mcool="Matrix/Final/{treatment}.mcool",
  369. control_mcool="Matrix/Final/{control}.mcool",
  370. treatment_tads="TADs/HiCExplorer/{resolution}/{treatment}_domains.bed"
  371. output:
  372. accepted_diff_tad="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}_accepted.diff_tad"
  373. params:
  374. hicDifferentialTAD=config["hicDifferentialTAD"],
  375. prefix="TADs/HiCExplorer/{resolution}/{treatment}_vs_{control}",
  376. resolution="{resolution}"
  377. singularity:
  378. HICSNAKE_HOME + "/sifs/hicexplorer_20240327.sif"
  379. threads: 4
  380. shell:
  381. """
  382. hicDifferentialTAD --targetMatrix {input.treatment_mcool}::/resolutions/{params.resolution} --controlMatrix {input.control_mcool}::/resolutions/{params.resolution} --tadDomains {input.treatment_tads} --outFileNamePrefix {params.prefix} {params.hicDifferentialTAD} --threads {threads}
  383. """
  384. # 使用 mustache 检测 Loops
  385. rule mustache:
  386. input:
  387. mcool="Matrix/Final/{sample}.mcool"
  388. output:
  389. loops="Loops/Mustache/{resolution}/{sample}.tsv"
  390. params:
  391. mustache=config["mustache"],
  392. resolution="{resolution}"
  393. singularity:
  394. HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
  395. threads: 6
  396. shell:
  397. """
  398. mustache --file {input.mcool} --resolution {params.resolution} --outfile {output.loops} --processes {threads} {params.mustache}
  399. """
  400. # 使用 mustache 检测差异 Loops
  401. rule diff_mustache:
  402. input:
  403. treatment_mcool="Matrix/Final/{treatment}.mcool",
  404. control_mcool="Matrix/Final/{control}.mcool",
  405. output:
  406. diffloop1="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop1",
  407. diffloop2="Loops/Mustache/{resolution}/{treatment}_vs_{control}.diffloop2"
  408. params:
  409. diff_mustache=config["diff_mustache"],
  410. prefix="Loops/Mustache/{resolution}/{treatment}_vs_{control}",
  411. resolution="{resolution}"
  412. singularity:
  413. HICSNAKE_HOME + "/sifs/mustache_20240410.sif"
  414. threads: 6
  415. shell:
  416. """
  417. python /opt/conda/lib/python3.12/site-packages/mustache/diff_mustache.py -f1 {input.treatment_mcool} -f2 {input.control_mcool} --resolution {params.resolution} --outfile {params.prefix} --processes {threads} {params.diff_mustache}
  418. """