run-featurecounts.R 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. #!/usr/bin/env Rscript
  2. # 导入必要的库
  3. library(argparser, quietly=TRUE)
  4. library(Rsubread)
  5. library(limma)
  6. library(edgeR)
  7. # 创建参数解析器
  8. p <- arg_parser("Run featureCounts and calculate FPKM/TPM")
  9. # 添加参数
  10. p <- add_argument(p, "--bam", help="input: bam file", type="character")
  11. p <- add_argument(p, "--strandSpecific", help="Strand specificity (0, 1, 2)", type="numeric", default=0)
  12. p <- add_argument(p, "--gtf", help="input: gtf file", type="character", default=NA)
  13. p <- add_argument(p, "--saf", help="input: saf file", type="character", default=NA)
  14. p <- add_argument(p, "--isPairedEnd", help="Paired-end reads, TRUE or FALSE", type="logical", default=TRUE)
  15. p <- add_argument(p, "--featureType", help="Feature type in GTF annotation (default: exon)", type="character", default="exon")
  16. p <- add_argument(p, "--attrType", help="Attribute type in GTF annotation (default: gene_id)", type="character", default="gene_id")
  17. p <- add_argument(p, "--output", help="Output prefix", type="character")
  18. # 解析命令行参数
  19. argv <- parse_args(p)
  20. # 检查 gtf 和 saf 参数是否互斥
  21. if (!is.na(argv$gtf) && !is.na(argv$saf)) {
  22. stop("Cannot use both --gtf and --saf. Choose one.")
  23. }
  24. # 主要计算流程
  25. bamFile <- argv$bam
  26. annotFile <- if (!is.na(argv$gtf)) argv$gtf else argv$saf
  27. isGTF <- !is.na(argv$gtf)
  28. outFilePref <- argv$output
  29. # 输出文件路径
  30. outStatsFilePath <- paste0(outFilePref, '.log')
  31. outCountsFilePath <- paste0(outFilePref, '.count')
  32. # 运行 featureCounts
  33. fCountsArgs <- list(
  34. files = bamFile,
  35. annot.ext = annotFile,
  36. nthreads = 1,
  37. isPairedEnd = argv$isPairedEnd,
  38. strandSpecific = argv$strandSpecific
  39. )
  40. if (isGTF) {
  41. fCountsArgs$isGTFAnnotationFile <- TRUE
  42. fCountsArgs$GTF.featureType <- argv$featureType
  43. fCountsArgs$GTF.attrType <- argv$attrType
  44. } else {
  45. fCountsArgs$isGTFAnnotationFile <- FALSE
  46. }
  47. fCountsList <- do.call(featureCounts, fCountsArgs)
  48. # 创建 DGEList
  49. dgeList <- DGEList(counts = fCountsList$counts, genes = fCountsList$annotation)
  50. # 计算表达量
  51. cpm <- cpm(dgeList)
  52. fpkm <- rpkm(dgeList, dgeList$genes$Length)
  53. tpm <- exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
  54. # 写入输出文件
  55. write.table(fCountsList$stat, outStatsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
  56. # 使用 --attrType 参数的值作为第一列的列名
  57. firstColName <- argv$attrType
  58. featureCounts <- cbind(fCountsList$annotation[,1], fCountsList$counts, fpkm, tpm, cpm)
  59. colnames(featureCounts) <- c(firstColName, 'counts', 'fpkm', 'tpm', 'cpm')
  60. write.table(featureCounts, outCountsFilePath, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)