#!/usr/bin/env Rscript # 导入必要的库 library(argparser, quietly=TRUE) library(Rsubread) library(limma) library(edgeR) # 创建参数解析器 p <- arg_parser("Run featureCounts and calculate FPKM/TPM") # 添加参数 p <- add_argument(p, "--bam", help="input: bam file", type="character") p <- add_argument(p, "--gtf", help="input: gtf file", type="character", default=NA) p <- add_argument(p, "--saf", help="input: saf file", type="character", default=NA) p <- add_argument(p, "--isPairedEnd", help="Paired-end reads (default: TRUE)", type="logical", default=TRUE) p <- add_argument(p, "--strandSpecific", help="Strand specificity (0, 1, 2)", type="numeric", default=0) p <- add_argument(p, "--output", help="Output prefix", type="character") # 解析命令行参数 argv <- parse_args(p) # 检查 gtf 和 saf 参数是否互斥 if (!is.na(argv$gtf) && !is.na(argv$saf)) { stop("Cannot use both --gtf and --saf. Choose one.") } # 主要计算流程 bamFile <- argv$bam annotFile <- if (!is.na(argv$gtf)) argv$gtf else argv$saf isGTF <- !is.na(argv$gtf) outFilePref <- argv$output # 输出文件路径 outStatsFilePath <- paste0(outFilePref, '.log') outCountsFilePath <- paste0(outFilePref, '.count') # 运行 featureCounts fCountsList <- featureCounts( files = bamFile, annot.ext = annotFile, isGTFAnnotationFile = isGTF, nthreads = 1, isPairedEnd = argv$isPairedEnd, strandSpecific = argv$strandSpecific ) # 创建 DGEList dgeList <- DGEList(counts = fCountsList$counts, genes = fCountsList$annotation) # 计算表达量 cpm <- cpm(dgeList) fpkm <- rpkm(dgeList, dgeList$genes$Length) tpm <- exp(log(fpkm) - log(sum(fpkm)) + log(1e6)) # 写入输出文件 write.table(fCountsList$stat, outStatsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) featureCounts <- cbind(fCountsList$annotation[,1], fCountsList$counts, fpkm, tpm, cpm) colnames(featureCounts) <- c('gene_id', 'counts', 'fpkm','tpm', 'cpm') write.table(featureCounts, outCountsFilePath, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)