소스 검색

suppor SAF format

zhangxudong 10 달 전
부모
커밋
05b2977565
2개의 변경된 파일56개의 추가작업 그리고 30개의 파일을 삭제
  1. 7 1
      README.md
  2. 49 29
      run-featurecounts.R

+ 7 - 1
README.md

@@ -9,6 +9,12 @@ git clone http://git.genek.cn:3333/zhxd2/RunFeatureCounts.git
 
 # 二.使用
 
+对 GTF 定量
 ```
-Rscript ../11.software/RunFeatureCounts/run-featurecounts.R -b ../21.Mapping/BLO_S1_LD1.bam -g ../12.ref/genes.gtf -f exon -a gene_id -i FALSE -s 2 -o BLO_S1_LD1
+Rscript RunFeatureCounts/run-featurecounts.R -b ../21.Mapping/BLO_S1_LD1.bam -g ../12.ref/genes.gtf -f exon -a gene_id -i FALSE -s 2 -o BLO_S1_LD1
+```
+
+对 SAF 定量
+```
+Rscript RunFeatureCounts/run-featurecounts.R -b aligned/FoxA1_E2_rep1_sorted_filtered.bam --saf merged_peaks.saf --isPairedEnd FALSE -o FoxA1_E2_rep1
 ```

+ 49 - 29
run-featurecounts.R

@@ -1,41 +1,61 @@
 #!/usr/bin/env Rscript
-# parse parameter ---------------------------------------------------------
+
+# 导入必要的库
 library(argparser, quietly=TRUE)
-# Create a parser
-p <- arg_parser("run featureCounts and calculate FPKM/TPM")
+library(Rsubread)
+library(limma)
+library(edgeR)
 
-# Add command line arguments
+# 创建参数解析器
+p <- arg_parser("Run featureCounts and calculate FPKM/TPM")
+
+# 添加参数
 p <- add_argument(p, "--bam", help="input: bam file", type="character")
-p <- add_argument(p, "--gtf", help="input: gtf file", type="character")
-p <- add_argument(p, "--featureType", help="a character string or a vector of character strings giving the feature type or types used to select rows in the GTF annotation which will be used for read summarization", type="character", default="exon")
-p <- add_argument(p, "--attrType", help="a character string giving the attribute type in the GTF annotation which will be used to group features (eg. exons) into meta-features", type="character", default="gene_id")
-p <- add_argument(p, "--isPairedEnd", help="indicating whether libraries contain paired-end reads or not", type="logical", default=TRUE)
-p <- add_argument(p, "--strandSpecific", help="0 (unstranded), 1 (stranded) and 2 (reversely stranded)", type="numeric", default=0)
-p <- add_argument(p, "--output", help="output prefix", type="character")
-
-# Parse the command line arguments
+p <- add_argument(p, "--gtf", help="input: gtf file", type="character", default=NA)
+p <- add_argument(p, "--saf", help="input: saf file", type="character", default=NA)
+p <- add_argument(p, "--isPairedEnd", help="Paired-end reads (default: TRUE)", type="logical", default=TRUE)
+p <- add_argument(p, "--strandSpecific", help="Strand specificity (0, 1, 2)", type="numeric", default=0)
+p <- add_argument(p, "--output", help="Output prefix", type="character")
+
+# 解析命令行参数
 argv <- parse_args(p)
 
-library(Rsubread)
-library(limma)
-library(edgeR)
+# 检查 gtf 和 saf 参数是否互斥
+if (!is.na(argv$gtf) && !is.na(argv$saf)) {
+  stop("Cannot use both --gtf and --saf. Choose one.")
+}
 
+# 主要计算流程
 bamFile <- argv$bam
-gtfFile <- argv$gtf
-nthreads <- 1
+annotFile <- if (!is.na(argv$gtf)) argv$gtf else argv$saf
+isGTF <- !is.na(argv$gtf)
 outFilePref <- argv$output
 
-outStatsFilePath  <- paste(outFilePref, '.log',  sep = ''); 
-outCountsFilePath <- paste(outFilePref, '.count', sep = ''); 
-
-fCountsList = featureCounts(bamFile, annot.ext=gtfFile, isGTFAnnotationFile=TRUE, nthreads=nthreads, GTF.featureType=argv$featureType, GTF.attrType=argv$attrType, isPairedEnd=argv$isPairedEnd, strandSpecific=argv$strandSpecific)
-dgeList = DGEList(counts=fCountsList$counts, genes=fCountsList$annotation)
-cpm = cpm(dgeList)
-fpkm = rpkm(dgeList, dgeList$genes$Length)
-tpm = exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
-
+# 输出文件路径
+outStatsFilePath  <- paste0(outFilePref, '.log')
+outCountsFilePath <- paste0(outFilePref, '.count')
+
+# 运行 featureCounts
+fCountsList <- featureCounts(
+  files = bamFile,
+  annot.ext = annotFile,
+  isGTFAnnotationFile = isGTF,
+  nthreads = 1,
+  isPairedEnd = argv$isPairedEnd,
+  strandSpecific = argv$strandSpecific
+)
+
+# 创建 DGEList
+dgeList <- DGEList(counts = fCountsList$counts, genes = fCountsList$annotation)
+
+# 计算表达量
+cpm <- cpm(dgeList)
+fpkm <- rpkm(dgeList, dgeList$genes$Length)
+tpm <- exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
+
+# 写入输出文件
 write.table(fCountsList$stat, outStatsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
-
-featureCounts = cbind(fCountsList$annotation[,1], fCountsList$counts, fpkm, tpm, cpm)
-colnames(featureCounts) = c('gene_id', 'counts', 'fpkm','tpm', 'cpm')
+featureCounts <- cbind(fCountsList$annotation[,1], fCountsList$counts, fpkm, tpm, cpm)
+colnames(featureCounts) <- c('gene_id', 'counts', 'fpkm','tpm', 'cpm')
 write.table(featureCounts, outCountsFilePath, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
+