Browse Source

增加 emapperx_split.R 脚本用于大数据集优化,每次只处理一种GO本体类型,并更新README.md

User 1 month ago
parent
commit
45fa76f2ee
2 changed files with 120 additions and 0 deletions
  1. 19 0
      README.md
  2. 101 0
      emapperx_split.R

+ 19 - 0
README.md

@@ -32,3 +32,22 @@ Rscript ../emapperx.R out.emapper.annotations proteins.fa
 这一步两个功能:    
 1. 对 emapper 注释结果进行统计绘图      
 2. 构建 OrgDB 用于富集分析等     
+
+## 2.4 大数据集的另一种选择: emapperx_split.R
+
+对于大数据集,可以使用 `emapperx_split.R` 脚本,它每次只构建一种GO本体类型(MF、BP或CC)的OrgDB包,有效解决内存不足问题。
+
+```
+cd example_data
+
+# 构建分子功能(MF)的OrgDB
+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa MF
+
+# 构建生物过程(BP)的OrgDB
+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa BP
+
+# 构建细胞组分(CC)的OrgDB
+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa CC
+```
+
+这种方法可以显著减少内存使用,适合处理大型基因组数据集。

+ 101 - 0
emapperx_split.R

@@ -0,0 +1,101 @@
+#!/usr/bin/env Rscript
+
+###############################################
+# parse parameter 
+###############################################
+library(argparser, quietly=TRUE) 
+
+p <- arg_parser("make OrgDB from emapper")
+p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
+p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
+p <- add_argument(p, "ontology", help="GO ontology type (MF, BP, or CC)", 
+                 default="BP", type="character")
+
+argv <- parse_args(p)
+
+# set script dir
+script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
+
+###############################################
+# test input 
+###############################################
+# argv <- list()
+# argv$emapper_anno <- 'out.emapper.annotations'
+# argv$proteins <- 'proteins.fa'
+# script_dir <- 'emcp'
+
+library(tidyverse, quietly = TRUE)
+library(formattable, quietly = TRUE)
+library(AnnotationForge, quietly = TRUE)
+library(seqinr, quietly = TRUE)
+library(clusterProfiler, quietly = TRUE)
+library(GO.db, quietly = TRUE)
+
+
+###############################################
+# parse parameter 
+###############################################
+emapper <- read_delim(argv$emapper_anno, 
+                      "\t", escape_double = FALSE, col_names = FALSE, 
+                      comment = "#", trim_ws = TRUE) %>%
+  dplyr::select(GID = X1, 
+                COG = X7,
+                Gene_Name = X8,
+                Gene_Symbol = X9,
+                GO = X10,
+                KO = X12,
+                Pathway = X13
+                )
+
+###############################################
+# make OrgDB 
+###############################################
+# gene name
+gene_info <- dplyr::select(emapper,  GID, Gene_Name) %>%
+  dplyr::filter(!is.na(Gene_Name)) %>%
+  dplyr::filter(Gene_Name != '-') 
+eggnog_anno = length(gene_info$GID)
+
+# gene to gene ontology
+gene2go <- dplyr::select(emapper, GID, GO) %>%
+  separate_rows(GO, sep = ',', convert = F) %>%
+  filter(!is.na(GO)) %>%
+  filter(str_detect(GO, '^GO')) %>%
+  mutate(EVIDENCE = 'IEA') 
+
+go_terms <- AnnotationDbi::select(GO.db, 
+                                 keys = unique(gene2go$GO), 
+                                 columns = "ONTOLOGY", 
+                                 keytype = "GOID")
+
+# 分别处理三种GO类型
+message("拆分GO数据...")
+gene2go <- gene2go %>%
+    left_join(go_terms, by = c("GO" = "GOID")) %>%
+    filter(ONTOLOGY == argv$ontology) %>%
+    dplyr::select(GID, GO, EVIDENCE) %>%
+    distinct()  
+
+  
+# make org package
+makeOrgPackage(gene_info=gene_info,
+               go=gene2go,
+               maintainer='zhangsan <zhangsan@genek.cn>',
+               author='zhangsan',
+               outputDir="./",
+               tax_id=0000,
+               genus='M',
+               species='y',
+               goTable="go",
+               version="1.0")
+
+# build package
+pkgbuild::build('.//org.My.eg.db', dest_path = ".")
+
+# install package  
+dir.create('R_Library', recursive = T)
+install.packages('org.My.eg.db_1.0.tar.gz',
+                 repos = NULL, #从本地安装
+                 lib = 'R_Library') # 安装文件夹
+library(org.My.eg.db, lib = 'R_Library')
+