11 months ago · 45fa76f2ee
--- a/README.md
+++ b/README.md
@@ -32,3 +32,22 @@ Rscript ../emapperx.R out.emapper.annotations proteins.fa
 
				 这一步两个功能:    
			
 
				 1. 对 emapper 注释结果进行统计绘图      
			
 
				 2. 构建 OrgDB 用于富集分析等     
			
 
				+
			
 
				+## 2.4 大数据集的另一种选择: emapperx_split.R
			
 
				+
			
 
				+对于大数据集，可以使用 `emapperx_split.R` 脚本，它每次只构建一种GO本体类型（MF、BP或CC）的OrgDB包，有效解决内存不足问题。
			
 
				+
			
 
				+```
			
 
				+cd example_data
			
 
				+
			
 
				+# 构建分子功能(MF)的OrgDB
			
 
				+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa MF
			
 
				+
			
 
				+# 构建生物过程(BP)的OrgDB
			
 
				+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa BP
			
 
				+
			
 
				+# 构建细胞组分(CC)的OrgDB
			
 
				+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa CC
			
 
				+```
			
 
				+
			
 
				+这种方法可以显著减少内存使用，适合处理大型基因组数据集。
			
--- a/emapperx_split.R
+++ b/emapperx_split.R
@@ -0,0 +1,101 @@
 
				+#!/usr/bin/env Rscript
			
 
				+
			
 
				+###############################################
			
 
				+# parse parameter 
			
 
				+###############################################
			
 
				+library(argparser, quietly=TRUE) 
			
 
				+
			
 
				+p <- arg_parser("make OrgDB from emapper")
			
 
				+p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
			
 
				+p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
			
 
				+p <- add_argument(p, "ontology", help="GO ontology type (MF, BP, or CC)", 
			
 
				+                 default="BP", type="character")
			
 
				+
			
 
				+argv <- parse_args(p)
			
 
				+
			
 
				+# set script dir
			
 
				+script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
			
 
				+
			
 
				+###############################################
			
 
				+# test input 
			
 
				+###############################################
			
 
				+# argv <- list()
			
 
				+# argv$emapper_anno <- 'out.emapper.annotations'
			
 
				+# argv$proteins <- 'proteins.fa'
			
 
				+# script_dir <- 'emcp'
			
 
				+
			
 
				+library(tidyverse, quietly = TRUE)
			
 
				+library(formattable, quietly = TRUE)
			
 
				+library(AnnotationForge, quietly = TRUE)
			
 
				+library(seqinr, quietly = TRUE)
			
 
				+library(clusterProfiler, quietly = TRUE)
			
 
				+library(GO.db, quietly = TRUE)
			
 
				+
			
 
				+
			
 
				+###############################################
			
 
				+# parse parameter 
			
 
				+###############################################
			
 
				+emapper <- read_delim(argv$emapper_anno, 
			
 
				+                      "\t", escape_double = FALSE, col_names = FALSE, 
			
 
				+                      comment = "#", trim_ws = TRUE) %>%
			
 
				+  dplyr::select(GID = X1, 
			
 
				+                COG = X7,
			
 
				+                Gene_Name = X8,
			
 
				+                Gene_Symbol = X9,
			
 
				+                GO = X10,
			
 
				+                KO = X12,
			
 
				+                Pathway = X13
			
 
				+                )
			
 
				+
			
 
				+###############################################
			
 
				+# make OrgDB 
			
 
				+###############################################
			
 
				+# gene name
			
 
				+gene_info <- dplyr::select(emapper,  GID, Gene_Name) %>%
			
 
				+  dplyr::filter(!is.na(Gene_Name)) %>%
			
 
				+  dplyr::filter(Gene_Name != '-') 
			
 
				+eggnog_anno = length(gene_info$GID)
			
 
				+
			
 
				+# gene to gene ontology
			
 
				+gene2go <- dplyr::select(emapper, GID, GO) %>%
			
 
				+  separate_rows(GO, sep = ',', convert = F) %>%
			
 
				+  filter(!is.na(GO)) %>%
			
 
				+  filter(str_detect(GO, '^GO')) %>%
			
 
				+  mutate(EVIDENCE = 'IEA') 
			
 
				+
			
 
				+go_terms <- AnnotationDbi::select(GO.db, 
			
 
				+                                 keys = unique(gene2go$GO), 
			
 
				+                                 columns = "ONTOLOGY", 
			
 
				+                                 keytype = "GOID")
			
 
				+
			
 
				+# 分别处理三种GO类型
			
 
				+message("拆分GO数据...")
			
 
				+gene2go <- gene2go %>%
			
 
				+    left_join(go_terms, by = c("GO" = "GOID")) %>%
			
 
				+    filter(ONTOLOGY == argv$ontology) %>%
			
 
				+    dplyr::select(GID, GO, EVIDENCE) %>%
			
 
				+    distinct()  
			
 
				+
			
 
				+  
			
 
				+# make org package
			
 
				+makeOrgPackage(gene_info=gene_info,
			
 
				+               go=gene2go,
			
 
				+               maintainer='zhangsan <zhangsan@genek.cn>',
			
 
				+               author='zhangsan',
			
 
				+               outputDir="./",
			
 
				+               tax_id=0000,
			
 
				+               genus='M',
			
 
				+               species='y',
			
 
				+               goTable="go",
			
 
				+               version="1.0")
			
 
				+
			
 
				+# build package
			
 
				+pkgbuild::build('.//org.My.eg.db', dest_path = ".")
			
 
				+
			
 
				+# install package  
			
 
				+dir.create('R_Library', recursive = T)
			
 
				+install.packages('org.My.eg.db_1.0.tar.gz',
			
 
				+                 repos = NULL, #从本地安装
			
 
				+                 lib = 'R_Library') # 安装文件夹
			
 
				+library(org.My.eg.db, lib = 'R_Library')
			
 
				+