1 год назад · 45fa76f2ee
--- a/README.md
+++ b/README.md
@@ -32,3 +32,22 @@ Rscript ../emapperx.R out.emapper.annotations proteins.fa
 
															 这一步两个功能:    
														
 
															 1. 对 emapper 注释结果进行统计绘图      
														
 
															 2. 构建 OrgDB 用于富集分析等     
														
 
															+
														
 
															+## 2.4 大数据集的另一种选择: emapperx_split.R
														
 
															+
														
 
															+对于大数据集，可以使用 `emapperx_split.R` 脚本，它每次只构建一种GO本体类型（MF、BP或CC）的OrgDB包，有效解决内存不足问题。
														
 
															+
														
 
															+```
														
 
															+cd example_data
														
 
															+
														
 
															+# 构建分子功能(MF)的OrgDB
														
 
															+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa MF
														
 
															+
														
 
															+# 构建生物过程(BP)的OrgDB
														
 
															+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa BP
														
 
															+
														
 
															+# 构建细胞组分(CC)的OrgDB
														
 
															+Rscript ../emapperx_split.R out.emapper.annotations proteins.fa CC
														
 
															+```
														
 
															+
														
 
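															+这种方法可以显著减少内存使用，适合处理大型基因组数据集。注意：每次运行生成的包名均为 `org.My.eg.db`（版本 1.0），并安装到当前目录下的 `R_Library`，因此针对不同本体的多次运行会产生同名输出，彼此冲突或被覆盖；如需同时保留 MF、BP、CC 三种 OrgDB，请在不同的工作目录下分别运行。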
														
--- a/emapperx_split.R
+++ b/emapperx_split.R
@@ -0,0 +1,101 @@
 
															+#!/usr/bin/env Rscript
														
 
															+
														
 
															+###############################################
														
 
															+# parse parameter 
														
 
															+###############################################
														
 
															+library(argparser, quietly=TRUE) 
														
 
															+
														
 
															+# Command-line interface: annotation table, protein FASTA, GO ontology.
															+p <- arg_parser("make OrgDB from emapper")
															+p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
															+p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
															+p <- add_argument(p, "ontology", help="GO ontology type (MF, BP, or CC)", 
															+                 default="BP", type="character")
															+
															+argv <- parse_args(p)
															+
															+# Normalise and validate the ontology immediately: the GO table is later
															+# filtered with ONTOLOGY == argv$ontology, so a typo or lower-case value
															+# would otherwise silently produce an empty GO table.
															+argv$ontology <- toupper(argv$ontology)
															+if (!argv$ontology %in% c("MF", "BP", "CC")) {
															+  stop("ontology must be one of MF, BP or CC, got: ", argv$ontology,
															+       call. = FALSE)
															+}
														
 
															+
														
 
															+# set script dir
														
 
															+# Find the "--file=<path>" entry by its prefix rather than by position:
															+# its index in commandArgs() shifts when extra R options are passed, and
															+# splitting on "=" would truncate a path that itself contains "=".
															+file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
															+script_dir <- if (length(file_arg) > 0) {
															+  dirname(sub("^--file=", "", file_arg[1]))
															+} else {
															+  "."  # not started via Rscript <file>; fall back to the working directory
															+}
														
 
															+
														
 
															+###############################################
														
 
															+# test input 
														
 
															+###############################################
														
 
															+# argv <- list()
														
 
															+# argv$emapper_anno <- 'out.emapper.annotations'
														
 
															+# argv$proteins <- 'proteins.fa'
														
 
															+# script_dir <- 'emcp'
														
 
															+
														
 
															+library(tidyverse, quietly = TRUE)
														
 
															+library(formattable, quietly = TRUE)
														
 
															+library(AnnotationForge, quietly = TRUE)
														
 
															+library(seqinr, quietly = TRUE)
														
 
															+library(clusterProfiler, quietly = TRUE)
														
 
															+library(GO.db, quietly = TRUE)
														
 
															+
														
 
															+
														
 
															+###############################################
														
 
															+# read emapper annotation 
														
 
															+###############################################
														
 
															+# Load the annotation table (tab-separated, no header row, lines starting
															+# with "#" skipped) and keep seven columns under descriptive names.
															+emapper <- read_delim(
															+  argv$emapper_anno,
															+  delim = "\t",
															+  col_names = FALSE,
															+  comment = "#",
															+  escape_double = FALSE,
															+  trim_ws = TRUE
															+) %>%
															+  dplyr::select(
															+    GID = X1,
															+    COG = X7,
															+    Gene_Name = X8,
															+    Gene_Symbol = X9,
															+    GO = X10,
															+    KO = X12,
															+    Pathway = X13
															+  )
														
 
															+
														
 
															+###############################################
														
 
															+# make OrgDB 
														
 
															+###############################################
														
 
															+# gene name
														
 
															+# Gene table for the OrgDb: one GID/Gene_Name pair per row, dropping rows
															+# whose name is missing or is the "-" placeholder.
															+gene_info <- emapper %>%
															+  dplyr::select(GID, Gene_Name) %>%
															+  dplyr::filter(!is.na(Gene_Name), Gene_Name != '-')
															+# Number of genes that kept a name.
															+eggnog_anno <- nrow(gene_info)
														
 
															+
														
 
															+# gene to gene ontology
														
 
															+# Long-format GO table: one row per (gene, GO term). The GO column holds a
															+# comma-separated list; entries that do not start with "GO" are discarded
															+# and every remaining row is tagged with evidence code 'IEA'.
															+# Verbs are namespaced explicitly because AnnotationDbi and clusterProfiler
															+# are attached after the tidyverse and can mask them.
															+gene2go <- dplyr::select(emapper, GID, GO) %>%
															+  tidyr::separate_rows(GO, sep = ',', convert = FALSE) %>%
															+  dplyr::filter(!is.na(GO)) %>%
															+  dplyr::filter(stringr::str_detect(GO, '^GO')) %>%
															+  dplyr::mutate(EVIDENCE = 'IEA')
															+
															+# Fail early with a readable message instead of an opaque lookup error.
															+if (nrow(gene2go) == 0) {
															+  stop("no GO annotations found in ", argv$emapper_anno, call. = FALSE)
															+}
															+
															+# Look up the ontology (MF/BP/CC) of every distinct GO ID.
															+go_terms <- AnnotationDbi::select(GO.db,
															+                                 keys = unique(gene2go$GO),
															+                                 columns = "ONTOLOGY",
															+                                 keytype = "GOID")
															+
															+# Keep only the requested ontology
															+message("拆分GO数据...")
															+# Restrict the lookup table first so that the join only ever materialises
															+# rows of the requested ontology (lower peak memory than join-then-filter;
															+# GO IDs unknown to GO.db have ONTOLOGY == NA and are dropped either way).
															+go_terms <- go_terms[!is.na(go_terms$ONTOLOGY) &
															+                       go_terms$ONTOLOGY == argv$ontology, , drop = FALSE]
															+gene2go <- gene2go %>%
															+    dplyr::inner_join(go_terms, by = c("GO" = "GOID")) %>%
															+    dplyr::select(GID, GO, EVIDENCE) %>%
															+    dplyr::distinct()
															+
															+if (nrow(gene2go) == 0) {
															+  stop("no GO terms of ontology ", argv$ontology, " found", call. = FALSE)
															+}
														
 
															+
														
 
															+  
														
 
															+# make org package
														
 
															+# Generate the OrgDb package source in the current directory from the gene
															+# table and the ontology-filtered GO table; goTable="go" names the `go`
															+# argument as the table holding the GO annotations.
															+# maintainer/author/tax_id/genus/species are placeholder values.
															+# NOTE(review): the build step below expects a directory named
															+# org.My.eg.db, presumably derived from genus 'M' and species 'y' --
															+# confirm before changing either value.
															+# NOTE(review): tax_id=0000 is a numeric literal (i.e. 0); the
															+# AnnotationForge examples pass a character taxonomy ID -- confirm.
															+makeOrgPackage(gene_info=gene_info,
														
 
															+               go=gene2go,
														
 
															+               maintainer='zhangsan <zhangsan@genek.cn>',
														
 
															+               author='zhangsan',
														
 
															+               outputDir="./",
														
 
															+               tax_id=0000,
														
 
															+               genus='M',
														
 
															+               species='y',
														
 
															+               goTable="go",
														
 
															+               version="1.0")
														
 
															+
														
 
															+# build package
														
 
															+# Build the source tarball of the generated package into the current dir.
															+pkgbuild::build('.//org.My.eg.db', dest_path = ".")
															+
															+# install package
															+# showWarnings = FALSE: R_Library already exists when the script is re-run
															+# (e.g. for another ontology), which would otherwise emit a warning.
															+dir.create('R_Library', recursive = TRUE, showWarnings = FALSE)
															+install.packages('org.My.eg.db_1.0.tar.gz',
															+                 repos = NULL, # install from the local tarball
															+                 lib = 'R_Library') # target library directory
															+# Spell out lib.loc instead of relying on partial matching of `lib`.
															+library(org.My.eg.db, lib.loc = 'R_Library')
														
 
															+