|
@@ -0,0 +1,101 @@
|
|
|
|
+#!/usr/bin/env Rscript
|
|
|
|
+
|
|
|
|
+###############################################
|
|
|
|
+# parse parameter
|
|
|
|
+###############################################
|
|
|
|
+library(argparser, quietly=TRUE)
|
|
|
|
+
|
|
|
|
+p <- arg_parser("make OrgDB from emapper")
|
|
|
|
+p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
|
|
|
|
+p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
|
|
|
|
+p <- add_argument(p, "ontology", help="GO ontology type (MF, BP, or CC)",
|
|
|
|
+ default="BP", type="character")
|
|
|
|
+
|
|
|
|
+argv <- parse_args(p)
|
|
|
|
+
|
|
|
|
+# set script dir
|
|
|
|
+script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
|
|
|
|
+
|
|
|
|
+###############################################
|
|
|
|
+# test input
|
|
|
|
+###############################################
|
|
|
|
+# argv <- list()
|
|
|
|
+# argv$emapper_anno <- 'out.emapper.annotations'
|
|
|
|
+# argv$proteins <- 'proteins.fa'
|
|
|
|
+# script_dir <- 'emcp'
|
|
|
|
+
|
|
|
|
+library(tidyverse, quietly = TRUE)
|
|
|
|
+library(formattable, quietly = TRUE)
|
|
|
|
+library(AnnotationForge, quietly = TRUE)
|
|
|
|
+library(seqinr, quietly = TRUE)
|
|
|
|
+library(clusterProfiler, quietly = TRUE)
|
|
|
|
+library(GO.db, quietly = TRUE)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+###############################################
|
|
|
|
+# parse parameter
|
|
|
|
+###############################################
|
|
|
|
+emapper <- read_delim(argv$emapper_anno,
|
|
|
|
+ "\t", escape_double = FALSE, col_names = FALSE,
|
|
|
|
+ comment = "#", trim_ws = TRUE) %>%
|
|
|
|
+ dplyr::select(GID = X1,
|
|
|
|
+ COG = X7,
|
|
|
|
+ Gene_Name = X8,
|
|
|
|
+ Gene_Symbol = X9,
|
|
|
|
+ GO = X10,
|
|
|
|
+ KO = X12,
|
|
|
|
+ Pathway = X13
|
|
|
|
+ )
|
|
|
|
+
|
|
|
|
+###############################################
|
|
|
|
+# make OrgDB
|
|
|
|
+###############################################
|
|
|
|
+# gene name
|
|
|
|
+gene_info <- dplyr::select(emapper, GID, Gene_Name) %>%
|
|
|
|
+ dplyr::filter(!is.na(Gene_Name)) %>%
|
|
|
|
+ dplyr::filter(Gene_Name != '-')
|
|
|
|
+eggnog_anno = length(gene_info$GID)
|
|
|
|
+
|
|
|
|
+# gene to gene ontology
|
|
|
|
+gene2go <- dplyr::select(emapper, GID, GO) %>%
|
|
|
|
+ separate_rows(GO, sep = ',', convert = F) %>%
|
|
|
|
+ filter(!is.na(GO)) %>%
|
|
|
|
+ filter(str_detect(GO, '^GO')) %>%
|
|
|
|
+ mutate(EVIDENCE = 'IEA')
|
|
|
|
+
|
|
|
|
+go_terms <- AnnotationDbi::select(GO.db,
|
|
|
|
+ keys = unique(gene2go$GO),
|
|
|
|
+ columns = "ONTOLOGY",
|
|
|
|
+ keytype = "GOID")
|
|
|
|
+
|
|
|
|
+# 分别处理三种GO类型
|
|
|
|
+message("拆分GO数据...")
|
|
|
|
+gene2go <- gene2go %>%
|
|
|
|
+ left_join(go_terms, by = c("GO" = "GOID")) %>%
|
|
|
|
+ filter(ONTOLOGY == argv$ontology) %>%
|
|
|
|
+ dplyr::select(GID, GO, EVIDENCE) %>%
|
|
|
|
+ distinct()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+# make org package
|
|
|
|
+makeOrgPackage(gene_info=gene_info,
|
|
|
|
+ go=gene2go,
|
|
|
|
+ maintainer='zhangsan <zhangsan@genek.cn>',
|
|
|
|
+ author='zhangsan',
|
|
|
|
+ outputDir="./",
|
|
|
|
+ tax_id=0000,
|
|
|
|
+ genus='M',
|
|
|
|
+ species='y',
|
|
|
|
+ goTable="go",
|
|
|
|
+ version="1.0")
|
|
|
|
+
|
|
|
|
+# build package
|
|
|
|
+pkgbuild::build('.//org.My.eg.db', dest_path = ".")
|
|
|
|
+
|
|
|
|
+# install package
|
|
|
|
+dir.create('R_Library', recursive = T)
|
|
|
|
+install.packages('org.My.eg.db_1.0.tar.gz',
|
|
|
|
+ repos = NULL, #从本地安装
|
|
|
|
+ lib = 'R_Library') # 安装文件夹
|
|
|
|
+library(org.My.eg.db, lib = 'R_Library')
|
|
|
|
+
|