|
@@ -0,0 +1,220 @@
|
|
|
+#!/usr/bin/env Rscript
|
|
|
+
|
|
|
+###############################################
|
|
|
+# parse parameter
|
|
|
+###############################################
|
|
|
+library(argparser, quietly=TRUE)
|
|
|
+
|
|
|
+p <- arg_parser("make OrgDB from emapper")
|
|
|
+p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
|
|
|
+p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
|
|
|
+
|
|
|
+argv <- parse_args(p)
|
|
|
+
|
|
|
+# set script dir
|
|
|
+script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
|
|
|
+
|
|
|
+###############################################
|
|
|
+# test input
|
|
|
+###############################################
|
|
|
+# argv <- list()
|
|
|
+# argv$emapper_anno <- 'example_data/my.emapper.annotations'
|
|
|
+# argv$proteins <- 'example_data/Sind.pep.fasta'
|
|
|
+# script_dir <- '/home/zhxd/workspace/Genomics/43.ProteinFunctionAnnotation/emcp'
|
|
|
+
|
|
|
+library(tidyverse, quietly = TRUE)
|
|
|
+library(formattable, quietly = TRUE)
|
|
|
+library(AnnotationForge, quietly = TRUE)
|
|
|
+library(seqinr, quietly = TRUE)
|
|
|
+library(clusterProfiler, quietly = TRUE)
|
|
|
+
|
|
|
+
|
|
|
+###############################################
|
|
|
+# parse parameter
|
|
|
+###############################################
|
|
|
+emapper <- read_delim(argv$emapper_anno,
|
|
|
+ "\t", escape_double = FALSE, col_names = FALSE,
|
|
|
+ comment = "#", trim_ws = TRUE) %>%
|
|
|
+ dplyr::select(GID = X1,
|
|
|
+ COG = X10,
|
|
|
+ Gene_Name = X11,
|
|
|
+ Gene_Symbol = X12,
|
|
|
+ GO = X13,
|
|
|
+ KO = X15,
|
|
|
+ Pathway = X16
|
|
|
+ )
|
|
|
+
|
|
|
+###############################################
|
|
|
+# make OrgDB
|
|
|
+###############################################
|
|
|
+# gene name
|
|
|
+gene_info <- dplyr::select(emapper, GID, Gene_Name) %>%
|
|
|
+ dplyr::filter(!is.na(Gene_Name)) %>%
|
|
|
+ dplyr::filter(Gene_Name != '-')
|
|
|
+ eggnog_anno = length(gene_info$GID)
|
|
|
+
|
|
|
+# gene to gene ontology
|
|
|
+gene2go <- dplyr::select(emapper, GID, GO) %>%
|
|
|
+ separate_rows(GO, sep = ',', convert = F) %>%
|
|
|
+ filter(!is.na(GO)) %>%
|
|
|
+ filter(str_detect(GO, '^GO')) %>%
|
|
|
+ mutate(EVIDENCE = 'IEA')
|
|
|
+
|
|
|
+# make org package
|
|
|
+makeOrgPackage(gene_info=gene_info,
|
|
|
+ go=gene2go,
|
|
|
+ maintainer='zhangsan <zhangsan@genek.cn>',
|
|
|
+ author='zhangsan',
|
|
|
+ outputDir="./",
|
|
|
+ tax_id=0000,
|
|
|
+ genus='M',
|
|
|
+ species='y',
|
|
|
+ goTable="go",
|
|
|
+ version="1.0")
|
|
|
+
|
|
|
+# build package
|
|
|
+pkgbuild::build('.//org.My.eg.db', dest_path = ".")
|
|
|
+
|
|
|
+# install package
|
|
|
+dir.create('R_Library', recursive = T)
|
|
|
+install.packages('org.My.eg.db_1.0.tar.gz',
|
|
|
+ repos = NULL, #从本地安装
|
|
|
+ lib = 'R_Library') # 安装文件夹
|
|
|
+library(org.My.eg.db, lib = 'R_Library')
|
|
|
+
|
|
|
+###############################################
|
|
|
+# GO statistics and plot
|
|
|
+###############################################
|
|
|
+all_gene <- getName.list(read.fasta(file = argv$proteins,
|
|
|
+ seqtype = 'AA'))
|
|
|
+go_anno = length(unique(gene2go$GID))
|
|
|
+go_bp <- clusterProfiler::groupGO(gene = all_gene,
|
|
|
+ OrgDb = org.My.eg.db,
|
|
|
+ keyType = "GID",
|
|
|
+ ont = "BP",
|
|
|
+ level = 2,
|
|
|
+ readable = FALSE)
|
|
|
+
|
|
|
+go_bp <- as.data.frame(go_bp)
|
|
|
+go_bp$GO_Class <- "Biological Process"
|
|
|
+
|
|
|
+go_cc <- clusterProfiler::groupGO(gene = all_gene,
|
|
|
+ OrgDb = org.My.eg.db,
|
|
|
+ keyType = "GID",
|
|
|
+ ont = "CC",
|
|
|
+ level = 2,
|
|
|
+ readable = FALSE)
|
|
|
+
|
|
|
+go_cc <- as.data.frame(go_cc)
|
|
|
+go_cc$GO_Class <- "Cellular Component"
|
|
|
+
|
|
|
+go_mf <- clusterProfiler::groupGO(gene = all_gene,
|
|
|
+ OrgDb = org.My.eg.db,
|
|
|
+ keyType = "GID",
|
|
|
+ ont = "MF",
|
|
|
+ level = 2,
|
|
|
+ readable = FALSE)
|
|
|
+go_mf <- as.data.frame(go_mf)
|
|
|
+go_mf$GO_Class <- "Molecular Function"
|
|
|
+
|
|
|
+go_all <- rbind(go_bp, go_cc, go_mf)
|
|
|
+write.table(go_all, "go.txt", sep = "\t", quote = F)
|
|
|
+p <- ggplot(go_all) +
|
|
|
+ geom_bar(aes(x = Description,
|
|
|
+ y = Count,
|
|
|
+ fill = GO_Class),
|
|
|
+ stat = "identity") +
|
|
|
+ facet_wrap(~GO_Class, scales = "free_x") +
|
|
|
+ labs(title = "GO function classification", y = "Number of genes") +
|
|
|
+ theme_classic() +
|
|
|
+ theme(plot.title = element_text(hjust = 0.5),
|
|
|
+ axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
|
|
|
+ axis.title.x = element_blank(),
|
|
|
+ legend.position = "none")
|
|
|
+
|
|
|
+ggsave("go.pdf", p, width = 20, height = 7)
|
|
|
+
|
|
|
+###############################################
|
|
|
+# Pathway statistics and plot
|
|
|
+###############################################
|
|
|
+gene2pathway <- dplyr::select(emapper, GID, Pathway) %>%
|
|
|
+ separate_rows(Pathway, sep = ',', convert = F) %>%
|
|
|
+ filter(str_detect(Pathway, 'ko'))
|
|
|
+
|
|
|
+load(file = paste(script_dir, "kegg_info.RData", sep = "/"))
|
|
|
+
|
|
|
+gene2pathway <- gene2pathway %>%
|
|
|
+ left_join(pathway2name, by = "Pathway") %>%
|
|
|
+ dplyr::select(GID, Pathway, Pathway_Name, Pathway_Class, Pathway_Subclass) %>%
|
|
|
+ distinct() %>%
|
|
|
+ na.omit()
|
|
|
+
|
|
|
+pathway_anno = length(unique(gene2pathway$GID))
|
|
|
+
|
|
|
+pathway_stat <- dplyr::select(gene2pathway, GID, Pathway_Class, Pathway_Subclass) %>%
|
|
|
+ distinct() %>%
|
|
|
+ group_by(Pathway_Class, Pathway_Subclass) %>%
|
|
|
+ summarise(Count = n(), Percentage = percent(n()/pathway_anno))
|
|
|
+
|
|
|
+pathway_stat$Pathway_Subclass <- ordered(pathway_stat$Pathway_Subclass, levels = pathway_stat$Pathway_Subclass)
|
|
|
+
|
|
|
+p <- ggplot(pathway_stat, aes(x = Pathway_Subclass, y = Percentage)) +
|
|
|
+ geom_bar(aes(fill = Pathway_Class), stat = 'identity') +
|
|
|
+ geom_text(aes(label = Count), nudge_y = 0.005) +
|
|
|
+ scale_y_continuous(labels=percent) +
|
|
|
+ labs(y = "Percent of genes(%)", x ="", fill = "Class") +
|
|
|
+ coord_flip() +
|
|
|
+ theme_classic()
|
|
|
+
|
|
|
+ggsave("pathway.pdf", p, width = 20, height = 7)
|
|
|
+write.table(gene2pathway, file = "pathway.txt", sep = "\t", quote = F)
|
|
|
+write.table(pathway2name, file = 'pathway_name.txt', sep = '\t', quote = F)
|
|
|
+write.table(pathway_stat, file = "pathway_stat.txt", sep = "\t", quote = F, row.names = F)
|
|
|
+
|
|
|
+
|
|
|
+###############################################
|
|
|
+# COG statistics and plot
|
|
|
+###############################################
|
|
|
+cog_funclass <- read_delim(paste(script_dir, "cog_funclass.tab", sep = "/"),
|
|
|
+ "\t", escape_double = FALSE, trim_ws = TRUE)
|
|
|
+
|
|
|
+insert_comma <- function(x){
|
|
|
+ str_c(x, sep = '', collapse = ',')
|
|
|
+}
|
|
|
+
|
|
|
+gene2cog <- dplyr::select(emapper, GID, COG) %>%
|
|
|
+ filter(!is.na(COG)) %>%
|
|
|
+ filter(COG != '-') %>%
|
|
|
+ mutate(COG = sapply(str_split(COG, ''), insert_comma)) %>%
|
|
|
+ separate_rows(COG, sep = ',', convert = F) %>%
|
|
|
+ left_join(cog_funclass, by = c('COG' = 'COG'))
|
|
|
+
|
|
|
+cog_anno = length(unique(gene2cog$GID))
|
|
|
+
|
|
|
+write.table(gene2cog, file = "cog.txt", sep = "\t", quote = F, row.names = F)
|
|
|
+
|
|
|
+p <- ggplot(data = gene2cog) +
|
|
|
+ geom_bar(aes(x = COG,
|
|
|
+ fill = COG_Name)) +
|
|
|
+ labs(title = "COG/KOG Function Classification ",
|
|
|
+ x = "",
|
|
|
+ y = "Number of genes") +
|
|
|
+ theme_classic() +
|
|
|
+ theme(plot.title = element_text(hjust = 0.5),
|
|
|
+ legend.title = element_blank(),
|
|
|
+ legend.key.size=unit(1,"line"),
|
|
|
+ legend.text = element_text(size = 7.5)) +
|
|
|
+ guides(fill=guide_legend(ncol=1))
|
|
|
+ggsave("cog.pdf", p, width = 16, height = 7)
|
|
|
+
|
|
|
+###############################################
|
|
|
+# number and percentage
|
|
|
+###############################################
|
|
|
+total_gene = length(all_gene)
|
|
|
+anno_stat <- tibble(
|
|
|
+ database = c("EggNOG", "GO", "COG/KOG", "KEGG Pathway"),
|
|
|
+ number = comma(c(eggnog_anno, go_anno, cog_anno, pathway_anno), digits = 0),
|
|
|
+ percentage = percent(c(eggnog_anno, go_anno, cog_anno, pathway_anno)/total_gene)
|
|
|
+)
|
|
|
+
|
|
|
+write.table(anno_stat, "anno_stat.txt", quote = F, row.names = F, sep = "\t")
|