zhxd2
/
emcp


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
							#!/usr/bin/env Rscript

###############################################
# parse parameter 
###############################################
library(argparser, quietly=TRUE)

p <- arg_parser("make OrgDB from emapper")
p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")

argv <- parse_args(p)

# set script dir
script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])

###############################################
# test input 
###############################################
# argv <- list()
# argv$emapper_anno <- 'example_data/my.emapper.annotations'
# argv$proteins <- 'example_data/Sind.pep.fasta'
# script_dir <- '/home/zhxd/workspace/Genomics/43.ProteinFunctionAnnotation/emcp'

library(tidyverse, quietly = TRUE)
library(formattable, quietly = TRUE)
library(AnnotationForge, quietly = TRUE)
library(seqinr, quietly = TRUE)
library(clusterProfiler, quietly = TRUE)


###############################################
# parse parameter 
###############################################
emapper <- read_delim(argv$emapper_anno, 
                      "\t", escape_double = FALSE, col_names = FALSE, 
                      comment = "#", trim_ws = TRUE) %>%
  dplyr::select(GID = X1, 
                COG = X10,
                Gene_Name = X11,
                Gene_Symbol = X12,
                GO = X13,
                KO = X15,
                Pathway = X16
                )

###############################################
# make OrgDB 
###############################################
# gene name
gene_info <- dplyr::select(emapper,  GID, Gene_Name) %>%
  dplyr::filter(!is.na(Gene_Name)) %>%
  dplyr::filter(Gene_Name != '-') 
  eggnog_anno = length(gene_info$GID)

# gene to gene ontology
gene2go <- dplyr::select(emapper, GID, GO) %>%
  separate_rows(GO, sep = ',', convert = F) %>%
  filter(!is.na(GO)) %>%
  filter(str_detect(GO, '^GO')) %>%
  mutate(EVIDENCE = 'IEA') 
  
# make org package
makeOrgPackage(gene_info=gene_info,
               go=gene2go,
               maintainer='zhangsan <zhangsan@genek.cn>',
               author='zhangsan',
               outputDir="./",
               tax_id=0000,
               genus='M',
               species='y',
               goTable="go",
               version="1.0")

# build package
pkgbuild::build('.//org.My.eg.db', dest_path = ".")

# install package  
dir.create('R_Library', recursive = T)
install.packages('org.My.eg.db_1.0.tar.gz',
                 repos = NULL, #从本地安装
                 lib = 'R_Library') # 安装文件夹
library(org.My.eg.db, lib = 'R_Library')

###############################################
# GO statistics and plot
###############################################
all_gene <- getName.list(read.fasta(file = argv$proteins, 
                                    seqtype = 'AA'))
go_anno = length(unique(gene2go$GID))
go_bp <- clusterProfiler::groupGO(gene     = all_gene,
                 OrgDb    = org.My.eg.db,
                 keyType  = "GID",
                 ont      = "BP",
                 level    = 2,
                 readable = FALSE)
  
go_bp <- as.data.frame(go_bp)
go_bp$GO_Class <- "Biological Process"
  
go_cc <- clusterProfiler::groupGO(gene     = all_gene,
                 OrgDb    = org.My.eg.db,
                 keyType  = "GID",
                 ont      = "CC",
                 level    = 2,
                 readable = FALSE)
  
go_cc <- as.data.frame(go_cc)
go_cc$GO_Class <- "Cellular Component"
  
go_mf <- clusterProfiler::groupGO(gene     = all_gene,
                 OrgDb    = org.My.eg.db,
                 keyType  = "GID",
                 ont      = "MF",
                 level    = 2,
                 readable = FALSE)
go_mf <- as.data.frame(go_mf)
go_mf$GO_Class <- "Molecular Function"
  
go_all <- rbind(go_bp, go_cc, go_mf)
write.table(go_all, "go.txt", sep = "\t", quote = F)
p <- ggplot(go_all) + 
  geom_bar(aes(x = Description, 
               y = Count,
               fill = GO_Class),
           stat = "identity") + 
  facet_wrap(~GO_Class, scales = "free_x") + 
  labs(title = "GO function classification", y = "Number of genes") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
        axis.title.x = element_blank(),
        legend.position = "none")
  
ggsave("go.pdf", p, width = 20, height = 7)
  
###############################################
# Pathway statistics and plot 
###############################################
gene2pathway <- dplyr::select(emapper, GID, Pathway) %>%
  separate_rows(Pathway, sep = ',', convert = F) %>%
  filter(str_detect(Pathway, 'ko'))
  
load(file = paste(script_dir, "kegg_info.RData", sep = "/"))
  
gene2pathway <- gene2pathway %>%
  left_join(pathway2name, by = "Pathway") %>%
  dplyr::select(GID, Pathway, Pathway_Name, Pathway_Class, Pathway_Subclass) %>%
  distinct() %>%
  na.omit()
  
pathway_anno = length(unique(gene2pathway$GID))
  
pathway_stat <- dplyr::select(gene2pathway, GID, Pathway_Class, Pathway_Subclass) %>% 
  distinct() %>% 
  group_by(Pathway_Class, Pathway_Subclass) %>%
  summarise(Count = n(), Percentage = percent(n()/pathway_anno))
  
pathway_stat$Pathway_Subclass <- ordered(pathway_stat$Pathway_Subclass, levels = pathway_stat$Pathway_Subclass) 
  
p <- ggplot(pathway_stat, aes(x = Pathway_Subclass, y = Percentage)) +
  geom_bar(aes(fill = Pathway_Class), stat = 'identity') +
  geom_text(aes(label = Count), nudge_y = 0.005) +
  scale_y_continuous(labels=percent) + 
  labs(y = "Percent of genes(%)", x ="", fill = "Class") +
  coord_flip() +
  theme_classic()
  
ggsave("pathway.pdf", p, width = 20, height = 7)
write.table(gene2pathway, file = "pathway.txt", sep = "\t", quote = F)
write.table(pathway2name, file = 'pathway_name.txt', sep = '\t', quote = F)
write.table(pathway_stat, file = "pathway_stat.txt", sep = "\t", quote = F, row.names = F)
  

###############################################
# COG statistics and plot 
###############################################  
cog_funclass <- read_delim(paste(script_dir, "cog_funclass.tab", sep = "/"), 
                             "\t", escape_double = FALSE, trim_ws = TRUE)
  
insert_comma <- function(x){
  str_c(x, sep = '', collapse = ',')
}
  
gene2cog <- dplyr::select(emapper, GID, COG) %>%
  filter(!is.na(COG)) %>%
  filter(COG != '-') %>%
  mutate(COG = sapply(str_split(COG, ''), insert_comma)) %>%
  separate_rows(COG, sep = ',', convert = F) %>%
  left_join(cog_funclass, by = c('COG' = 'COG'))
  
cog_anno = length(unique(gene2cog$GID))
  
write.table(gene2cog, file = "cog.txt", sep = "\t", quote = F, row.names = F)
  
p <- ggplot(data = gene2cog) + 
  geom_bar(aes(x = COG, 
               fill = COG_Name)) +
  labs(title = "COG/KOG Function Classification ", 
       x = "",
       y = "Number of genes") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        legend.key.size=unit(1,"line"),
        legend.text = element_text(size = 7.5)) +
  guides(fill=guide_legend(ncol=1))
ggsave("cog.pdf", p, width = 16, height = 7)

###############################################
# number and percentage 
###############################################  
total_gene = length(all_gene)
anno_stat <- tibble(
  database = c("EggNOG", "GO", "COG/KOG", "KEGG Pathway"),
  number = comma(c(eggnog_anno, go_anno, cog_anno, pathway_anno), digits = 0),
  percentage = percent(c(eggnog_anno, go_anno, cog_anno, pathway_anno)/total_gene)
)
  
write.table(anno_stat, "anno_stat.txt", quote = F, row.names = F, sep = "\t")