浏览代码

增加统计结果

User 1 月之前
父节点
当前提交
c4fc257d64
共有 1 个文件被更改,包括 118 次插入0 次删除
  1. 118 0
      emapperx_split.R

+ 118 - 0
emapperx_split.R

@@ -99,3 +99,121 @@ install.packages('org.My.eg.db_1.0.tar.gz',
                  lib = 'R_Library') # 安装文件夹
 library(org.My.eg.db, lib = 'R_Library')
 
+###############################################
+# GO statistics and plot
+###############################################
+all_gene <- getName.list(read.fasta(file = argv$proteins, 
+                                    seqtype = 'AA'))
+go_anno = length(unique(gene2go$GID))
+go <- clusterProfiler::groupGO(gene     = all_gene,
+                 OrgDb    = org.My.eg.db,
+                 keyType  = "GID",
+                 ont      = argv$ontology,
+                 level    = 2,
+                 readable = FALSE)
+  
+go <- as.data.frame(go)
+go$GO_Class <- argv$ontology
+  
+go_all <- go
+
+write.table(go_all, "go.txt", sep = "\t", quote = F)
+
+p <- ggplot(go_all) + 
+  geom_bar(aes(x = Description, 
+               y = Count,
+               fill = GO_Class),
+           stat = "identity") + 
+  labs(title = "GO function classification", y = "Number of genes") +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5),
+        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
+        axis.title.x = element_blank(),
+        legend.position = "none")
+  
+ggsave("go.pdf", p, width = 20, height = 7)
+  
+###############################################
+# Pathway statistics and plot 
+###############################################
+gene2pathway <- dplyr::select(emapper, GID, Pathway) %>%
+  separate_rows(Pathway, sep = ',', convert = F) %>%
+  filter(str_detect(Pathway, 'ko'))
+  
+load(file = paste(script_dir, "kegg_info.RData", sep = "/"))
+  
+gene2pathway <- gene2pathway %>%
+  left_join(pathway2name, by = "Pathway") %>%
+  dplyr::select(GID, Pathway, Pathway_Name, Pathway_Class, Pathway_Subclass) %>%
+  distinct() %>%
+  na.omit()
+  
+pathway_anno = length(unique(gene2pathway$GID))
+  
+pathway_stat <- dplyr::select(gene2pathway, GID, Pathway_Class, Pathway_Subclass) %>% 
+  distinct() %>% 
+  group_by(Pathway_Class, Pathway_Subclass) %>%
+  summarise(Count = n(), Percentage = percent(n()/pathway_anno))
+  
+pathway_stat$Pathway_Subclass <- ordered(pathway_stat$Pathway_Subclass, levels = pathway_stat$Pathway_Subclass) 
+  
+p <- ggplot(pathway_stat, aes(x = Pathway_Subclass, y = Percentage)) +
+  geom_bar(aes(fill = Pathway_Class), stat = 'identity') +
+  geom_text(aes(label = Count), nudge_y = 0.005) +
+  scale_y_continuous(labels=percent) + 
+  labs(y = "Percent of genes(%)", x ="", fill = "Class") +
+  coord_flip() +
+  theme_classic()
+  
+ggsave("pathway.pdf", p, width = 20, height = 7)
+write.table(gene2pathway, file = "pathway.txt", sep = "\t", quote = F)
+write.table(pathway2name, file = 'pathway_name.txt', sep = '\t', quote = F)
+write.table(pathway_stat, file = "pathway_stat.txt", sep = "\t", quote = F, row.names = F)
+  
+
+###############################################
+# COG statistics and plot 
+###############################################  
+cog_funclass <- read_delim(paste(script_dir, "cog_funclass.tab", sep = "/"), 
+                             "\t", escape_double = FALSE, trim_ws = TRUE)
+  
+insert_comma <- function(x){
+  str_c(x, sep = '', collapse = ',')
+}
+  
+gene2cog <- dplyr::select(emapper, GID, COG) %>%
+  filter(!is.na(COG)) %>%
+  filter(COG != '-') %>%
+  mutate(COG = sapply(str_split(COG, ''), insert_comma)) %>%
+  separate_rows(COG, sep = ',', convert = F) %>%
+  left_join(cog_funclass, by = c('COG' = 'COG'))
+  
+cog_anno = length(unique(gene2cog$GID))
+  
+write.table(gene2cog, file = "cog.txt", sep = "\t", quote = F, row.names = F)
+  
+p <- ggplot(data = gene2cog) + 
+  geom_bar(aes(x = COG, 
+               fill = COG_Name)) +
+  labs(title = "COG/KOG Function Classification ", 
+       x = "",
+       y = "Number of genes") +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5),
+        legend.title = element_blank(),
+        legend.key.size=unit(1,"line"),
+        legend.text = element_text(size = 7.5)) +
+  guides(fill=guide_legend(ncol=1))
+ggsave("cog.pdf", p, width = 16, height = 7)
+
+###############################################
+# number and percentage 
+###############################################  
+total_gene = length(all_gene)
+anno_stat <- tibble(
+  database = c("EggNOG", "GO", "COG/KOG", "KEGG Pathway"),
+  number = comma(c(eggnog_anno, go_anno, cog_anno, pathway_anno), digits = 0),
+  percentage = percent(c(eggnog_anno, go_anno, cog_anno, pathway_anno)/total_gene)
+)
+  
+write.table(anno_stat, "anno_stat.txt", quote = F, row.names = F, sep = "\t")