emapperx.R 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. #!/usr/bin/env Rscript
  2. ###############################################
  3. # parse parameter
  4. ###############################################
  5. library(argparser, quietly=TRUE)
  6. p <- arg_parser("make OrgDB from emapper")
  7. p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
  8. p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
  9. argv <- parse_args(p)
  10. # set script dir
  11. script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
  12. ###############################################
  13. # test input
  14. ###############################################
  15. # argv <- list()
  16. # argv$emapper_anno <- 'example_data/my.emapper.annotations'
  17. # argv$proteins <- 'example_data/Sind.pep.fasta'
  18. # script_dir <- '/home/zhxd/workspace/Genomics/43.ProteinFunctionAnnotation/emcp'
  19. library(tidyverse, quietly = TRUE)
  20. library(formattable, quietly = TRUE)
  21. library(AnnotationForge, quietly = TRUE)
  22. library(seqinr, quietly = TRUE)
  23. library(clusterProfiler, quietly = TRUE)
  24. ###############################################
  25. # parse parameter
  26. ###############################################
  27. emapper <- read_delim(argv$emapper_anno,
  28. "\t", escape_double = FALSE, col_names = FALSE,
  29. comment = "#", trim_ws = TRUE) %>%
  30. dplyr::select(GID = X1,
  31. COG = X10,
  32. Gene_Name = X11,
  33. Gene_Symbol = X12,
  34. GO = X13,
  35. KO = X15,
  36. Pathway = X16
  37. )
  38. ###############################################
  39. # make OrgDB
  40. ###############################################
  41. # gene name
  42. gene_info <- dplyr::select(emapper, GID, Gene_Name) %>%
  43. dplyr::filter(!is.na(Gene_Name)) %>%
  44. dplyr::filter(Gene_Name != '-')
  45. eggnog_anno = length(gene_info$GID)
  46. # gene to gene ontology
  47. gene2go <- dplyr::select(emapper, GID, GO) %>%
  48. separate_rows(GO, sep = ',', convert = F) %>%
  49. filter(!is.na(GO)) %>%
  50. filter(str_detect(GO, '^GO')) %>%
  51. mutate(EVIDENCE = 'IEA')
  52. # make org package
  53. makeOrgPackage(gene_info=gene_info,
  54. go=gene2go,
  55. maintainer='zhangsan <zhangsan@genek.cn>',
  56. author='zhangsan',
  57. outputDir="./",
  58. tax_id=0000,
  59. genus='M',
  60. species='y',
  61. goTable="go",
  62. version="1.0")
  63. # build package
  64. pkgbuild::build('.//org.My.eg.db', dest_path = ".")
  65. # install package
  66. dir.create('R_Library', recursive = T)
  67. install.packages('org.My.eg.db_1.0.tar.gz',
  68. repos = NULL, #从本地安装
  69. lib = 'R_Library') # 安装文件夹
  70. library(org.My.eg.db, lib = 'R_Library')
  71. ###############################################
  72. # GO statistics and plot
  73. ###############################################
  74. all_gene <- getName.list(read.fasta(file = argv$proteins,
  75. seqtype = 'AA'))
  76. go_anno = length(unique(gene2go$GID))
  77. go_bp <- clusterProfiler::groupGO(gene = all_gene,
  78. OrgDb = org.My.eg.db,
  79. keyType = "GID",
  80. ont = "BP",
  81. level = 2,
  82. readable = FALSE)
  83. go_bp <- as.data.frame(go_bp)
  84. go_bp$GO_Class <- "Biological Process"
  85. go_cc <- clusterProfiler::groupGO(gene = all_gene,
  86. OrgDb = org.My.eg.db,
  87. keyType = "GID",
  88. ont = "CC",
  89. level = 2,
  90. readable = FALSE)
  91. go_cc <- as.data.frame(go_cc)
  92. go_cc$GO_Class <- "Cellular Component"
  93. go_mf <- clusterProfiler::groupGO(gene = all_gene,
  94. OrgDb = org.My.eg.db,
  95. keyType = "GID",
  96. ont = "MF",
  97. level = 2,
  98. readable = FALSE)
  99. go_mf <- as.data.frame(go_mf)
  100. go_mf$GO_Class <- "Molecular Function"
  101. go_all <- rbind(go_bp, go_cc, go_mf)
  102. write.table(go_all, "go.txt", sep = "\t", quote = F)
  103. p <- ggplot(go_all) +
  104. geom_bar(aes(x = Description,
  105. y = Count,
  106. fill = GO_Class),
  107. stat = "identity") +
  108. facet_wrap(~GO_Class, scales = "free_x") +
  109. labs(title = "GO function classification", y = "Number of genes") +
  110. theme_classic() +
  111. theme(plot.title = element_text(hjust = 0.5),
  112. axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
  113. axis.title.x = element_blank(),
  114. legend.position = "none")
  115. ggsave("go.pdf", p, width = 20, height = 7)
  116. ###############################################
  117. # Pathway statistics and plot
  118. ###############################################
  119. gene2pathway <- dplyr::select(emapper, GID, Pathway) %>%
  120. separate_rows(Pathway, sep = ',', convert = F) %>%
  121. filter(str_detect(Pathway, 'ko'))
  122. load(file = paste(script_dir, "kegg_info.RData", sep = "/"))
  123. gene2pathway <- gene2pathway %>%
  124. left_join(pathway2name, by = "Pathway") %>%
  125. dplyr::select(GID, Pathway, Pathway_Name, Pathway_Class, Pathway_Subclass) %>%
  126. distinct() %>%
  127. na.omit()
  128. pathway_anno = length(unique(gene2pathway$GID))
  129. pathway_stat <- dplyr::select(gene2pathway, GID, Pathway_Class, Pathway_Subclass) %>%
  130. distinct() %>%
  131. group_by(Pathway_Class, Pathway_Subclass) %>%
  132. summarise(Count = n(), Percentage = percent(n()/pathway_anno))
  133. pathway_stat$Pathway_Subclass <- ordered(pathway_stat$Pathway_Subclass, levels = pathway_stat$Pathway_Subclass)
  134. p <- ggplot(pathway_stat, aes(x = Pathway_Subclass, y = Percentage)) +
  135. geom_bar(aes(fill = Pathway_Class), stat = 'identity') +
  136. geom_text(aes(label = Count), nudge_y = 0.005) +
  137. scale_y_continuous(labels=percent) +
  138. labs(y = "Percent of genes(%)", x ="", fill = "Class") +
  139. coord_flip() +
  140. theme_classic()
  141. ggsave("pathway.pdf", p, width = 20, height = 7)
  142. write.table(gene2pathway, file = "pathway.txt", sep = "\t", quote = F)
  143. write.table(pathway2name, file = 'pathway_name.txt', sep = '\t', quote = F)
  144. write.table(pathway_stat, file = "pathway_stat.txt", sep = "\t", quote = F, row.names = F)
  145. ###############################################
  146. # COG statistics and plot
  147. ###############################################
  148. cog_funclass <- read_delim(paste(script_dir, "cog_funclass.tab", sep = "/"),
  149. "\t", escape_double = FALSE, trim_ws = TRUE)
  150. insert_comma <- function(x){
  151. str_c(x, sep = '', collapse = ',')
  152. }
  153. gene2cog <- dplyr::select(emapper, GID, COG) %>%
  154. filter(!is.na(COG)) %>%
  155. filter(COG != '-') %>%
  156. mutate(COG = sapply(str_split(COG, ''), insert_comma)) %>%
  157. separate_rows(COG, sep = ',', convert = F) %>%
  158. left_join(cog_funclass, by = c('COG' = 'COG'))
  159. cog_anno = length(unique(gene2cog$GID))
  160. write.table(gene2cog, file = "cog.txt", sep = "\t", quote = F, row.names = F)
  161. p <- ggplot(data = gene2cog) +
  162. geom_bar(aes(x = COG,
  163. fill = COG_Name)) +
  164. labs(title = "COG/KOG Function Classification ",
  165. x = "",
  166. y = "Number of genes") +
  167. theme_classic() +
  168. theme(plot.title = element_text(hjust = 0.5),
  169. legend.title = element_blank(),
  170. legend.key.size=unit(1,"line"),
  171. legend.text = element_text(size = 7.5)) +
  172. guides(fill=guide_legend(ncol=1))
  173. ggsave("cog.pdf", p, width = 16, height = 7)
  174. ###############################################
  175. # number and percentage
  176. ###############################################
  177. total_gene = length(all_gene)
  178. anno_stat <- tibble(
  179. database = c("EggNOG", "GO", "COG/KOG", "KEGG Pathway"),
  180. number = comma(c(eggnog_anno, go_anno, cog_anno, pathway_anno), digits = 0),
  181. percentage = percent(c(eggnog_anno, go_anno, cog_anno, pathway_anno)/total_gene)
  182. )
  183. write.table(anno_stat, "anno_stat.txt", quote = F, row.names = F, sep = "\t")