emapperx_split.R 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. #!/usr/bin/env Rscript
  2. ###############################################
  3. # parse parameter
  4. ###############################################
  5. library(argparser, quietly=TRUE)
  6. p <- arg_parser("make OrgDB from emapper")
  7. p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
  8. p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
  9. p <- add_argument(p, "ontology", help="GO ontology type (MF, BP, or CC)",
  10. default="BP", type="character")
  11. argv <- parse_args(p)
  12. # set script dir
  13. script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
  14. ###############################################
  15. # test input
  16. ###############################################
  17. # argv <- list()
  18. # argv$emapper_anno <- 'out.emapper.annotations'
  19. # argv$proteins <- 'proteins.fa'
  20. # script_dir <- 'emcp'
  21. library(tidyverse, quietly = TRUE)
  22. library(formattable, quietly = TRUE)
  23. library(AnnotationForge, quietly = TRUE)
  24. library(seqinr, quietly = TRUE)
  25. library(clusterProfiler, quietly = TRUE)
  26. library(GO.db, quietly = TRUE)
  27. ###############################################
  28. # parse parameter
  29. ###############################################
  30. emapper <- read_delim(argv$emapper_anno,
  31. "\t", escape_double = FALSE, col_names = FALSE,
  32. comment = "#", trim_ws = TRUE) %>%
  33. dplyr::select(GID = X1,
  34. COG = X7,
  35. Gene_Name = X8,
  36. Gene_Symbol = X9,
  37. GO = X10,
  38. KO = X12,
  39. Pathway = X13
  40. )
  41. ###############################################
  42. # make OrgDB
  43. ###############################################
  44. # gene name
  45. gene_info <- dplyr::select(emapper, GID, Gene_Name) %>%
  46. dplyr::filter(!is.na(Gene_Name)) %>%
  47. dplyr::filter(Gene_Name != '-')
  48. eggnog_anno = length(gene_info$GID)
  49. # gene to gene ontology
  50. gene2go <- dplyr::select(emapper, GID, GO) %>%
  51. separate_rows(GO, sep = ',', convert = F) %>%
  52. filter(!is.na(GO)) %>%
  53. filter(str_detect(GO, '^GO')) %>%
  54. mutate(EVIDENCE = 'IEA')
  55. go_terms <- AnnotationDbi::select(GO.db,
  56. keys = unique(gene2go$GO),
  57. columns = "ONTOLOGY",
  58. keytype = "GOID")
  59. # 分别处理三种GO类型
  60. message("拆分GO数据...")
  61. gene2go <- gene2go %>%
  62. left_join(go_terms, by = c("GO" = "GOID")) %>%
  63. filter(ONTOLOGY == argv$ontology) %>%
  64. dplyr::select(GID, GO, EVIDENCE) %>%
  65. distinct()
  66. # make org package
  67. makeOrgPackage(gene_info=gene_info,
  68. go=gene2go,
  69. maintainer='zhangsan <zhangsan@genek.cn>',
  70. author='zhangsan',
  71. outputDir="./",
  72. tax_id=0000,
  73. genus='M',
  74. species='y',
  75. goTable="go",
  76. version="1.0")
  77. # build package
  78. pkgbuild::build('.//org.My.eg.db', dest_path = ".")
  79. # install package
  80. dir.create('R_Library', recursive = T)
  81. install.packages('org.My.eg.db_1.0.tar.gz',
  82. repos = NULL, #从本地安装
  83. lib = 'R_Library') # 安装文件夹
  84. library(org.My.eg.db, lib = 'R_Library')
  85. ###############################################
  86. # GO statistics and plot
  87. ###############################################
  88. all_gene <- getName.list(read.fasta(file = argv$proteins,
  89. seqtype = 'AA'))
  90. go_anno = length(unique(gene2go$GID))
  91. go <- clusterProfiler::groupGO(gene = all_gene,
  92. OrgDb = org.My.eg.db,
  93. keyType = "GID",
  94. ont = argv$ontology,
  95. level = 2,
  96. readable = FALSE)
  97. go <- as.data.frame(go)
  98. go$GO_Class <- argv$ontology
  99. go_all <- go
  100. write.table(go_all, "go.txt", sep = "\t", quote = F)
  101. p <- ggplot(go_all) +
  102. geom_bar(aes(x = Description,
  103. y = Count,
  104. fill = GO_Class),
  105. stat = "identity") +
  106. labs(title = "GO function classification", y = "Number of genes") +
  107. theme_classic() +
  108. theme(plot.title = element_text(hjust = 0.5),
  109. axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
  110. axis.title.x = element_blank(),
  111. legend.position = "none")
  112. ggsave("go.pdf", p, width = 20, height = 7)
  113. ###############################################
  114. # Pathway statistics and plot
  115. ###############################################
  116. gene2pathway <- dplyr::select(emapper, GID, Pathway) %>%
  117. separate_rows(Pathway, sep = ',', convert = F) %>%
  118. filter(str_detect(Pathway, 'ko'))
  119. load(file = paste(script_dir, "kegg_info.RData", sep = "/"))
  120. gene2pathway <- gene2pathway %>%
  121. left_join(pathway2name, by = "Pathway") %>%
  122. dplyr::select(GID, Pathway, Pathway_Name, Pathway_Class, Pathway_Subclass) %>%
  123. distinct() %>%
  124. na.omit()
  125. pathway_anno = length(unique(gene2pathway$GID))
  126. pathway_stat <- dplyr::select(gene2pathway, GID, Pathway_Class, Pathway_Subclass) %>%
  127. distinct() %>%
  128. group_by(Pathway_Class, Pathway_Subclass) %>%
  129. summarise(Count = n(), Percentage = percent(n()/pathway_anno))
  130. pathway_stat$Pathway_Subclass <- ordered(pathway_stat$Pathway_Subclass, levels = pathway_stat$Pathway_Subclass)
  131. p <- ggplot(pathway_stat, aes(x = Pathway_Subclass, y = Percentage)) +
  132. geom_bar(aes(fill = Pathway_Class), stat = 'identity') +
  133. geom_text(aes(label = Count), nudge_y = 0.005) +
  134. scale_y_continuous(labels=percent) +
  135. labs(y = "Percent of genes(%)", x ="", fill = "Class") +
  136. coord_flip() +
  137. theme_classic()
  138. ggsave("pathway.pdf", p, width = 20, height = 7)
  139. write.table(gene2pathway, file = "pathway.txt", sep = "\t", quote = F)
  140. write.table(pathway2name, file = 'pathway_name.txt', sep = '\t', quote = F)
  141. write.table(pathway_stat, file = "pathway_stat.txt", sep = "\t", quote = F, row.names = F)
  142. ###############################################
  143. # COG statistics and plot
  144. ###############################################
  145. cog_funclass <- read_delim(paste(script_dir, "cog_funclass.tab", sep = "/"),
  146. "\t", escape_double = FALSE, trim_ws = TRUE)
  147. insert_comma <- function(x){
  148. str_c(x, sep = '', collapse = ',')
  149. }
  150. gene2cog <- dplyr::select(emapper, GID, COG) %>%
  151. filter(!is.na(COG)) %>%
  152. filter(COG != '-') %>%
  153. mutate(COG = sapply(str_split(COG, ''), insert_comma)) %>%
  154. separate_rows(COG, sep = ',', convert = F) %>%
  155. left_join(cog_funclass, by = c('COG' = 'COG'))
  156. cog_anno = length(unique(gene2cog$GID))
  157. write.table(gene2cog, file = "cog.txt", sep = "\t", quote = F, row.names = F)
  158. p <- ggplot(data = gene2cog) +
  159. geom_bar(aes(x = COG,
  160. fill = COG_Name)) +
  161. labs(title = "COG/KOG Function Classification ",
  162. x = "",
  163. y = "Number of genes") +
  164. theme_classic() +
  165. theme(plot.title = element_text(hjust = 0.5),
  166. legend.title = element_blank(),
  167. legend.key.size=unit(1,"line"),
  168. legend.text = element_text(size = 7.5)) +
  169. guides(fill=guide_legend(ncol=1))
  170. ggsave("cog.pdf", p, width = 16, height = 7)
  171. ###############################################
  172. # number and percentage
  173. ###############################################
  174. total_gene = length(all_gene)
  175. anno_stat <- tibble(
  176. database = c("EggNOG", "GO", "COG/KOG", "KEGG Pathway"),
  177. number = comma(c(eggnog_anno, go_anno, cog_anno, pathway_anno), digits = 0),
  178. percentage = percent(c(eggnog_anno, go_anno, cog_anno, pathway_anno)/total_gene)
  179. )
  180. write.table(anno_stat, "anno_stat.txt", quote = F, row.names = F, sep = "\t")