emapperx_split.R 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. #!/usr/bin/env Rscript
  2. ###############################################
  3. # parse parameter
  4. ###############################################
  5. library(argparser, quietly=TRUE)
  6. p <- arg_parser("make OrgDB from emapper")
  7. p <- add_argument(p, "emapper_anno", help="emapper annotation result", type="character")
  8. p <- add_argument(p, "proteins", help="proteins in fasta format", type="character")
  9. p <- add_argument(p, "ontology", help="GO ontology type (MF, BP, or CC)",
  10. default="BP", type="character")
  11. argv <- parse_args(p)
  12. # set script dir
  13. script_dir <- dirname(strsplit(commandArgs(trailingOnly = FALSE)[4],"=")[[1]][2])
  14. ###############################################
  15. # test input
  16. ###############################################
  17. # argv <- list()
  18. # argv$emapper_anno <- 'out.emapper.annotations'
  19. # argv$proteins <- 'proteins.fa'
  20. # script_dir <- 'emcp'
  21. library(tidyverse, quietly = TRUE)
  22. library(formattable, quietly = TRUE)
  23. library(AnnotationForge, quietly = TRUE)
  24. library(seqinr, quietly = TRUE)
  25. library(clusterProfiler, quietly = TRUE)
  26. library(GO.db, quietly = TRUE)
  27. ###############################################
  28. # parse parameter
  29. ###############################################
  30. emapper <- read_delim(argv$emapper_anno,
  31. "\t", escape_double = FALSE, col_names = FALSE,
  32. comment = "#", trim_ws = TRUE) %>%
  33. dplyr::select(GID = X1,
  34. COG = X7,
  35. Gene_Name = X8,
  36. Gene_Symbol = X9,
  37. GO = X10,
  38. KO = X12,
  39. Pathway = X13
  40. )
  41. ###############################################
  42. # make OrgDB
  43. ###############################################
  44. # gene name
  45. gene_info <- dplyr::select(emapper, GID, Gene_Name) %>%
  46. dplyr::filter(!is.na(Gene_Name)) %>%
  47. dplyr::filter(Gene_Name != '-')
  48. eggnog_anno = length(gene_info$GID)
  49. # gene to gene ontology
  50. gene2go <- dplyr::select(emapper, GID, GO) %>%
  51. separate_rows(GO, sep = ',', convert = F) %>%
  52. filter(!is.na(GO)) %>%
  53. filter(str_detect(GO, '^GO')) %>%
  54. mutate(EVIDENCE = 'IEA')
  55. go_terms <- AnnotationDbi::select(GO.db,
  56. keys = unique(gene2go$GO),
  57. columns = "ONTOLOGY",
  58. keytype = "GOID")
  59. # 分别处理三种GO类型
  60. message("拆分GO数据...")
  61. gene2go <- gene2go %>%
  62. left_join(go_terms, by = c("GO" = "GOID")) %>%
  63. filter(ONTOLOGY == argv$ontology) %>%
  64. dplyr::select(GID, GO, EVIDENCE) %>%
  65. distinct()
  66. # make org package
  67. makeOrgPackage(gene_info=gene_info,
  68. go=gene2go,
  69. maintainer='zhangsan <zhangsan@genek.cn>',
  70. author='zhangsan',
  71. outputDir="./",
  72. tax_id=0000,
  73. genus='M',
  74. species='y',
  75. goTable="go",
  76. version="1.0")
  77. # build package
  78. pkgbuild::build('.//org.My.eg.db', dest_path = ".")
  79. # install package
  80. dir.create('R_Library', recursive = T)
  81. install.packages('org.My.eg.db_1.0.tar.gz',
  82. repos = NULL, #从本地安装
  83. lib = 'R_Library') # 安装文件夹
  84. library(org.My.eg.db, lib = 'R_Library')