tripal_phylogeny.taxonomy_import.inc 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. <?php
  2. /**
  3. *
  4. */
  5. function tripal_phylogeny_taxonomy_load_form($form, &$form_state) {
  6. $form['instructions'] = array(
  7. '#type' => 'item',
  8. '#markup' => t('The NCBI Taxonmic Importer examines the organisms
  9. currently present in the database and queries NCBI for the
  10. taxonomic details. If the importer is able to match the
  11. genus and species with NCBI the species details will be imported,
  12. and a page containing the taxonomic tree will be created.'),
  13. );
  14. $form['import_ncbi'] = array(
  15. '#type' => 'submit',
  16. '#name' => 'import_sync',
  17. '#value' => 'Import NCBI taxonomic data'
  18. );
  19. return $form;
  20. }
  21. /**
  22. *
  23. * @param unknown $form
  24. * @param unknown $form_state
  25. */
  26. function tripal_phylogeny_taxonomy_load_form_submit($form, &$form_state) {
  27. global $user;
  28. $sync = $form_state['values']['do_sync'];
  29. $args = array();
  30. tripal_add_job("Import NCBI Taxonomy", 'tripal_phylogeny',
  31. 'tripal_phylogeny_ncbi_taxonomy_import', $args, $user->uid);
  32. }
  33. /**
  34. *
  35. * @param unknown $job_id
  36. */
  37. function tripal_phylogeny_ncbi_taxonomy_import($job_id) {
  38. print "\nNOTE: Importing of NCBI taxonomy data is performed using a database transaction. \n" .
  39. "If the load fails or is terminated prematurely then the entire set of \n" .
  40. "insertions/updates is rolled back and will not be found in the database\n\n";
  41. $transaction = db_transaction();
  42. try {
  43. // TDDO: there should be an API function named tripal_insert_analysis().
  44. // But until then we have to insert the analysis manually.
  45. // Get the version of this module for the analysis record:
  46. $info = system_get_info('module', 'tripal_phylogeny');
  47. $version = $info['version'];
  48. $analysis_name = 'NCBI Taxonomy Tree Import';
  49. // If the analysis record already exists then don't add it again.
  50. $analysis = chado_select_record('analysis', array('*'), array('name' => $analysis_name));
  51. if (count($analysis) == 0) {
  52. $values = array(
  53. 'name' => 'NCBI Taxonomy Tree Import',
  54. 'description' => 'Used to import NCBI taxonomy details for organisms in this database.',
  55. 'program' => 'Tripal Phylogeny Module NCBI Taxonomy Importer',
  56. 'programversion' => $version,
  57. 'sourcename' => 'NCBI Taxonomy',
  58. 'sourceuri' => 'http://www.ncbi.nlm.nih.gov/taxonomy',
  59. );
  60. $analysis = chado_insert_record('analysis', $values);
  61. if (!$analysis) {
  62. throw new Exception("Cannot add NCBI Taxonomy Tree Import Analysis.");
  63. }
  64. }
  65. else {
  66. $analysis = $analysis[0];
  67. }
  68. // If the tree already exists then don't insert it again.
  69. $tree_name = 'NCBI Taxonomy Tree';
  70. $phylotree = chado_select_record('phylotree', array('*'), array('name' => $tree_name));
  71. if (count($phylotree) == 0) {
  72. // Add the taxonomic tree.
  73. $options = array(
  74. 'name' => 'NCBI Taxonomy Tree',
  75. 'description' => 'The Taxonomy Database is a curated classification and nomenclature for all of the organisms in the public sequence databases.',
  76. 'leaf_type' => 'taxonomy',
  77. 'analysis_id' => $analysis->analysis_id,
  78. 'tree_file' => '/dev/null',
  79. 'format' => 'taxonomy',
  80. 'no_load' => TRUE,
  81. );
  82. $errors = array();
  83. $warnings = array();
  84. $success = tripal_insert_phylotree($options, $errors, $warnings);
  85. if (!$success) {
  86. throw new Exception("Cannot add the NCBI Taxonomy Tree phylotree record.");
  87. }
  88. $phylotree = (object) $options;
  89. }
  90. else {
  91. $phylotree = $phylotree[0];
  92. }
  93. // Clean out the phylotree in the event this is a reload
  94. chado_delete_record('phylonode', array('phylotree_id' => $phylotree->phylotree_id));
  95. // The taxonomic tree must have a root, so create that first.
  96. $tree = array(
  97. 'name' => 'root',
  98. 'depth' => 0,
  99. 'is_root' => 1,
  100. 'is_leaf' => 0,
  101. 'is_internal' => 0,
  102. 'left_index' => 0,
  103. 'right_index' => 0,
  104. 'branch_set' => array(),
  105. );
  106. // Get the "rank" cvterm. It requires that the TAXRANK vocabulary is loaded.
  107. $rank_cvterm = tripal_get_cvterm(array(
  108. 'name' => 'rank',
  109. 'cv_id' => array('name' => 'tripal_phylogeny')
  110. ));
  111. // Get the list of orgnaisms
  112. $sql = "SELECT O.* FROM {organism} O";
  113. $organisms = chado_query($sql);
  114. while ($organism = $organisms->fetchObject()) {
  115. // Build the query string to get the information about this species.
  116. $term = $organism->genus . ' ' . $organism->species;
  117. $term = urlencode($term);
  118. $search_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?".
  119. "db=taxonomy" .
  120. "&term=$term";
  121. // Get the search response from NCBI.
  122. $rfh = fopen($search_url, "r");
  123. $xml_text = '';
  124. while (!feof($rfh)) {
  125. $xml_text .= fread($rfh, 255);
  126. }
  127. fclose($rfh);
  128. // Parse the XML to get the taxonomy ID
  129. $xml = new SimpleXMLElement($xml_text);
  130. if ($xml) {
  131. $taxid = (string) $xml->IdList->Id;
  132. if ($taxid) {
  133. print "$taxid\t$organism->genus $organism->species\n";
  134. // If we have a taxonomy ID we can now get the details.
  135. $fetch_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?".
  136. "db=taxonomy" .
  137. "&id=$taxid";
  138. // Get the search response from NCBI.
  139. $rfh = fopen($fetch_url, "r");
  140. $xml_text = '';
  141. while (!feof($rfh)) {
  142. $xml_text .= fread($rfh, 255);
  143. }
  144. fclose($rfh);
  145. $xml = new SimpleXMLElement($xml_text);
  146. if ($xml) {
  147. $taxon = $xml->Taxon;
  148. // Add in the organism properties
  149. $lineage = (string) $taxon->Lineage;
  150. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'lineage', $lineage);
  151. $genetic_code = (string) $taxon->GeneticCode->GCId;
  152. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'genetic_code', $genetic_code);
  153. $genetic_code_name = (string) $taxon->GeneticCode->GCName;
  154. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'genetic_code_name', $genetic_code_name);
  155. $mito_genetic_code = (string) $taxon->MitoGeneticCode->MGCId;
  156. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'mitochondrial_genetic_code', $mito_genetic_code);
  157. $mito_genetic_code_name = (string) $taxon->MitoGeneticCode->MGCName;
  158. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'mitochondrial_genetic_code_name', $mito_genetic_code_name);
  159. $division = (string) $taxon->Division;
  160. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'division', $division);
  161. $name_ranks = array();
  162. foreach ($taxon->OtherNames->children() as $child) {
  163. $type = $child->getName();
  164. $name = (string) $child;
  165. if (!array_key_exists($type, $name_ranks)) {
  166. $name_ranks[$type] = 0;
  167. }
  168. switch ($type) {
  169. case 'GenbankCommonName':
  170. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'genbank_common_name', $name, $name_ranks[$type]);
  171. break;
  172. case 'Synonym':
  173. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'synonym', $name, $name_ranks[$type]);
  174. break;
  175. case 'CommonName':
  176. case 'Includes':
  177. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'other_name', $name, $name_ranks[$type]);
  178. break;
  179. case 'EquivalentName':
  180. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'equivalent_name', $name, $name_ranks[$type]);
  181. break;
  182. case 'Anamorph':
  183. tripal_phylogeny_taxonomy_add_organism_property($organism->organism_id, 'anamorph', $name, $name_ranks[$type]);
  184. break;
  185. case 'Name':
  186. // skip the Name stanza
  187. break;
  188. default:
  189. print "NOTICE: Skipping unrecognzed name type: $type\n";
  190. // do nothing for unrecognized types
  191. }
  192. $name_ranks[$type]++;
  193. }
  194. // Generate a nested array structure that can be used for importing the tree.
  195. $parent = (string) $taxon->ParentTaxId;
  196. $rank = (string) $taxon->Rank;
  197. $sci_name = (string) $taxon->ScientificName;
  198. $lineage_depth = preg_split('/;\s*/', $lineage);
  199. $parent = $tree;
  200. $i = 1;
  201. foreach ($taxon->LineageEx->children() as $child) {
  202. $tid = (string) $child->TaxID;
  203. $name = (string) $child->ScientificName;
  204. $node_rank = (string) $child->Rank;
  205. $node = array(
  206. 'name' => $name,
  207. 'depth' => $i,
  208. 'is_root' => 0,
  209. 'is_leaf' => 0,
  210. 'is_internal' => 1,
  211. 'left_index' => 0,
  212. 'right_index' => 0,
  213. 'parent' => $parent,
  214. 'branch_set' => array(),
  215. 'parent' => $parent['name'],
  216. 'properties' => array(
  217. $rank_cvterm->cvterm_id => $node_rank,
  218. ),
  219. );
  220. $parent = $node;
  221. tripal_phylogeny_taxonomy_import_add_node($tree, $node, $lineage_depth);
  222. $i++;
  223. }
  224. // Now add in the leaf node
  225. $node = array(
  226. 'name' => $sci_name,
  227. 'depth' => $i,
  228. 'is_root' => 0,
  229. 'is_leaf' => 1,
  230. 'is_internal' => 0,
  231. 'left_index' => 0,
  232. 'right_index' => 0,
  233. 'parent' => $parent['name'],
  234. 'organism_id' => $organism->organism_id,
  235. 'properties' => array(
  236. $rank_cvterm->cvterm_id => $rank,
  237. ),
  238. );
  239. tripal_phylogeny_taxonomy_import_add_node($tree, $node, $lineage_depth);
  240. // Set the indecies for the tree.
  241. tripal_phylogeny_assign_tree_indices($tree);
  242. } // end: if ($xml) { ...
  243. } // end: if ($taxid) { ...
  244. } // end: if ($xml) { ...
  245. } // end: while ($organism = $organisms->fetchObject()) { ...
  246. // print json_encode(($tree));
  247. // Now add the tree
  248. $options = array('taxonomy' => 1);
  249. tripal_phylogeny_import_tree($tree, $phylotree, $options);
  250. // If ther user requested to sync the tree then do it.
  251. //if ($sync) {
  252. chado_node_sync_records('phylotree', FALSE, FALSE,
  253. array(), $ids = array($phylotree->phylotree_id));
  254. //}
  255. }
  256. catch (Exception $e) {
  257. $transaction->rollback();
  258. print "\n"; // make sure we start errors on new line
  259. watchdog_exception('tripal_phylogeny', $e);
  260. print "FAILED: Rolling back database changes...\n";
  261. }
  262. }
  263. /**
  264. *
  265. * @param unknown $node
  266. */
  267. function tripal_phylogeny_taxonomy_import_add_node(&$tree, $node, $lineage_depth) {
  268. // Get the branch set for the tree root.
  269. $branch_set = &$tree['branch_set'];
  270. // Iterate through the tree up until the depth where this node will
  271. // be placed.
  272. $node_depth = $node['depth'];
  273. for ($i = 1; $i <= $node_depth; $i++) {
  274. // Iterate through any existing nodes in the branch set to see if
  275. // the node name matches the correct name for the lineage at this
  276. // depth. If it matches then it is inside of this branch set that
  277. // we will place the node.
  278. for ($j = 0; $j < count($branch_set); $j++) {
  279. // If this node already exists in the tree then return.
  280. if ($branch_set[$j]['name'] == $node['name'] and
  281. $branch_set[$j]['depth'] = $node['depth']) {
  282. return;
  283. }
  284. // Otherwise, set the branch to be the current branch and continue.
  285. if ($branch_set[$j]['name'] == $lineage_depth[$i-1]) {
  286. $branch_set = &$branch_set[$j]['branch_set'];
  287. break;
  288. }
  289. }
  290. }
  291. // Add the node to the last branch set. This should be where this node goes.
  292. $branch_set[] = $node;
  293. }
  294. /**
  295. *
  296. * @param unknown $organism_id
  297. * @param unknown $term_name
  298. * @param unknown $value
  299. */
  300. function tripal_phylogeny_taxonomy_add_organism_property($organism_id, $term_name, $value, $rank = 0) {
  301. if (!$value) {
  302. return;
  303. }
  304. $record = array(
  305. 'table' => 'organism',
  306. 'id' => $organism_id
  307. );
  308. $property = array(
  309. 'type_name' => $term_name,
  310. 'cv_name' => organism_property,
  311. 'value' => $value
  312. );
  313. // Delete all properties of this type if the rank is zero.
  314. if ($rank == 0) {
  315. chado_delete_property($record, $property);
  316. }
  317. chado_insert_property($record, $property);
  318. }