tripal_phylogeny.taxonomy.inc 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. <?php
  /**
   * Form builder for the NCBI taxonomy importer page.
   *
   * Builds a form that shows a short explanation of what the importer does
   * and a single submit button that starts the import.
   *
   * @param array $form
   *   The form array to add the elements to.
   * @param array $form_state
   *   The form state (not read by this builder).
   *
   * @return array
   *   The form array with the 'instructions' and 'import_ncbi' elements added.
   */
  function tripal_phylogeny_taxonomy_load_form($form, &$form_state) {
    $form['instructions'] = array(
      '#type' => 'item',
      '#markup' => t('The NCBI Taxonomic Importer examines the organisms
        currently present in the database and queries NCBI for the
        taxonomic details. If the importer is able to match the
        genus and species with NCBI the species details will be imported,
        and a page containing the taxonomic tree will be created.'),
    );

    $form['import_ncbi'] = array(
      '#type' => 'submit',
      '#name' => 'import_sync',
      // User-facing labels go through t() so they can be translated.
      '#value' => t('Import NCBI taxonomic data'),
    );

    return $form;
  }
  /**
   * Submit handler for tripal_phylogeny_taxonomy_load_form().
   *
   * Queues a Tripal job named "Import NCBI Taxonomy" whose callback is
   * tripal_phylogeny_ncbi_taxonomy_import(), owned by the current user.
   *
   * @param array $form
   *   The submitted form (not read).
   * @param array $form_state
   *   The form state (not read; the form defines no options to pass along).
   */
  function tripal_phylogeny_taxonomy_load_form_submit($form, &$form_state) {
    global $user;

    // No arguments are passed to the job callback.
    $args = array();
    tripal_add_job("Import NCBI Taxonomy", 'tripal_phylogeny',
      'tripal_phylogeny_ncbi_taxonomy_import', $args, $user->uid);
  }
  /**
   * Imports NCBI taxonomy details for every organism in Chado.
   *
   * For each record in the organism table the NCBI E-utilities are queried
   * (esearch to find the taxonomy ID for "genus species", then efetch for the
   * details). The lineage, genetic codes, division and alternate names are
   * stored as organism properties, and a nested array describing the
   * taxonomic tree is built up and finally handed to
   * tripal_phylogeny_import_tree().
   *
   * All database work happens inside one transaction: any exception rolls
   * the whole import back and is logged with watchdog_exception().
   *
   * @param int $job_id
   *   The ID of the job running this import (not used by the function body).
   */
  function tripal_phylogeny_ncbi_taxonomy_import($job_id) {

    print "\nNOTE: Importing of NCBI taxonomy data is performed using a database transaction. \n" .
      "If the load fails or is terminated prematurely then the entire set of \n" .
      "insertions/updates is rolled back and will not be found in the database\n\n";

    $transaction = db_transaction();
    try {
      // TODO: there should be an API function named tripal_insert_analysis().
      // But until then we have to insert the analysis manually.

      // Get the version of this module for the analysis record.
      $info = system_get_info('module', 'tripal_phylogeny');
      $version = $info['version'];
      $analysis_name = 'NCBI Taxonomy Tree Import';

      // If the analysis record already exists then don't add it again.
      $analysis = chado_select_record('analysis', array('*'), array('name' => $analysis_name));
      if (count($analysis) == 0) {
        $values = array(
          'name' => $analysis_name,
          'description' => 'Used to import NCBI taxonomy details for organisms in this database.',
          'program' => 'Tripal Phylogeny Module NCBI Taxonomy Importer',
          'programversion' => $version,
          'sourcename' => 'NCBI Taxonomy',
          'sourceuri' => 'http://www.ncbi.nlm.nih.gov/taxonomy',
        );
        $analysis = chado_insert_record('analysis', $values);
        if (!$analysis) {
          throw new Exception("Cannot add NCBI Taxonomy Tree Import Analysis.");
        }
        // chado_insert_record() returns the inserted values as an array,
        // while the code below reads $analysis->analysis_id. Cast so both
        // branches yield an object.
        $analysis = (object) $analysis;
      }
      else {
        $analysis = $analysis[0];
      }

      // If the tree already exists then don't insert it again.
      // NOTE(review): $site_name is read as a PHP global here; if nothing
      // sets that global the prefix is empty. The name is left unchanged so
      // that previously imported trees are still matched on reload.
      global $site_name;
      $tree_name = $site_name . 'Taxonomy Tree';
      $phylotree = chado_select_record('phylotree', array('*'), array('name' => $tree_name));
      if (count($phylotree) == 0) {
        // Add the taxonomic tree.
        $options = array(
          'name' => $tree_name,
          'description' => 'The taxonomic tree of organisms represented in this database.',
          'leaf_type' => 'taxonomy',
          'analysis_id' => $analysis->analysis_id,
          'tree_file' => '/dev/null',
          'format' => 'taxonomy',
          'no_load' => TRUE,
        );
        $errors = array();
        $warnings = array();
        $success = tripal_insert_phylotree($options, $errors, $warnings);
        if (!$success) {
          throw new Exception("Cannot add the Taxonomy Tree record.");
        }
        // NOTE(review): presumably tripal_insert_phylotree() adds the new
        // phylotree_id to $options, since it is read from this object below
        // -- confirm against that function.
        $phylotree = (object) $options;
      }
      else {
        $phylotree = $phylotree[0];
      }

      // Clean out the phylotree in the event this is a reload.
      chado_delete_record('phylonode', array('phylotree_id' => $phylotree->phylotree_id));

      // The taxonomic tree must have a root, so create that first.
      $tree = array(
        'name' => 'root',
        'depth' => 0,
        'is_root' => 1,
        'is_leaf' => 0,
        'is_internal' => 0,
        'left_index' => 0,
        'right_index' => 0,
        'branch_set' => array(),
      );

      // Get the "rank" cvterm from the tripal_phylogeny vocabulary. Its ID is
      // used as the key of each node's rank property.
      $rank_cvterm = tripal_get_cvterm(array(
        'name' => 'rank',
        'cv_id' => array('name' => 'tripal_phylogeny'),
      ));
      if (!$rank_cvterm) {
        throw new Exception("Cannot find the 'rank' term in the 'tripal_phylogeny' vocabulary.");
      }

      // Maps the NCBI <OtherNames> child element names onto the organism
      // property term used to store them. 'Name' stanzas are skipped and
      // anything else is reported as unrecognized.
      $name_terms = array(
        'GenbankCommonName' => 'genbank_common_name',
        'Synonym' => 'synonym',
        'CommonName' => 'other_name',
        'Includes' => 'other_name',
        'EquivalentName' => 'equivalent_name',
        'Anamorph' => 'anamorph',
      );

      // Get the list of organisms.
      $sql = "SELECT O.* FROM {organism} O";
      $organisms = chado_query($sql);
      while ($organism = $organisms->fetchObject()) {
        $organism_id = $organism->organism_id;

        // Build the query string to get the information about this species.
        $term = urlencode($organism->genus . ' ' . $organism->species);
        $search_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" .
          "db=taxonomy" .
          "&term=$term";

        // Get the search response from NCBI and pull out the taxonomy ID.
        $xml = _tripal_phylogeny_taxonomy_fetch_xml($search_url);
        if (!$xml) {
          continue;
        }
        $taxid = (string) $xml->IdList->Id;
        if (!$taxid) {
          // NCBI does not know this genus/species; skip the organism.
          continue;
        }
        print "$taxid\t$organism->genus $organism->species\n";

        // If we have a taxonomy ID we can now get the details.
        $fetch_url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" .
          "db=taxonomy" .
          "&id=$taxid";
        $xml = _tripal_phylogeny_taxonomy_fetch_xml($fetch_url);
        if (!$xml) {
          continue;
        }
        $taxon = $xml->Taxon;

        // Add in the organism properties.
        $lineage = (string) $taxon->Lineage;
        tripal_phylogeny_taxonomy_add_organism_property($organism_id, 'lineage', $lineage);

        $genetic_code = (string) $taxon->GeneticCode->GCId;
        tripal_phylogeny_taxonomy_add_organism_property($organism_id, 'genetic_code', $genetic_code);

        $genetic_code_name = (string) $taxon->GeneticCode->GCName;
        tripal_phylogeny_taxonomy_add_organism_property($organism_id, 'genetic_code_name', $genetic_code_name);

        $mito_genetic_code = (string) $taxon->MitoGeneticCode->MGCId;
        tripal_phylogeny_taxonomy_add_organism_property($organism_id, 'mitochondrial_genetic_code', $mito_genetic_code);

        $mito_genetic_code_name = (string) $taxon->MitoGeneticCode->MGCName;
        tripal_phylogeny_taxonomy_add_organism_property($organism_id, 'mitochondrial_genetic_code_name', $mito_genetic_code_name);

        $division = (string) $taxon->Division;
        tripal_phylogeny_taxonomy_add_organism_property($organism_id, 'division', $division);

        // Alternate names. A separate counter is kept per element name and
        // passed on as the rank of the property.
        $name_ranks = array();
        if (isset($taxon->OtherNames)) {
          foreach ($taxon->OtherNames->children() as $child) {
            $type = $child->getName();
            $name = (string) $child;
            if (!array_key_exists($type, $name_ranks)) {
              $name_ranks[$type] = 0;
            }
            if (isset($name_terms[$type])) {
              tripal_phylogeny_taxonomy_add_organism_property($organism_id, $name_terms[$type], $name, $name_ranks[$type]);
            }
            elseif ($type != 'Name') {
              // The Name stanza is skipped silently; other types are noted.
              print "NOTICE: Skipping unrecognized name type: $type\n";
            }
            $name_ranks[$type]++;
          }
        }

        // Generate a nested array structure that can be used for importing
        // the tree.
        $rank = (string) $taxon->Rank;
        $sci_name = (string) $taxon->ScientificName;
        $lineage_depth = preg_split('/;\s*/', $lineage);

        // Walk the lineage from the root downwards, adding one internal node
        // per ancestor. Only the name of the previous node is needed to fill
        // in each node's 'parent'.
        $parent_name = $tree['name'];
        $i = 1;
        if (isset($taxon->LineageEx)) {
          foreach ($taxon->LineageEx->children() as $child) {
            $name = (string) $child->ScientificName;
            $node_rank = (string) $child->Rank;
            $node = array(
              'name' => $name,
              'depth' => $i,
              'is_root' => 0,
              'is_leaf' => 0,
              'is_internal' => 1,
              'left_index' => 0,
              'right_index' => 0,
              'parent' => $parent_name,
              'branch_set' => array(),
              'properties' => array(
                $rank_cvterm->cvterm_id => $node_rank,
              ),
            );
            $parent_name = $name;
            tripal_phylogeny_taxonomy_import_add_node($tree, $node, $lineage_depth);
            $i++;
          }
        }

        // Now add in the leaf node for the organism itself.
        $node = array(
          'name' => $sci_name,
          'depth' => $i,
          'is_root' => 0,
          'is_leaf' => 1,
          'is_internal' => 0,
          'left_index' => 0,
          'right_index' => 0,
          'parent' => $parent_name,
          'organism_id' => $organism_id,
          'properties' => array(
            $rank_cvterm->cvterm_id => $rank,
          ),
        );
        tripal_phylogeny_taxonomy_import_add_node($tree, $node, $lineage_depth);

        // Set the indices for the tree.
        tripal_phylogeny_assign_tree_indices($tree);
      }

      // Now add the tree.
      $options = array('taxonomy' => 1);
      tripal_phylogeny_import_tree($tree, $phylotree, $options);

      // Sync the tree.
      chado_node_sync_records('phylotree', FALSE, FALSE,
        array(), array($phylotree->phylotree_id));
    }
    catch (Exception $e) {
      $transaction->rollback();
      // Make sure we start errors on a new line.
      print "\n";
      watchdog_exception('tripal_phylogeny', $e);
      print "FAILED: Rolling back database changes...\n";
    }
  }

  /**
   * Retrieves a URL and parses the response body as XML.
   *
   * Private helper for tripal_phylogeny_ncbi_taxonomy_import().
   *
   * @param string $url
   *   The URL to retrieve.
   *
   * @return SimpleXMLElement
   *   The parsed response.
   *
   * @throws Exception
   *   If the URL cannot be read, or (from SimpleXMLElement) if the response
   *   is not well-formed XML.
   */
  function _tripal_phylogeny_taxonomy_fetch_xml($url) {
    $xml_text = file_get_contents($url);
    if ($xml_text === FALSE) {
      // Fail loudly rather than trying to read from an invalid handle.
      throw new Exception("Could not retrieve data from NCBI: $url");
    }
    return new SimpleXMLElement($xml_text);
  }
  /**
   * Adds a node to the nested taxonomy tree array.
   *
   * Starting at the root's branch set, the tree is descended one level at a
   * time by following the lineage names until the node's depth is reached;
   * the node is then appended to the branch set arrived at. If a node with
   * the same name and the same depth is met on the way, nothing is added.
   *
   * @param array $tree
   *   The tree (root node) array, modified in place. Child nodes live in the
   *   'branch_set' element of their parent.
   * @param array $node
   *   The node to add. Its 'name' and 'depth' elements are read.
   * @param array $lineage_depth
   *   Zero-indexed list of lineage names; element ($i - 1) is the name
   *   expected at depth $i.
   */
  function tripal_phylogeny_taxonomy_import_add_node(&$tree, $node, $lineage_depth) {
    // Get the branch set for the tree root.
    $branch_set = &$tree['branch_set'];

    // Iterate through the tree up until the depth where this node will
    // be placed.
    $node_depth = $node['depth'];
    for ($i = 1; $i <= $node_depth; $i++) {
      // The lineage can be shorter than the node depth (e.g. for the leaf),
      // in which case there is no name to follow at this level.
      $lineage_name = isset($lineage_depth[$i - 1]) ? $lineage_depth[$i - 1] : NULL;

      // Iterate through any existing nodes in the branch set to see if
      // the node name matches the correct name for the lineage at this
      // depth. If it matches then it is inside of this branch set that
      // we will place the node.
      for ($j = 0; $j < count($branch_set); $j++) {
        // If this node already exists in the tree then return. The depth is
        // compared (not assigned) so that an ancestor that merely shares the
        // node's name is neither mistaken for it nor has its depth changed.
        if ($branch_set[$j]['name'] == $node['name'] and
            $branch_set[$j]['depth'] == $node['depth']) {
          return;
        }
        // Otherwise, set the branch to be the current branch and continue.
        if ($lineage_name !== NULL and $branch_set[$j]['name'] == $lineage_name) {
          $branch_set = &$branch_set[$j]['branch_set'];
          // A node added without a 'branch_set' (a leaf) gets an empty one
          // so it can be counted and appended to.
          if (!is_array($branch_set)) {
            $branch_set = array();
          }
          break;
        }
      }
    }

    // Add the node to the last branch set. This should be where this node goes.
    $branch_set[] = $node;
  }
  /**
   * Stores a property for an organism using a term from the
   * 'organism_property' vocabulary.
   *
   * Values that evaluate to FALSE (e.g. an empty string or '0') are ignored.
   * When $rank is zero the matching property is deleted before the new one is
   * inserted.
   *
   * @param int $organism_id
   *   The ID of the organism record the property belongs to.
   * @param string $term_name
   *   The name of the property term.
   * @param string $value
   *   The property value.
   * @param int $rank
   *   Optional. Only used to decide whether to delete first; it is not passed
   *   on to chado_insert_property(). Defaults to 0.
   */
  function tripal_phylogeny_taxonomy_add_organism_property($organism_id, $term_name, $value, $rank = 0) {
    if (!$value) {
      return;
    }

    $record = array(
      'table' => 'organism',
      'id' => $organism_id,
    );
    $property = array(
      'type_name' => $term_name,
      // Must be a quoted string: a bare word is an undefined constant.
      'cv_name' => 'organism_property',
      'value' => $value,
    );

    // Delete before inserting if the rank is zero.
    if ($rank == 0) {
      chado_delete_property($record, $property);
    }
    chado_insert_property($record, $property);
  }