tripal_chado.taxonomy_importer.inc 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. <?php
  /**
   * Form builder: confirmation form for the NCBI taxonomy import.
   *
   * Adds an (empty) instructions item, a checkbox the user must tick to confirm
   * the import, and a submit button.
   *
   * @param array $form
   *   The form array that the elements are added to.
   * @param array $form_state
   *   The current form state. It is not read by this function.
   *
   * @return array
   *   The form array including the 'instructions', 'import_existing' and
   *   'submit' elements.
   */
  function tripal_chado_taxonomy_load_form($form, &$form_state) {
    $form['instructions'] = array(
      '#type' => 'item',
      '#markup' => '',
    );

    // The validate and submit handlers both key off this checkbox.
    $form['import_existing'] = array(
      '#type' => 'checkbox',
      '#title' => t('Import taxonomy for existing species.'),
      '#description' => t('The NCBI Taxonomic Importer examines the organisms
        currently present in the database and queries NCBI for the
        taxonomic details. If the importer is able to match the
        genus and species with NCBI the species details will be imported,
        and a page containing the taxonomic tree will be created.'),
    );

    $form['submit'] = array(
      '#type' => 'submit',
      '#name' => 'import',
      '#value' => t('Submit'),
    );

    return $form;
  }
  /**
   * Form validation handler for tripal_chado_taxonomy_load_form().
   *
   * Sets a form error on the confirmation checkbox when it was not ticked.
   *
   * @param array $form
   *   The form array. It is not read by this function.
   * @param array $form_state
   *   The form state; only $form_state['values']['import_existing'] is read.
   */
  function tripal_chado_taxonomy_load_form_validate($form, &$form_state) {
    // The error must be registered under the element's real name
    // ('import_existing') so that it is attached to the checkbox.
    if (empty($form_state['values']['import_existing'])) {
      form_set_error('import_existing', t('Please confirm the import by clicking the checkbox.'));
    }
  }
  /**
   * Form submission handler for tripal_chado_taxonomy_load_form().
   *
   * When the confirmation checkbox is ticked, a Tripal job is queued that runs
   * tripal_chado_ncbi_taxonomy_import() on behalf of the current user.
   *
   * @param array $form
   *   The form array. It is not read by this function.
   * @param array $form_state
   *   The form state; only $form_state['values']['import_existing'] is read.
   */
  function tripal_chado_taxonomy_load_form_submit($form, &$form_state) {
    global $user;

    // Nothing to queue unless the import was confirmed.
    if (!$form_state['values']['import_existing']) {
      return;
    }

    // The value returned by module_load_include() for this importer file is
    // passed to the job as its only include.
    $importer_file = module_load_include('inc', 'tripal_chado', 'includes/loaders/tripal_chado.taxonomy_importer');

    tripal_add_job(
      'Import NCBI Taxonomy',
      'tripal_chado',
      'tripal_chado_ncbi_taxonomy_import',
      array(),
      $user->uid,
      10,
      array($importer_file)
    );
  }
  /**
   * Tripal job callback: imports NCBI taxonomy details for all organisms.
   *
   * For every record in the Chado organism table the genus and species are
   * searched for at NCBI (esearch) and, when a taxonomy ID is returned, the
   * taxon record is retrieved (efetch). The lineage, genetic codes, division
   * and alternative names are saved as organism properties and the lineage is
   * merged into a nested array that is finally handed to
   * tripal_phylogeny_import_tree().
   *
   * All work is wrapped in a database transaction. Any Exception raised while
   * importing is logged with watchdog_exception() and the transaction is rolled
   * back.
   *
   * @param int $job_id
   *   The ID of the job running this callback. It is not referenced inside this
   *   function.
   */
  function tripal_chado_ncbi_taxonomy_import($job_id) {

    print "\nNOTE: Importing of NCBI taxonomy data is performed using a database transaction. \n" .
      "If the load fails or is terminated prematurely then the entire set of \n" .
      "insertions/updates is rolled back and will not be found in the database\n\n";

    $transaction = db_transaction();
    try {
      // TODO: there should be an API function named tripal_insert_analysis().
      // But until then we have to insert the analysis manually.
      // Get the version of this module for the analysis record:
      $info = system_get_info('module', 'tripal_chado');
      $version = $info['version'];
      $analysis_name = 'NCBI Taxonomy Tree Import';

      // If the analysis record already exists then don't add it again.
      $analysis = chado_select_record('analysis', array('*'), array('name' => $analysis_name));
      if (empty($analysis)) {
        $values = array(
          'name' => $analysis_name,
          'description' => 'Used to import NCBI taxonomy details for organisms in this database.',
          'program' => 'Tripal Phylogeny Module NCBI Taxonomy Importer',
          'programversion' => $version,
          'sourcename' => 'NCBI Taxonomy',
          'sourceuri' => 'http://www.ncbi.nlm.nih.gov/taxonomy',
        );
        $analysis = chado_insert_record('analysis', $values);
        if (!$analysis) {
          throw new Exception("Cannot add NCBI Taxonomy Tree Import Analysis.");
        }
        // The inserted record is read below with object syntax. Casting makes
        // that work when the insert hands back an array and is a no-op when it
        // already is an object.
        $analysis = (object) $analysis;
      }
      else {
        $analysis = $analysis[0];
      }

      // If the tree already exists then don't insert it again.
      // NOTE(review): $site_name is taken from the global scope and nothing in
      // this file assigns it, so the tree name may simply be 'Taxonomy Tree'.
      // Existing trees are matched by this name, so confirm before changing it.
      global $site_name;
      $tree_name = $site_name . 'Taxonomy Tree';
      $phylotree = chado_select_record('phylotree', array('*'), array('name' => $tree_name));
      if (empty($phylotree)) {
        // Add the taxonomic tree.
        $options = array(
          'name' => $tree_name,
          'description' => 'The taxonomic tree of species present on this site. Click a species name for more details.',
          'leaf_type' => 'taxonomy',
          'analysis_id' => $analysis->analysis_id,
          'tree_file' => '/dev/null',
          'format' => 'taxonomy',
          'no_load' => TRUE,
        );
        $errors = array();
        $warnings = array();
        $success = tripal_insert_phylotree($options, $errors, $warnings);
        if (!$success) {
          throw new Exception("Cannot add the Taxonomy Tree record.");
        }
        // Presumably tripal_insert_phylotree() adds the new phylotree_id to
        // $options; this is verified right below before it is used.
        $phylotree = (object) $options;
      }
      else {
        $phylotree = $phylotree[0];
      }

      // Never issue the delete below without a concrete tree ID.
      if (empty($phylotree->phylotree_id)) {
        throw new Exception("Cannot determine the ID of the Taxonomy Tree record.");
      }

      // Clean out the phylotree in the event this is a reload
      chado_delete_record('phylonode', array('phylotree_id' => $phylotree->phylotree_id));

      // The taxonomic tree must have a root, so create that first.
      $tree = array(
        'name' => 'root',
        'depth' => 0,
        'is_root' => 1,
        'is_leaf' => 0,
        'is_internal' => 0,
        'left_index' => 0,
        'right_index' => 0,
        'branch_set' => array(),
      );

      // Get the "rank" cvterm from the 'local' vocabulary. Its ID is the key
      // under which each node's taxonomic rank is stored.
      $rank_cvterm = tripal_get_cvterm(array(
        'name' => 'rank',
        'cv_id' => array('name' => 'local'),
      ));
      if (!$rank_cvterm) {
        throw new Exception("Cannot find the 'rank' term in the 'local' vocabulary.");
      }

      // NCBI E-utilities base URL (requests are made over HTTPS).
      $eutils_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

      // Organism property name => element of the Taxon record holding it is
      // resolved per organism below; this map translates the child elements of
      // <OtherNames> into organism property names.
      $name_terms = array(
        'GenbankCommonName' => 'genbank_common_name',
        'Synonym' => 'synonym',
        'CommonName' => 'other_name',
        'Includes' => 'other_name',
        'EquivalentName' => 'equivalent_name',
        'Anamorph' => 'anamorph',
      );

      // Get the list of organisms
      $sql = "SELECT O.* FROM {organism} O";
      $organisms = chado_query($sql);
      while ($organism = $organisms->fetchObject()) {

        // Search NCBI for this species to get its taxonomy ID.
        $term = urlencode($organism->genus . ' ' . $organism->species);
        $search_url = $eutils_url . 'esearch.fcgi?db=taxonomy&term=' . $term;
        $xml = _tripal_chado_taxonomy_fetch_xml($search_url);
        if (!$xml) {
          continue;
        }
        $taxid = (string) $xml->IdList->Id;
        if (!$taxid) {
          // No match at NCBI for this organism.
          continue;
        }
        print "$taxid\t$organism->genus $organism->species\n";

        // If we have a taxonomy ID we can now get the details.
        $fetch_url = $eutils_url . 'efetch.fcgi?db=taxonomy&id=' . $taxid;
        $xml = _tripal_chado_taxonomy_fetch_xml($fetch_url);
        if (!$xml) {
          continue;
        }
        $taxon = $xml->Taxon;

        // Add in the organism properties
        $lineage = (string) $taxon->Lineage;
        $properties = array(
          'lineage' => $lineage,
          'genetic_code' => (string) $taxon->GeneticCode->GCId,
          'genetic_code_name' => (string) $taxon->GeneticCode->GCName,
          'mitochondrial_genetic_code' => (string) $taxon->MitoGeneticCode->MGCId,
          'mitochondrial_genetic_code_name' => (string) $taxon->MitoGeneticCode->MGCName,
          'division' => (string) $taxon->Division,
        );
        foreach ($properties as $term_name => $value) {
          tripal_chado_taxonomy_add_organism_property($organism->organism_id, $term_name, $value);
        }

        // Add the alternative names. The rank counter is kept per property
        // name (not per XML element type): a rank of zero makes
        // tripal_chado_taxonomy_add_organism_property() clear the property
        // first, so two element types that share a property name must also
        // share a counter or the second type would wipe the first one's values.
        $name_ranks = array();
        if (isset($taxon->OtherNames)) {
          foreach ($taxon->OtherNames->children() as $child) {
            $type = $child->getName();
            $name = (string) $child;
            if ($type == 'Name') {
              // skip the Name stanza
              continue;
            }
            if (!array_key_exists($type, $name_terms)) {
              print "NOTICE: Skipping unrecognized name type: $type\n";
              continue;
            }
            $term_name = $name_terms[$type];
            if (!array_key_exists($term_name, $name_ranks)) {
              $name_ranks[$term_name] = 0;
            }
            tripal_chado_taxonomy_add_organism_property($organism->organism_id, $term_name, $name, $name_ranks[$term_name]);
            $name_ranks[$term_name]++;
          }
        }

        // Generate a nested array structure that can be used for importing the tree.
        $rank = (string) $taxon->Rank;
        $sci_name = (string) $taxon->ScientificName;
        $lineage_depth = preg_split('/;\s*/', $lineage);
        $parent_name = $tree['name'];
        $depth = 1;
        if (isset($taxon->LineageEx)) {
          foreach ($taxon->LineageEx->children() as $child) {
            $name = (string) $child->ScientificName;
            $node = array(
              'name' => $name,
              'depth' => $depth,
              'is_root' => 0,
              'is_leaf' => 0,
              'is_internal' => 1,
              'left_index' => 0,
              'right_index' => 0,
              'parent' => $parent_name,
              'branch_set' => array(),
              'properties' => array(
                $rank_cvterm->cvterm_id => (string) $child->Rank,
              ),
            );
            $parent_name = $name;
            tripal_chado_taxonomy_import_add_node($tree, $node, $lineage_depth);
            $depth++;
          }
        }

        // Now add in the leaf node
        $node = array(
          'name' => $sci_name,
          'depth' => $depth,
          'is_root' => 0,
          'is_leaf' => 1,
          'is_internal' => 0,
          'left_index' => 0,
          'right_index' => 0,
          'parent' => $parent_name,
          'organism_id' => $organism->organism_id,
          'properties' => array(
            $rank_cvterm->cvterm_id => $rank,
          ),
        );
        tripal_chado_taxonomy_import_add_node($tree, $node, $lineage_depth);

        // Set the indices for the tree.
        tripal_assign_phylogeny_tree_indices($tree);
      }

      // Now add the tree
      $options = array('taxonomy' => 1);
      tripal_phylogeny_import_tree($tree, $phylotree, $options);
    }
    catch (Exception $e) {
      $transaction->rollback();
      print "\n"; // make sure we start errors on new line
      watchdog_exception('tripal_chado', $e);
      print "FAILED: Rolling back database changes...\n";
    }
  }

  /**
   * Retrieves a URL and parses the response body as XML.
   *
   * @param string $url
   *   The complete request URL.
   *
   * @return SimpleXMLElement
   *   The parsed response document.
   *
   * @throws Exception
   *   When the URL cannot be read, or (raised by the SimpleXMLElement
   *   constructor) when the response is not well-formed XML.
   */
  function _tripal_chado_taxonomy_fetch_xml($url) {
    $xml_text = file_get_contents($url);
    if ($xml_text === FALSE) {
      throw new Exception("Unable to retrieve data from NCBI using the URL: $url");
    }
    return new SimpleXMLElement($xml_text);
  }
  /**
   * Inserts a node into the nested taxonomy tree array.
   *
   * Starting at the root's 'branch_set', the tree is descended one level at a
   * time by following the lineage names until the level of the node is reached.
   * The node is then appended to the branch set arrived at, unless a node with
   * the same name and depth is encountered on the way, in which case nothing is
   * added.
   *
   * @param array $tree
   *   The tree, passed by reference. Its 'branch_set' element holds the child
   *   nodes; every child node may in turn have its own 'branch_set'.
   * @param array $node
   *   The node to add. Its 'name' and 'depth' elements are read.
   * @param array $lineage_depth
   *   Zero-indexed list of lineage names: element N is the name of the ancestor
   *   expected at depth N + 1.
   */
  function tripal_chado_taxonomy_import_add_node(&$tree, $node, $lineage_depth) {

    // Get the branch set for the tree root.
    $branch_set = &$tree['branch_set'];
    if (!is_array($branch_set)) {
      $branch_set = array();
    }

    // Iterate through the tree up until the depth where this node will
    // be placed.
    $node_depth = $node['depth'];
    for ($i = 1; $i <= $node_depth; $i++) {

      // A leaf sits one level below the last lineage name, so there may be
      // no lineage name for this level.
      $lineage_name = isset($lineage_depth[$i - 1]) ? $lineage_depth[$i - 1] : NULL;

      // Iterate through any existing nodes in the branch set to see if
      // the node name matches the correct name for the lineage at this
      // depth. If it matches then it is inside of this branch set that
      // we will place the node.
      foreach ($branch_set as $j => $branch) {

        // If this node already exists in the tree then return. The depth must
        // be compared (not assigned) so that an existing node is never altered.
        if ($branch['name'] == $node['name'] and $branch['depth'] == $node['depth']) {
          return;
        }

        // Otherwise, set the branch to be the current branch and continue.
        if ($lineage_name !== NULL and $branch['name'] == $lineage_name) {
          $branch_set = &$branch_set[$j]['branch_set'];
          // A node without children (e.g. a leaf) has no branch set yet.
          if (!is_array($branch_set)) {
            $branch_set = array();
          }
          break;
        }
      }
    }

    // Add the node to the last branch set. This should be where this node goes.
    $branch_set[] = $node;
  }
  /**
   * Saves a property value for an organism.
   *
   * @param int $organism_id
   *   The ID of the organism, passed on as the 'id' of the 'organism' record.
   * @param string $term_name
   *   The name of the property type within the 'organism_property' vocabulary.
   * @param string $value
   *   The property value. Nothing is saved when the value is empty.
   * @param int $rank
   *   (optional) Position of this value among the values of the same property.
   *   When zero, chado_delete_property() is called before the value is
   *   inserted. Defaults to 0.
   */
  function tripal_chado_taxonomy_add_organism_property($organism_id, $term_name, $value, $rank = 0) {
    // Skip empty values. Note that PHP also treats the string '0' as empty here.
    if (!$value) {
      return;
    }

    $record = array(
      'table' => 'organism',
      'id' => $organism_id,
    );
    $property = array(
      'type_name' => $term_name,
      // The vocabulary name must be a quoted string: a bare word is an
      // undefined constant, which is a fatal error on PHP 8.
      'cv_name' => 'organism_property',
      'value' => $value,
    );

    // Delete all properties of this type if the rank is zero.
    if ($rank == 0) {
      chado_delete_property($record, $property);
    }
    chado_insert_property($record, $property);
  }