TaxonomyImporter.inc 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920
  1. <?php
  2. class TaxonomyImporter extends TripalImporter {
  3. /**
  4. * The name of this loader. This name will be presented to the site
  5. * user.
  6. */
  7. public static $name = 'Chado NCBI Taxonomy Loader';
  8. /**
  9. * The machine name for this loader. This name will be used to construct
  10. * the URL for the loader.
  11. */
  12. public static $machine_name = 'chado_taxonomy';
  13. /**
  14. * A brief description for this loader. This description will be
  15. * presented to the site user.
  16. */
  17. public static $description = 'Imports new organisms from NCBI using taxonomy IDs, or loads taxonomic details about existing organisms.';
  18. /**
  19. * An array containing the extensions of allowed file types.
  20. */
  21. public static $file_types = array();
  22. /**
  23. * Provides information to the user about the file upload. Typically this
  24. * may include a description of the file types allowed.
  25. */
  26. public static $upload_description = '';
  27. /**
  28. * The title that should appear above the upload button.
  29. */
  30. public static $upload_title = 'File Upload';
  31. /**
  32. * If the loader should require an analysis record. To maintain provenance
  33. * we should always indicate where the data we are uploading comes from.
  34. * The method that Tripal attempts to use for this by associating upload files
  35. * with an analysis record. The analysis record provides the details for
  36. * how the file was created or obtained. Set this to FALSE if the loader
  37. * should not require an analysis when loading. if $use_analysis is set to
  38. * true then the form values will have an 'analysis_id' key in the $form_state
  39. * array on submitted forms.
  40. */
  41. public static $use_analysis = FALSE;
  42. /**
  43. * If the $use_analysis value is set above then this value indicates if the
  44. * analysis should be required.
  45. */
  46. public static $require_analysis = FALSE;
  47. /**
  48. * Text that should appear on the button at the bottom of the importer
  49. * form.
  50. */
  51. public static $button_text = 'Import from NCBI Taxonomy';
  52. /**
  53. * Indicates the methods that the file uploader will support.
  54. */
  55. public static $methods = array(
  56. // Allow the user to upload a file to the server.
  57. 'file_upload' => FALSE,
  58. // Allow the user to provide the path on the Tripal server for the file.
  59. 'file_local' => FALSE,
  60. // Allow the user to provide a remote URL for the file.
  61. 'file_remote' => FALSE,
  62. );
  63. /**
  64. * Indicates if the file must be provided. An example when it may not be
  65. * necessary to require that the user provide a file for uploading if the
  66. * loader keeps track of previous files and makes those available for
  67. * selection.
  68. */
  69. public static $file_required = FALSE;
  70. /**
  71. * The array of arguments used for this loader. Each argument should
  72. * be a separate array containing a machine_name, name, and description
  73. * keys. This information is used to build the help text for the loader.
  74. */
  75. public static $argument_list = array();
  76. /**
  77. * Indicates how many files are allowed to be uploaded. By default this is
  78. * set to allow only one file. Change to any positive number. A value of
  79. * zero indicates an unlimited number of uploaded files are allowed.
  80. */
  81. public static $cardinality = 0;
  82. /**
  83. * Holds the list of all organisms currently in Chado. This list
  84. * is needed when checking to see if an organism has already been
  85. * loaded.
  86. */
  87. private $all_orgs = array();
  88. /**
  89. * The record from the Chado phylotree table that refers to this
  90. * Taxonomic tree.
  91. */
  92. private $phylotree = NULL;
  93. /**
  94. * The temporary tree array used by the Tripal Phylotree API for
  95. * importing a new tree.
  96. */
  97. private $tree = NULL;
  /**
   * Adds the importer-specific elements to the loader form.
   *
   * The form offers a free-text list of NCBI taxonomy IDs to import and a
   * checkbox for refreshing the details of organisms already on the site.
   *
   * @param $form
   *   The form array to extend.
   * @param $form_state
   *   The form state array.
   *
   * @return
   *   The form array with the importer elements added.
   *
   * @see TripalImporter::form()
   */
  public function form($form, &$form_state) {
    $form['instructions'] = array(
      '#type' => 'fieldset',
      '#title' => 'instructions',
      '#description' => t('This form is used to import species from the NCBI
        Taxonomy database into this site. Alternatively, it can import details
        about organisms from the NCBI Taxonomy database for organisms that
        already exist on this site. This loader will also construct
        the taxonomic tree for the species loaded.'),
    );

    $form['taxonomy_ids'] = array(
      '#type' => 'textarea',
      '#title' => 'Taxonomy ID',
      '#description' => t('Please provide a list of NCBI taxonomy IDs separated
        by spaces, tabs or new lines.
        The information about these organisms will be downloaded and new organism
        records will be added to this site.'),
    );

    $form['import_existing'] = array(
      '#type' => 'checkbox',
      '#title' => 'Import details for existing species.',
      '#description' => t('The NCBI Taxonomic Importer examines the organisms
        currently present in the database and queries NCBI for the
        taxonomic details. If the importer is able to match the
        genus and species with NCBI the species details will be imported,
        and a page containing the taxonomic tree will be created.'),
      // The Form API key is "#default_value" (with an underscore); a key
      // containing a space is silently ignored.
      '#default_value' => 1,
    );

    return $form;
  }
  /**
   * Validates the taxonomy IDs entered on the importer form.
   *
   * Every whitespace-separated token in the 'taxonomy_ids' field must consist
   * of digits only. Offending tokens are reported through form_set_error().
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state array holding the submitted values.
   *
   * @see TripalImporter::formValidate()
   */
  public function formValidate($form, &$form_state) {
    $taxonomy_ids = '';
    if (isset($form_state['values']['taxonomy_ids'])) {
      $taxonomy_ids = (string) $form_state['values']['taxonomy_ids'];
    }

    // PREG_SPLIT_NO_EMPTY drops the empty tokens produced by leading or
    // trailing whitespace (e.g. a final newline in the textarea) so they are
    // not reported as invalid IDs.
    $tax_ids = preg_split('/\s+/', $taxonomy_ids, -1, PREG_SPLIT_NO_EMPTY);
    if (!$tax_ids) {
      return;
    }

    $bad_ids = array();
    foreach ($tax_ids as $tax_id) {
      if (!preg_match('/^\d+$/', $tax_id)) {
        $bad_ids[] = $tax_id;
      }
    }

    if (count($bad_ids) > 0) {
      form_set_error('taxonomy_ids',
        t('Taxonomy IDs must be numeric. The following are not valid: "@ids".',
          array('@ids' => implode('", "', $bad_ids))));
    }
  }
  /**
   * Performs the import.
   *
   * Steps, in order:
   *  - load every organism currently in Chado into $this->all_orgs;
   *  - find or create the taxonomy phylotree record and rebuild the in-memory
   *    tree from the existing phylonodes;
   *  - delete the existing phylonodes of that tree;
   *  - import the taxonomy IDs supplied by the user (if any);
   *  - refresh the existing organisms (if requested);
   *  - import the assembled tree.
   *
   * @see TripalImporter::run()
   */
  public function run() {
    $arguments = $this->arguments['run_args'];
    $taxonomy_ids = $arguments['taxonomy_ids'];
    $import_existing = $arguments['import_existing'];

    // Get the list of all organisms as we'll need this to look up existing
    // organisms. The organism type is only joined for Chado versions above
    // 1.2; older versions get an empty type.
    if (chado_get_version() > 1.2) {
      $sql = "
        SELECT O.*, CVT.name as type
        FROM {organism} O
         LEFT JOIN {cvterm} CVT ON CVT.cvterm_id = O.type_id
        ORDER BY O.genus, O.species
      ";
    }
    else {
      $sql = "
        SELECT O.*, '' as type
        FROM {organism} O
        ORDER BY O.genus, O.species
      ";
    }
    $this->all_orgs = array();
    $results = chado_query($sql);
    while ($item = $results->fetchObject()) {
      $this->all_orgs[] = $item;
    }

    // Get the phylotree object.
    $this->logMessage('Initializing Tree...');
    $this->phylotree = $this->initTree();
    $this->logMessage('Rebuilding Tree...');
    $this->tree = $this->rebuildTree();

    // Clean out the phylonodes for this tree in the event this is a reload.
    // This must happen after rebuildTree(), which reads those records.
    chado_delete_record('phylonode', array('phylotree_id' => $this->phylotree->phylotree_id));

    // Get the taxonomy IDs provided by the user (if any). Empty tokens caused
    // by leading/trailing whitespace are dropped so that no request is made
    // for an empty ID.
    $tax_ids = array();
    if ($taxonomy_ids) {
      $tax_ids = preg_split('/\s+/', $taxonomy_ids, -1, PREG_SPLIT_NO_EMPTY);
    }

    // Set the number of items to handle.
    if ($tax_ids or $import_existing) {
      $total = count($tax_ids);
      if ($import_existing) {
        $total += count($this->all_orgs);
      }
      $this->setTotalItems($total);
    }
    $this->setItemsHandled(0);

    // If the user wants to import new taxonomy IDs then do that.
    if ($tax_ids) {
      $this->logMessage('Importing Taxonomy IDs...');
      foreach ($tax_ids as $tax_id) {
        $this->importRecord($tax_id);
        $this->addItemsHandled(1);
      }
    }

    // If the user wants to update existing records then do that.
    if ($import_existing) {
      $this->logMessage('Updating Existing...');
      $this->updateExisting();
    }

    // Now import the tree.
    $options = array('taxonomy' => 1);
    chado_phylogeny_import_tree($this->tree, $this->phylotree, $options);
  }
  /**
   * Finds or creates the taxonomic tree record in Chado.
   *
   * The tree is looked up in the phylotree table by name (the site name
   * immediately followed by 'Taxonomy Tree'). If no such record exists a new
   * one is inserted.
   *
   * @throws Exception
   *   If the phylotree record cannot be inserted.
   *
   * @return
   *   The phylotree record as an object.
   */
  private function initTree() {
    // The site name must be read here: no $site_name variable exists in this
    // method's scope.
    // NOTE(review): trees created while the prefix was empty are named just
    // 'Taxonomy Tree' and will not be matched by this lookup - confirm whether
    // existing sites need a rename.
    $site_name = variable_get('site_name', '');
    $tree_name = $site_name . 'Taxonomy Tree';

    // If the tree already exists then don't insert it again.
    $existing = chado_select_record('phylotree', array('*'), array('name' => $tree_name));
    if (!empty($existing)) {
      return $existing[0];
    }

    // Add the taxonomic tree.
    $phylotree = array(
      'name' => $tree_name,
      'description' => 'A phylogenetic tree based on taxonomic rank.',
      'leaf_type' => 'taxonomy',
      'tree_file' => '/dev/null',
      'format' => 'taxonomy',
      'no_load' => TRUE,
    );
    $errors = array();
    $warnings = array();
    $success = tripal_insert_phylotree($phylotree, $errors, $warnings);
    if (!$success) {
      throw new Exception("Cannot add the Taxonomy Tree record.");
    }

    return (object) $phylotree;
  }
  /**
   * Iterates through all existing organisms and rebuilds the taxonomy tree.
   *
   * The phylotree API doesn't support adding nodes to existing trees, only
   * importing whole trees. So, we must rebuild the tree using the current
   * organisms and then we can add to it.
   *
   * For every organism that has a 'lineage' property, each lineage level is
   * looked up among the phylonodes of the current tree and added as an
   * internal node; the organism itself is then added as a leaf. If any
   * lineage level has no phylonode the leaf is not added.
   *
   * @return
   *   The nested tree array, starting at the 'root' node.
   */
  private function rebuildTree() {
    // Lookup cache keyed by lineage label. A value of FALSE means the label
    // has no phylonode in this tree; otherwise the value is an array holding
    // the node's 'rank' property value (NULL when it has none).
    $lineage_nodes = array();

    // Get the "rank" cvterm from the 'local' vocabulary.
    $rank_cvterm = chado_get_cvterm(array(
      'name' => 'rank',
      'cv_id' => array('name' => 'local'),
    ));

    // The taxonomic tree must have a root, so create that first.
    $tree = array(
      'name' => 'root',
      'depth' => 0,
      'is_root' => 1,
      'is_leaf' => 0,
      'is_internal' => 0,
      'left_index' => 0,
      'right_index' => 0,
      'branch_set' => array(),
    );

    foreach ($this->all_orgs as $organism) {
      // Get the lineage for this organism. Organisms without one cannot be
      // placed in the tree.
      $lineage = $this->getProperty($organism->organism_id, 'lineage');
      if (!$lineage) {
        continue;
      }
      $lineage_depth = preg_split('/;\s*/', $lineage->value);

      // Now rebuild the tree by first creating the nodes for the full
      // lineage and then adding the organism as a leaf node.
      $parent_name = $tree['name'];
      $i = 1;
      $lineage_good = TRUE;
      foreach ($lineage_depth as $child) {
        // There are a lot of repeated lineage levels and we don't want to
        // keep doing the same queries over and over, so each label is looked
        // up once and the outcome is cached.
        if (!array_key_exists($child, $lineage_nodes)) {
          $lineage_nodes[$child] = FALSE;
          $found = chado_select_record('phylonode', array('*'), array(
            'phylotree_id' => $this->phylotree->phylotree_id,
            'label' => $child,
          ));
          if (!empty($found)) {
            $props = chado_select_record('phylonodeprop', array('*'), array(
              'phylonode_id' => $found[0]->phylonode_id,
              'type_id' => $rank_cvterm->cvterm_id,
            ));
            // The rank is cached with the node so that a cache hit uses this
            // node's own rank rather than whatever was queried last.
            $lineage_nodes[$child] = array(
              'rank' => empty($props) ? NULL : $props[0]->value,
            );
          }
        }

        // A missing level marks the lineage as bad. The remaining levels are
        // still visited, but the leaf is skipped below.
        if ($lineage_nodes[$child] === FALSE) {
          $lineage_good = FALSE;
          continue;
        }

        $node = array(
          'name' => $child,
          'depth' => $i,
          'is_root' => 0,
          'is_leaf' => 0,
          'is_internal' => 1,
          'left_index' => 0,
          'right_index' => 0,
          'parent' => $parent_name,
          'branch_set' => array(),
          'properties' => array(
            $rank_cvterm->cvterm_id => $lineage_nodes[$child]['rank'],
          ),
        );
        $parent_name = $node['name'];
        $this->addTaxonomyNode($tree, $node, $lineage_depth);
        $i++;
      }

      // If there were problems resolving the lineage then skip adding the
      // leaf node.
      if (!$lineage_good) {
        continue;
      }

      // The leaf's rank defaults to 'species' unless the organism has a type.
      $rank_type = 'species';
      if (property_exists($organism, 'type_id') and $organism->type_id) {
        $rank_type = $organism->type;
      }

      // Now add in the leaf node.
      $node = array(
        'name' => chado_get_organism_scientific_name($organism),
        'depth' => $i,
        'is_root' => 0,
        'is_leaf' => 1,
        'is_internal' => 0,
        'left_index' => 0,
        'right_index' => 0,
        'parent' => $parent_name,
        'organism_id' => $organism->organism_id,
        'properties' => array(
          $rank_cvterm->cvterm_id => $rank_type,
        ),
      );
      $this->addTaxonomyNode($tree, $node, $lineage_depth);

      // Set the indices for the tree.
      chado_assign_phylogeny_tree_indices($tree);
    }

    return $tree;
  }
  /**
   * Imports details from NCBI Taxonomy for organisms that already exist.
   *
   * Each organism in $this->all_orgs (except those flagged as newly added) is
   * searched for by scientific name in the NCBI taxonomy database. When a
   * taxonomy ID is returned the record is imported for that organism. A
   * failed request is logged and the organism is skipped so that one bad
   * response does not end the whole run.
   */
  private function updateExisting() {
    foreach ($this->all_orgs as $organism) {
      // If the organism record is marked as new then let's skip it because
      // it was newly added and should have the updated information already.
      // Records loaded from the database do not carry the flag at all.
      if (!empty($organism->is_new)) {
        continue;
      }

      // TODO: we should check if the organism already has a taxonomy ID.
      // if so we should use that instead of the scientific name.

      // Build the query string to get the information about this species.
      $sci_name = chado_get_organism_scientific_name($organism);
      $search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' .
        'db=taxonomy' .
        '&term=' . urlencode($sci_name);

      // Get the search response from NCBI. Both calls return FALSE on failure
      // instead of leaving an invalid handle or throwing.
      $xml_text = file_get_contents($search_url);
      $xml = ($xml_text === FALSE) ? FALSE : simplexml_load_string($xml_text);

      if ($xml === FALSE) {
        $this->logMessage('Could not retrieve the NCBI taxonomy search results for @organism.',
          array('@organism' => $sci_name));
      }
      else {
        // Parse the XML to get the taxonomy ID (the first one listed).
        $taxid = isset($xml->IdList->Id) ? (string) $xml->IdList->Id : '';
        if ($taxid) {
          $this->importRecord($taxid, $organism);
        }
      }

      $this->addItemsHandled(1);
    }
  }
  /**
   * Checks the Chado database to see if the organism already exists.
   *
   * Three lookups are tried in order:
   *  1. an organism linked to the NCBITaxon dbxref with this taxonomy ID;
   *  2. an organism whose "genus species" columns together equal the
   *     scientific name (how Chado v1.2 or older stored full names);
   *  3. an organism in $this->all_orgs whose generated scientific name
   *     (which includes the infraspecific name) equals the scientific name.
   *
   * @param $taxid
   *   The taxonomic ID for the organism.
   * @param $sci_name
   *   The scientific name for the organism as returned by NCBI.
   *
   * @return
   *   The matching organism object, or NULL if none was found.
   */
  private function findOrganism($taxid, $sci_name) {
    // First check the taxid to see if it's present and associated with an
    // organism already.
    $dbxref = chado_select_record('dbxref', array('dbxref_id'), array(
      'db_id' => array(
        'name' => 'NCBITaxon',
      ),
      'accession' => $taxid,
    ));
    if (!empty($dbxref)) {
      $organism_dbxref = chado_select_record('organism_dbxref', array('organism_id'),
        array('dbxref_id' => $dbxref[0]->dbxref_id));
      if (!empty($organism_dbxref)) {
        $matches = chado_select_record('organism', array('*'),
          array('organism_id' => $organism_dbxref[0]->organism_id));
        if (!empty($matches)) {
          return $matches[0];
        }
      }
    }

    // Next check if the full name (including the infraspecific name) is
    // stored entirely in the genus and species columns.
    $sql = "
      SELECT organism_id
      FROM {organism}
      WHERE concat(genus, ' ', species) = :sci_name
    ";
    $results = chado_query($sql, array(':sci_name' => $sci_name));
    // fetchObject() yields no object when there is no matching row, so the
    // row must be checked before its organism_id is read.
    $item = $results ? $results->fetchObject() : FALSE;
    if ($item) {
      $matches = chado_select_record('organism', array('*'),
        array('organism_id' => $item->organism_id));
      if (!empty($matches)) {
        return $matches[0];
      }
    }

    // Finally, compare against the generated scientific name of every known
    // organism. If several match, the last one is used.
    $organism = NULL;
    foreach ($this->all_orgs as $candidate) {
      if ($sci_name == chado_get_organism_scientific_name($candidate)) {
        $organism = $candidate;
      }
    }
    return $organism;
  }
  /**
   * Adds a new organism record to Chado.
   *
   * The scientific name is split on whitespace: the first word is the genus,
   * the second the species and anything after that the infraspecific name.
   * When an infraspecific name is present the rank's abbreviation is removed
   * from it and the rank's cvterm (from the 'taxonomic_rank' vocabulary) is
   * stored as the organism type. The new organism is flagged with is_new and
   * appended to $this->all_orgs.
   *
   * @param $sci_name
   *   The scientific name as provided by NCBI Taxonomy.
   * @param $rank
   *   The rank of the organism as provided by NCBI Taxonomy.
   *
   * @return
   *   The new organism object, or NULL if the name has fewer than two words
   *   or the record could not be inserted.
   */
  private function addOrganism($sci_name, $rank) {
    $matches = array();
    $has_infra = FALSE;

    // Check if the scientific name has an infraspecific part or is just
    // a species name.
    if (preg_match('/^(.+?)\s+(.+?)\s+(.+)$/', $sci_name, $matches)) {
      $has_infra = TRUE;
      $genus = $matches[1];
      $species = $matches[2];

      // Get the CV term for the rank.
      $type = chado_get_cvterm(array(
        'name' => preg_replace('/ /', '_', $rank),
        'cv_id' => array('name' => 'taxonomic_rank'),
      ));

      // Remove the rank from the infraspecific name. A literal replacement
      // is used because the abbreviation is plain text, not a pattern.
      $abbrev = chado_abbreviate_infraspecific_rank($rank);
      $infra = trim(str_replace($abbrev, '', $matches[3]));

      $values = array(
        'genus' => $genus,
        'species' => $species,
        'abbreviation' => $genus[0] . '. ' . $species,
        // The rank may have no term in the vocabulary.
        'type_id' => $type ? $type->cvterm_id : NULL,
        'infraspecific_name' => $infra,
      );
    }
    elseif (preg_match('/^(.+?)\s+(.+?)$/', $sci_name, $matches)) {
      $genus = $matches[1];
      $species = $matches[2];
      $values = array(
        'genus' => $genus,
        'species' => $species,
        'abbreviation' => $genus[0] . '. ' . $species,
      );
    }
    else {
      return NULL;
    }

    // A failed insert must not be cast to an object, otherwise it would be
    // mistaken for a valid organism.
    $record = chado_insert_record('organism', $values);
    if (!$record) {
      return NULL;
    }

    $organism = (object) $record;
    if ($has_infra) {
      $organism->type = $rank;
    }
    $organism->is_new = TRUE;
    $this->all_orgs[] = $organism;

    return $organism;
  }
  /**
   * Imports an organism from the NCBI taxonomy DB by its taxonomy ID.
   *
   * The taxonomy record is fetched from NCBI, the organism is found or
   * created, the NCBITaxon cross reference and the organism properties
   * (lineage, genetic codes, division, alternate names) are stored, and the
   * lineage plus the organism itself are added to $this->tree.
   *
   * @param $taxid
   *   The NCBI Taxonomy ID.
   * @param $organism
   *   The organism object to which this taxonomy belongs. If the organism
   *   is NULL then it will be looked up and, if not found, created.
   *
   * @throws Exception
   *   If no organism was provided or found and one cannot be added.
   */
  private function importRecord($taxid, $organism = NULL) {
    $adds_organism = $organism ? FALSE : TRUE;

    // Get the "rank" cvterm from the 'local' vocabulary.
    $rank_cvterm = chado_get_cvterm(array(
      'name' => 'rank',
      'cv_id' => array('name' => 'local'),
    ));

    // Get the details for this taxonomy.
    $fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' .
      'db=taxonomy' .
      '&id=' . urlencode($taxid);

    // Get the response from NCBI. Both calls return FALSE on failure instead
    // of leaving an invalid handle or throwing.
    $xml_text = file_get_contents($fetch_url);
    $xml = ($xml_text === FALSE) ? FALSE : simplexml_load_string($xml_text);
    if ($xml === FALSE) {
      $this->logMessage('Could not retrieve the NCBI taxonomy record for ID @taxid.',
        array('@taxid' => $taxid));
      return;
    }

    $taxon = $xml->Taxon;

    // Get the rank and scientific name from the xml.
    $rank = (string) $taxon->Rank;
    $sci_name = (string) $taxon->ScientificName;

    // If we don't have an organism record provided then see if there
    // is one provided by Chado, if not, then try to add one.
    if (!$organism) {
      $organism = $this->findOrganism($taxid, $sci_name);
      if (!$organism) {
        $organism = $this->addOrganism($sci_name, $rank);
        if (!$organism) {
          throw new Exception(t('Cannot add organism: @sci_name', array('@sci_name' => $sci_name)));
        }
      }
    }

    // Associate the Dbxref with the organism.
    $this->addDbxref($organism->organism_id, $taxid);

    // Get properties for this organism.
    $lineage = (string) $taxon->Lineage;
    $genetic_code = (string) $taxon->GeneticCode->GCId;
    $genetic_code_name = (string) $taxon->GeneticCode->GCName;
    $mito_genetic_code = (string) $taxon->MitoGeneticCode->MGCId;
    $mito_genetic_code_name = (string) $taxon->MitoGeneticCode->MGCName;
    $division = (string) $taxon->Division;

    // Add in the organism properties.
    $this->addProperty($organism->organism_id, 'division', $division);
    $this->addProperty($organism->organism_id, 'mitochondrial_genetic_code_name', $mito_genetic_code_name);
    $this->addProperty($organism->organism_id, 'mitochondrial_genetic_code', $mito_genetic_code);
    $this->addProperty($organism->organism_id, 'genetic_code_name', $genetic_code_name);
    $this->addProperty($organism->organism_id, 'lineage', $lineage);
    $this->addProperty($organism->organism_id, 'genetic_code', $genetic_code);

    // Store the alternate names. Several XML element types share one
    // property term, so the rank counters are kept per term: addProperty()
    // clears the term's existing values whenever it is given rank zero.
    if (isset($taxon->OtherNames)) {
      $name_terms = array(
        'GenbankCommonName' => 'genbank_common_name',
        'Synonym' => 'synonym',
        'GenbankSynonym' => 'synonym',
        'CommonName' => 'other_name',
        'Includes' => 'other_name',
        'EquivalentName' => 'equivalent_name',
        'Anamorph' => 'anamorph',
      );
      $name_ranks = array();
      foreach ($taxon->OtherNames->children() as $child) {
        $type = $child->getName();
        $name = (string) $child;

        // Skip the Name stanza.
        if ($type == 'Name') {
          continue;
        }
        if (!isset($name_terms[$type])) {
          $this->logMessage('NOTICE: Skipping unrecognized name type: @type',
            array('@type' => $type));
          continue;
        }
        if ($name === '') {
          continue;
        }

        // If we had to add the organism then include the common name too.
        // The name is also stored as an 'other_name' property below.
        if ($type == 'CommonName' and $adds_organism) {
          $organism->common_name = $name;
          chado_update_record('organism',
            array('organism_id' => $organism->organism_id),
            array('common_name' => $name));
        }

        $term = $name_terms[$type];
        if (!isset($name_ranks[$term])) {
          $name_ranks[$term] = 0;
        }
        $this->addProperty($organism->organism_id, $term, $name, $name_ranks[$term]);
        $name_ranks[$term]++;
      }
    }

    // Generate a nested array structure that can be used for importing the
    // tree: one internal node per lineage level, then the organism as leaf.
    $lineage_depth = preg_split('/;\s*/', $lineage);
    $parent_name = $this->tree['name'];
    $i = 1;
    if (isset($taxon->LineageEx)) {
      foreach ($taxon->LineageEx->children() as $child) {
        $node = array(
          'name' => (string) $child->ScientificName,
          'depth' => $i,
          'is_root' => 0,
          'is_leaf' => 0,
          'is_internal' => 1,
          'left_index' => 0,
          'right_index' => 0,
          'parent' => $parent_name,
          'branch_set' => array(),
          'properties' => array(
            $rank_cvterm->cvterm_id => (string) $child->Rank,
          ),
        );
        $parent_name = $node['name'];
        $this->addTaxonomyNode($this->tree, $node, $lineage_depth);
        $i++;
      }
    }

    // Now add in the leaf node.
    $node = array(
      'name' => $sci_name,
      'depth' => $i,
      'is_root' => 0,
      'is_leaf' => 1,
      'is_internal' => 0,
      'left_index' => 0,
      'right_index' => 0,
      'parent' => $parent_name,
      'organism_id' => $organism->organism_id,
      'properties' => array(
        $rank_cvterm->cvterm_id => $rank,
      ),
    );
    $this->addTaxonomyNode($this->tree, $node, $lineage_depth);

    // Set the indices for the tree.
    chado_assign_phylogeny_tree_indices($this->tree);
  }
  /**
   * Adds a node to the nested taxonomy tree array.
   *
   * Starting at the root's branch set, the tree is descended one level per
   * unit of the node's depth by following the lineage names. The node is
   * appended to the branch set reached at the end of that walk. If a node
   * with the same name and depth is met along the way, nothing is added.
   *
   * @param $tree
   *   The tree array (passed by reference) whose 'branch_set' is modified.
   * @param $node
   *   The node array to add. Its 'name' and 'depth' keys are used here.
   * @param $lineage_depth
   *   A zero-indexed array of lineage names, where index N holds the name of
   *   the ancestor at depth N + 1.
   */
  private function addTaxonomyNode(&$tree, $node, $lineage_depth) {
    // Get the branch set for the tree root.
    $branch_set = &$tree['branch_set'];

    // Iterate through the tree up until the depth where this node will
    // be placed.
    $node_depth = $node['depth'];
    for ($i = 1; $i <= $node_depth; $i++) {
      // A leaf sits one level below the last lineage entry, so there may be
      // no lineage name for the final level.
      $ancestor = isset($lineage_depth[$i - 1]) ? $lineage_depth[$i - 1] : NULL;

      // Iterate through any existing nodes in the branch set to see if
      // the node name matches the correct name for the lineage at this
      // depth. If it matches then it is inside of this branch set that
      // we will place the node.
      $num_branches = count($branch_set);
      for ($j = 0; $j < $num_branches; $j++) {
        // If this node already exists in the tree then return. This has to be
        // a comparison: an assignment would overwrite the existing node's
        // depth and match on name alone.
        if ($branch_set[$j]['name'] == $node['name'] and
            $branch_set[$j]['depth'] == $node['depth']) {
          return;
        }
        // Otherwise, set the branch to be the current branch and continue.
        if ($ancestor !== NULL and $branch_set[$j]['name'] == $ancestor) {
          $branch_set = &$branch_set[$j]['branch_set'];
          break;
        }
      }
    }

    // Add the node to the last branch set. This should be where this node goes.
    $branch_set[] = $node;
  }
  /**
   * Retrieves a property for a given organism.
   *
   * @param $organism_id
   *   The organism ID whose property is retrieved.
   * @param $term_name
   *   The name of the organism property term. This term must be
   *   present in the 'organism_property' cv.
   * @param $rank
   *   The order for this property. The first instance of this term for
   *   this organism should be zero. Defaults to zero.
   *
   * @return
   *   Whatever chado_get_property() returns for the organism and term.
   */
  private function getProperty($organism_id, $term_name, $rank = 0) {
    $record = array(
      'table' => 'organism',
      'id' => $organism_id,
    );
    // No 'value' entry is supplied: this method is a lookup by term and has
    // no value to match against.
    $property = array(
      'type_name' => $term_name,
      'cv_name' => 'organism_property',
      'rank' => $rank,
    );
    return chado_get_property($record, $property);
  }
  /**
   * Adds a property to an organism.
   *
   * Empty values are ignored. When the rank is zero, any existing properties
   * of this term are deleted for the organism before the new value is
   * inserted.
   *
   * @param $organism_id
   *   The organism ID to which the property is added.
   * @param $term_name
   *   The name of the organism property term. This term must be
   *   present in the 'organism_property' cv.
   * @param $value
   *   The value of the property.
   * @param $rank
   *   The order for this property. The first instance of this term for
   *   this organism should be zero. Defaults to zero.
   */
  private function addProperty($organism_id, $term_name, $value, $rank = 0) {
    // Only skip values that are really empty. A plain truthiness test would
    // also discard "0", which is a legitimate value (e.g. a genetic code id).
    if ((string) $value === '') {
      return;
    }

    $record = array(
      'table' => 'organism',
      'id' => $organism_id,
    );
    $property = array(
      'type_name' => $term_name,
      'cv_name' => 'organism_property',
      'value' => $value,
    );

    // Delete all properties of this type if the rank is zero.
    if ($rank == 0) {
      chado_delete_property($record, $property);
    }
    chado_insert_property($record, $property);
  }
  /**
   * Links an organism to its NCBI taxonomy ID through a database reference.
   *
   * A dbxref for the taxonomy ID is inserted into the 'NCBITaxon' database
   * and, unless the link already exists, an organism_dbxref record is added.
   *
   * @param $organism_id
   *   The ID of the organism to link.
   * @param $taxId
   *   The NCBI taxonomy ID, used as the dbxref accession.
   *
   * @throws Exception
   *   If the 'NCBITaxon' database record or the dbxref is unavailable.
   */
  private function addDbxref($organism_id, $taxId) {
    $db = chado_get_db(array('name' => 'NCBITaxon'));
    if (!$db) {
      throw new Exception("Cannot find the 'NCBITaxon' database record.");
    }

    $dbxref = chado_insert_dbxref(array(
      'db_id' => $db->db_id,
      'accession' => $taxId,
    ));
    if (!$dbxref) {
      throw new Exception(t('Cannot add the NCBITaxon database reference: @taxid', array('@taxid' => $taxId)));
    }

    // Only add the link if the organism does not have it yet.
    $values = array(
      'dbxref_id' => $dbxref->dbxref_id,
      'organism_id' => $organism_id,
    );
    if (!chado_select_record('organism_dbxref', ['organism_dbxref_id'], $values)) {
      chado_insert_record('organism_dbxref', $values);
    }
  }
  829. }