TaxonomyImporter.inc 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030
  1. <?php
  2. class TaxonomyImporter extends TripalImporter {
  3. /**
  4. * The name of this loader. This name will be presented to the site
  5. * user.
  6. */
  7. public static $name = 'Chado NCBI Taxonomy Loader';
  8. /**
  9. * The machine name for this loader. This name will be used to construct
  10. * the URL for the loader.
  11. */
  12. public static $machine_name = 'chado_taxonomy';
  13. /**
  14. * A brief description for this loader. This description will be
  15. * presented to the site user.
  16. */
  17. public static $description = 'Imports new organisms from NCBI using taxonomy IDs, or loads taxonomic details about existing organisms.';
  18. /**
  19. * An array containing the extensions of allowed file types.
  20. */
  21. public static $file_types = [];
  22. /**
  23. * Provides information to the user about the file upload. Typically this
  24. * may include a description of the file types allowed.
  25. */
  26. public static $upload_description = '';
  27. /**
  28. * The title that should appear above the upload button.
  29. */
  30. public static $upload_title = 'File Upload';
  31. /**
  32. * If the loader should require an analysis record. To maintain provenance
  33. * we should always indicate where the data we are uploading comes from.
  34. * The method that Tripal attempts to use for this by associating upload files
  35. * with an analysis record. The analysis record provides the details for
  36. * how the file was created or obtained. Set this to FALSE if the loader
  37. * should not require an analysis when loading. if $use_analysis is set to
  38. * true then the form values will have an 'analysis_id' key in the $form_state
  39. * array on submitted forms.
  40. */
  41. public static $use_analysis = FALSE;
  42. /**
  43. * If the $use_analysis value is set above then this value indicates if the
  44. * analysis should be required.
  45. */
  46. public static $require_analysis = FALSE;
  47. /**
  48. * Text that should appear on the button at the bottom of the importer
  49. * form.
  50. */
  51. public static $button_text = 'Import from NCBI Taxonomy';
  52. /**
  53. * Indicates the methods that the file uploader will support.
  54. */
  55. public static $methods = [
  56. // Allow the user to upload a file to the server.
  57. 'file_upload' => FALSE,
  58. // Allow the user to provide the path on the Tripal server for the file.
  59. 'file_local' => FALSE,
  60. // Allow the user to provide a remote URL for the file.
  61. 'file_remote' => FALSE,
  62. ];
  63. /**
  64. * Indicates if the file must be provided. An example when it may not be
  65. * necessary to require that the user provide a file for uploading if the
  66. * loader keeps track of previous files and makes those available for
  67. * selection.
  68. */
  69. public static $file_required = FALSE;
  70. /**
  71. * The array of arguments used for this loader. Each argument should
  72. * be a separate array containing a machine_name, name, and description
  73. * keys. This information is used to build the help text for the loader.
  74. */
  75. public static $argument_list = [];
  76. /**
  77. * Indicates how many files are allowed to be uploaded. By default this is
  78. * set to allow only one file. Change to any positive number. A value of
  79. * zero indicates an unlimited number of uploaded files are allowed.
  80. */
  81. public static $cardinality = 0;
  82. /**
  83. * Holds the list of all organisms currently in Chado. This list
  84. * is needed when checking to see if an organism has already been
  85. * loaded.
  86. */
  87. private $all_orgs = [];
  88. /**
  89. * The record from the Chado phylotree table that refers to this
  90. * Taxonomic tree.
  91. */
  92. private $phylotree = NULL;
  93. /**
  94. * The temporary tree array used by the Tripal Phylotree API for
  95. * importing a new tree.
  96. */
  97. private $tree = NULL;
  98. /**
  99. * @see TripalImporter::form()
  100. */
  public function form($form, &$form_state) {
    // Adds the importer-specific elements to $form and returns it: an
    // instructions fieldset, an optional NCBI API key field, a textarea for
    // NCBI taxonomy IDs and a checkbox for refreshing existing organisms.
    $form['instructions'] = [
      '#type' => 'fieldset',
      '#title' => 'instructions',
      '#description' => t('This form is used to import species from the NCBI
        Taxonomy database into this site. Alternatively, it can import details
        about organisms from the NCBI Taxonomy database for organisms that
        already exist on this site. This loader will also construct
        the taxonomic tree for the species loaded.'),
    ];

    // Help text for the API key field, assembled from translated text and
    // two links to the NCBI documentation.
    $api_key_help = t('Tripal imports Taxonomy information using NCBI\'s ')
      . l('EUtils API', 'https://www.ncbi.nlm.nih.gov/books/NBK25500/')
      . t(', which limits users and programs to a maximum of 3 requests per second without an API key. '
        . 'However, NCBI allows users and programs to an increased maximum of 10 requests per second if '
        . 'they provide a valid API key. This is particularly useful in speeding up large taxonomy imports. '
        . 'For more information on NCBI API keys, please ')
      . l('see here', 'https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Coming_in_December_2018_API_Key', [
        'attributes' => [
          // "_blank" is the reserved target keyword for a new window/tab;
          // the previous value "blank" named an ordinary window instead.
          'target' => '_blank',
        ],
      ])
      . '.';

    $form['ncbi_api_key'] = [
      '#type' => 'textfield',
      '#title' => t('(Optional) NCBI API key:'),
      '#description' => $api_key_help,
      '#default_value' => variable_get('tripal_taxon_importer_ncbi_api_key', NULL),
      // Changes to the field are sent to this callback; the element is
      // wrapped in the div below so the AJAX response can replace it.
      '#ajax' => [
        'callback' => 'tripal_taxon_importer_set_ncbi_api_key',
        'wrapper' => 'ncbi_api_key',
      ],
      '#prefix' => '<div id="ncbi_api_key">',
      '#suffix' => '</div>',
    ];

    $form['taxonomy_ids'] = [
      '#type' => 'textarea',
      '#title' => 'Taxonomy ID',
      '#description' => t('Please provide a list of NCBI taxonomy IDs separated
        by spaces, tabs or new lines.
        The information about these organisms will be downloaded and new organism
        records will be added to this site.'),
    ];

    $form['import_existing'] = [
      '#type' => 'checkbox',
      '#title' => 'Import details for existing species.',
      '#description' => t('The NCBI Taxonomic Importer examines the organisms
        currently present in the database and queries NCBI for the
        taxonomic details. If the importer is able to match the
        genus and species with NCBI the species details will be imported,
        and a page containing the taxonomic tree will be created.'),
      '#default_value' => 1,
    ];

    return $form;
  }
  153. /**
  154. * @see TripalImporter::formValidate()
  155. */
  public function formValidate($form, &$form_state) {
    // Validates the "taxonomy_ids" textarea: every whitespace-separated
    // token must consist of digits only. An empty field is accepted.
    $taxonomy_ids = trim((string) $form_state['values']['taxonomy_ids']);
    if ($taxonomy_ids === '') {
      return;
    }

    // PREG_SPLIT_NO_EMPTY keeps stray leading/trailing or repeated
    // whitespace (e.g. a trailing newline in the textarea) from producing
    // empty tokens that would be reported as invalid IDs.
    $tax_ids = preg_split('/\s+/', $taxonomy_ids, -1, PREG_SPLIT_NO_EMPTY);

    $bad_ids = [];
    foreach ($tax_ids as $tax_id) {
      if (!preg_match('/^\d+$/', $tax_id)) {
        $bad_ids[] = $tax_id;
      }
    }

    if (count($bad_ids) > 0) {
      form_set_error('taxonomy_ids',
        t('Taxonomy IDs must be numeric. The following are not valid: "@ids".',
          ['@ids' => implode('", "', $bad_ids)]));
    }
  }
  177. /**
  178. * Performs the import.
  179. */
  public function run() {
    // Entry point of the import job. Steps, in order:
    //   1. load every organism currently in Chado into $this->all_orgs,
    //   2. initialise the phylotree record and rebuild the in-memory tree,
    //   3. delete the tree's existing phylonodes,
    //   4. import the user-supplied taxonomy IDs (if any),
    //   5. refresh existing organisms (if requested),
    //   6. hand the assembled tree to chado_phylogeny_import_tree().
    $arguments = $this->arguments['run_args'];
    $taxonomy_ids = $arguments['taxonomy_ids'];
    $import_existing = $arguments['import_existing'];

    // Get the list of all organisms as we'll need this to lookup existing
    // organisms. The organism type is only joined in when the version check
    // below passes; otherwise an empty string is selected in its place.
    if (chado_get_version() > 1.2) {
      $sql = "
        SELECT O.*, CVT.name as type
        FROM {organism} O
          LEFT JOIN {cvterm} CVT ON CVT.cvterm_id = O.type_id
        ORDER BY O.genus, O.species
      ";
    }
    else {
      $sql = "
        SELECT O.*, '' as type
        FROM {organism} O
        ORDER BY O.genus, O.species
      ";
    }
    $results = chado_query($sql);
    while ($item = $results->fetchObject()) {
      $this->all_orgs[] = $item;
    }

    // Get the phylotree object.
    $this->logMessage('Initializing Tree...');
    $this->phylotree = $this->initTree();
    $this->logMessage('Rebuilding Tree...');
    $this->tree = $this->rebuildTree();

    // Clean out the phylonodes for this tree in the event this is a reload.
    // This must happen after rebuildTree(), which reads those records.
    chado_delete_record('phylonode', ['phylotree_id' => $this->phylotree->phylotree_id]);

    // Get the taxonomy IDs provided by the user (if any). Empty tokens
    // caused by leading/trailing whitespace are dropped so that no empty ID
    // is sent to NCBI or counted towards the progress total.
    $tax_ids = [];
    if ($taxonomy_ids) {
      $tax_ids = preg_split('/\s+/', trim($taxonomy_ids), -1, PREG_SPLIT_NO_EMPTY);
    }

    // Set the number of items to handle.
    if ($tax_ids or $import_existing) {
      $total_items = count($tax_ids);
      if ($import_existing) {
        $total_items += count($this->all_orgs);
      }
      $this->setTotalItems($total_items);
    }
    $this->setItemsHandled(0);

    // If the user wants to import new taxonomy IDs then do that.
    if ($tax_ids) {
      $this->logMessage('Importing Taxonomy IDs...');

      // Minimum time per request in microseconds: roughly 3 requests per
      // second without an API key, 10 per second with one.
      $api_key = variable_get('tripal_taxon_importer_ncbi_api_key', NULL);
      $sleep_time = empty($api_key) ? 333334 : 100000;

      foreach ($tax_ids as $tax_id) {
        $start = microtime(TRUE);
        $result = $this->importRecord($tax_id);
        // Only addItemsHandled if the importRecord was a success.
        if ($result) {
          $this->addItemsHandled(1);
        }
        // Sleep for whatever is left of this request's time slot.
        $remaining_sleep = $sleep_time - ((int) (1e6 * (microtime(TRUE) - $start)));
        if ($remaining_sleep > 0) {
          usleep($remaining_sleep);
        }
      }
    }

    // If the user wants to update existing records then do that.
    if ($import_existing) {
      $this->logMessage('Updating Existing...');
      $this->updateExisting();
    }

    // Now import the tree.
    $options = ['taxonomy' => 1];
    chado_phylogeny_import_tree($this->tree, $this->phylotree, $options);
  }
  260. /**
  261. * Create the taxonomic tree in Chado.
  262. *
  263. * If the tree already exists it will not be recreated.
  264. *
  265. * @throws Exception
  266. * @return
  267. * Returns the phylotree object.
  268. */
  private function initTree() {
    // The tree is identified by name: the site name immediately followed by
    // the literal 'Taxonomy Tree'.
    $tree_name = variable_get('site_name') . 'Taxonomy Tree';

    // Reuse the phylotree record when one with that name already exists.
    $existing = chado_select_record('phylotree', ['*'], ['name' => $tree_name]);
    if (count($existing) != 0) {
      return $existing[0];
    }

    // Otherwise add the taxonomic tree.
    $record = [
      'name' => $tree_name,
      'description' => 'A phylogenetic tree based on taxonomic rank.',
      'leaf_type' => 'taxonomy',
      'tree_file' => '/dev/null',
      'format' => 'taxonomy',
      'no_load' => TRUE,
    ];
    $errors = [];
    $warnings = [];
    if (!tripal_insert_phylotree($record, $errors, $warnings)) {
      throw new Exception("Cannot add the Taxonomy Tree record.");
    }

    // NOTE(review): callers read ->phylotree_id from the returned object, so
    // tripal_insert_phylotree() presumably adds it to $record by reference;
    // confirm against the Tripal API.
    return (object) $record;
  }
  298. /**
  299. * Iterates through all existing organisms and rebuilds the taxonomy tree.
  300. *
  301. * The phylotree API doesn't support adding nodes to existing trees only
  302. * importing whole trees. So, we must rebuild the tree using the current
  303. * organisms and then we can add to it.
  304. *
  305. */
  private function rebuildTree() {
    // Rebuilds the nested tree array from the organisms in $this->all_orgs.
    // For each organism that has a stored 'lineage' property, one internal
    // node is added per lineage level, followed by a leaf node for the
    // organism itself. Returns the tree array (the root node with nested
    // 'branch_set' arrays).

    // Per-label caches so each lineage label is only looked up once:
    // label => phylonode record (NULL when the tree has no such node) and
    // label => rank value stored for that phylonode.
    $lineage_nodes = [];
    $lineage_ranks = [];

    // Get the "rank" cvterm. It requires that the TAXRANK vocabulary is loaded.
    $rank_cvterm = chado_get_cvterm([
      'name' => 'rank',
      'cv_id' => ['name' => 'local'],
    ]);

    // The taxonomic tree must have a root, so create that first.
    $tree = [
      'name' => 'root',
      'depth' => 0,
      'is_root' => 1,
      'is_leaf' => 0,
      'is_internal' => 0,
      'left_index' => 0,
      'right_index' => 0,
      'branch_set' => [],
    ];

    $sql = "
      SELECT P.*
      FROM {phylonode} P
        INNER JOIN {phylonode_organism} PO on PO.phylonode_id = P.phylonode_id
      WHERE P.phylotree_id = :phylotree_id AND PO.organism_id = :organism_id
    ";

    foreach ($this->all_orgs as $organism) {
      // Skip the organism when the phylonode query itself fails. The rows
      // are not used; only the success of the query is checked.
      $args = [
        ':phylotree_id' => $this->phylotree->phylotree_id,
        ':organism_id' => $organism->organism_id,
      ];
      $result = chado_query($sql, $args);
      if (!$result) {
        continue;
      }

      // Next get the lineage for this organism.
      $lineage = $this->getProperty($organism->organism_id, 'lineage');
      if (!$lineage) {
        continue;
      }
      $lineage_depth = preg_split('/;\s*/', $lineage->value);

      // Now rebuild the tree by first creating the nodes for the full
      // lineage and then adding the organism as a leaf node.
      $parent_name = $tree['name'];
      $i = 1;
      $lineage_good = TRUE;
      foreach ($lineage_depth as $child) {
        // Look the label up once and remember the outcome, including its
        // rank, so cached labels never reuse another node's rank.
        if (!array_key_exists($child, $lineage_nodes)) {
          $found = chado_select_record('phylonode', ['*'], [
            'phylotree_id' => $this->phylotree->phylotree_id,
            'label' => $child,
          ]);
          if (!is_array($found) or count($found) == 0) {
            $lineage_nodes[$child] = NULL;
          }
          else {
            $lineage_nodes[$child] = $found[0];
            $props = chado_select_record('phylonodeprop', ['*'], [
              'phylonode_id' => $found[0]->phylonode_id,
              'type_id' => $rank_cvterm->cvterm_id,
            ]);
            $has_rank = is_array($props) and count($props) > 0;
            $lineage_ranks[$child] = $has_rank ? $props[0]->value : NULL;
          }
        }

        // A lineage level without a phylonode marks the lineage as bad. The
        // level is skipped (depth is not advanced) and the leaf is not added.
        if (!$lineage_nodes[$child]) {
          $lineage_good = FALSE;
          continue;
        }

        $node = [
          'name' => $child,
          'depth' => $i,
          'is_root' => 0,
          'is_leaf' => 0,
          'is_internal' => 1,
          'left_index' => 0,
          'right_index' => 0,
          'branch_set' => [],
          'parent' => $parent_name,
          'properties' => [
            $rank_cvterm->cvterm_id => $lineage_ranks[$child],
          ],
        ];
        $parent_name = $child;
        $this->addTaxonomyNode($tree, $node, $lineage_depth);
        $i++;
      }

      // If the lineage could not be fully resolved then skip adding the
      // leaf node below.
      if (!$lineage_good) {
        continue;
      }

      // The leaf's rank defaults to 'species' unless the organism record
      // carries a type.
      $rank_type = 'species';
      if (property_exists($organism, 'type_id') and $organism->type_id) {
        $rank_type = $organism->type;
      }

      // Now add in the leaf node.
      $node = [
        'name' => chado_get_organism_scientific_name($organism),
        'depth' => $i,
        'is_root' => 0,
        'is_leaf' => 1,
        'is_internal' => 0,
        'left_index' => 0,
        'right_index' => 0,
        'parent' => $parent_name,
        'organism_id' => $organism->organism_id,
        'properties' => [
          $rank_cvterm->cvterm_id => $rank_type,
        ],
      ];
      $this->addTaxonomyNode($tree, $node, $lineage_depth);

      // Set the indices for the tree.
      chado_assign_phylogeny_tree_indices($tree);
    }

    return $tree;
  }
  441. /**
  442. * Imports details from NCBI Taxonomy for organisms that already exist.
  443. */
  private function updateExisting() {
    // For every organism in $this->all_orgs that was not added during this
    // run, searches NCBI Taxonomy by scientific name and, when a taxonomy ID
    // is returned, imports its details via importRecord(). Requests are
    // throttled to stay within NCBI's rate limits.
    $api_key = variable_get('tripal_taxon_importer_ncbi_api_key', NULL);

    // Minimum time per request in microseconds: roughly 3 requests per
    // second without an API key, 10 per second with one.
    $sleep_time = empty($api_key) ? 333334 : 100000;

    foreach ($this->all_orgs as $organism) {
      // If the organism record is marked as new then let's skip it because
      // it was newly added and should have the updated information already.
      // Organisms loaded from the database have no is_new property at all,
      // hence empty() rather than a direct read.
      if (!empty($organism->is_new)) {
        continue;
      }

      // TODO: we should check if the organism already has a taxonomy ID.
      // if so we should use that instead of the scientific name.
      $start = microtime(TRUE);

      // Build the query string to get the information about this species.
      $sci_name = chado_get_organism_scientific_name($organism);
      $search_url = "https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" .
        "db=taxonomy" .
        "&term=" . urlencode($sci_name);
      if (!empty($api_key)) {
        $search_url .= "&api_key=" . urlencode($api_key);
      }

      // Get the search response from NCBI.
      $rfh = fopen($search_url, "r");
      if (!$rfh) {
        $this->logMessage("Could not look up !sci_name", ['!sci_name' => $sci_name], TRIPAL_WARNING);
        continue;
      }
      $xml_text = stream_get_contents($rfh);
      fclose($rfh);

      $remaining_sleep = $sleep_time - ((int) (1e6 * (microtime(TRUE) - $start)));
      if ($remaining_sleep > 0) {
        usleep($remaining_sleep);
      }

      // Parse the XML to get the taxonomy ID. SimpleXMLElement throws on
      // malformed input; one bad response should not abort the whole run.
      $result = FALSE;
      $start = microtime(TRUE);
      try {
        $xml = new SimpleXMLElement((string) $xml_text);
      }
      catch (Exception $e) {
        $xml = FALSE;
        $this->logMessage("Could not look up !sci_name", ['!sci_name' => $sci_name], TRIPAL_WARNING);
      }
      if ($xml) {
        $taxid = (string) $xml->IdList->Id;
        if ($taxid) {
          $result = $this->importRecord($taxid, $organism);
        }
      }
      if ($result) {
        $this->addItemsHandled(1);
      }

      // importRecord() issues its own request, so throttle again.
      $remaining_sleep = $sleep_time - ((int) (1e6 * (microtime(TRUE) - $start)));
      if ($remaining_sleep > 0) {
        usleep($remaining_sleep);
      }
    }
  }
  503. /**
  504. * Checks the Chado database to see if the organism already exists.
  505. *
  506. * @param $taxid
  507. * The taxonomic ID for the organism.
  508. * @param $sci_name
  509. * The scientific name for the organism as returned by NCBI
  510. */
  private function findOrganism($taxid, $sci_name) {
    // Tries three lookups in turn and returns the first organism found:
    //   1. via an 'NCBITaxon' dbxref whose accession is $taxid,
    //   2. via an exact match of "genus species" against $sci_name,
    //   3. via the scientific name of each organism in $this->all_orgs.
    $organism = NULL;

    // Lookup 1: taxonomy ID -> dbxref -> organism_dbxref -> organism.
    $xref_rows = chado_select_record('dbxref', ['dbxref_id'], [
      'db_id' => [
        'name' => 'NCBITaxon',
      ],
      'accession' => $taxid,
    ]);
    if (count($xref_rows) > 0) {
      $link_rows = chado_select_record('organism_dbxref', ['organism_id'], [
        'dbxref_id' => $xref_rows[0]->dbxref_id,
      ]);
      if (count($link_rows) > 0) {
        $organism = chado_select_record('organism', ['*'], [
          'organism_id' => $link_rows[0]->organism_id,
        ]);
        if (count($organism) > 0) {
          $organism = $organism[0];
        }
      }
    }
    if ($organism) {
      return $organism;
    }

    // Lookup 2: the whole name (including any infraspecific part) stored in
    // the genus and species columns, as older Chado versions did.
    $name_sql = "
      SELECT organism_id
      FROM {organism}
      WHERE concat(genus, ' ', species) = :sci_name
    ";
    $name_rows = chado_query($name_sql, [':sci_name' => $sci_name]);
    $hit = $name_rows->fetchObject();
    if ($hit) {
      $organism = chado_select_record('organism', ['*'], [
        'organism_id' => $hit->organism_id,
      ]);
      if (count($organism) > 0) {
        $organism = $organism[0];
      }
    }
    if ($organism) {
      return $organism;
    }

    // Lookup 3: compare against the full scientific name of every organism
    // already loaded. The loop does not stop early, so the last match wins.
    foreach ($this->all_orgs as $candidate) {
      if ($sci_name == chado_get_organism_scientific_name($candidate)) {
        $organism = $candidate;
      }
    }

    return $organism;
  }
  571. /**
  572. * Adds a new organism record to Chado.
  573. *
  574. * @param $sci_name
  575. * The scientific name as provided by NCBI Taxonomy.
  576. * @param $rank
  577. * The rank of the organism as provided by NCBI Taxonomy.
  578. */
  private function addOrganism($sci_name, $rank) {
    // Inserts a new organism record built from $sci_name and appends it to
    // $this->all_orgs flagged with is_new = TRUE.
    //
    // Returns the new organism object, or NULL when the name has fewer than
    // two whitespace-separated parts or the insert fails.
    $matches = [];
    $has_infra = FALSE;

    // Check if the scientific name has an infraspecific part or is just
    // a species name.
    if (preg_match('/^(.+?)\s+(.+?)\s+(.+)$/', $sci_name, $matches)) {
      $has_infra = TRUE;
      $genus = $matches[1];
      $species = $matches[2];
      $infra = $matches[3];

      // Get the CV term for the rank.
      $type = chado_get_cvterm([
        'name' => preg_replace('/ /', '_', $rank),
        'cv_id' => ['name' => 'taxonomic_rank'],
      ]);

      // Remove the rank abbreviation from the infraspecific name. The
      // abbreviation is matched literally: used as a regular expression, the
      // '.' in e.g. "f." would also strip unrelated characters
      // ("f. fastigiata" would become "stigiata").
      $abbrev = (string) chado_abbreviate_infraspecific_rank($rank);
      if ($abbrev !== '') {
        $infra = str_replace($abbrev, '', $infra);
      }
      $infra = trim($infra);

      $values = [
        'genus' => $genus,
        'species' => $species,
        'abbreviation' => $genus[0] . '. ' . $species,
        // The rank may have no matching term; store NULL in that case.
        'type_id' => $type ? $type->cvterm_id : NULL,
        'infraspecific_name' => $infra,
      ];
    }
    elseif (preg_match('/^(.+?)\s+(.+?)$/', $sci_name, $matches)) {
      $genus = $matches[1];
      $species = $matches[2];
      $values = [
        'genus' => $genus,
        'species' => $species,
        'abbreviation' => $genus[0] . '. ' . $species,
      ];
    }
    else {
      // Not enough name parts to build an organism record.
      return NULL;
    }

    // A failed insert must not be cast to an object: (object) FALSE is
    // truthy and would be returned as a valid organism without an ID.
    $record = chado_insert_record('organism', $values);
    if (!$record) {
      return NULL;
    }

    $organism = (object) $record;
    if ($has_infra) {
      $organism->type = $rank;
    }
    $organism->is_new = TRUE;
    $this->all_orgs[] = $organism;

    return $organism;
  }
  632. /**
  633. * Imports an organism from the NCBI taxonomy DB by its taxonomy ID
  634. *
  635. * @param $taxid
  636. * The NCBI Taxonomy ID.
  637. * @param $organism
  638. * The organism object to which this taxonomy belongs. If the organism
  639. * is NULL then it will be created.
  640. */
  641. private function importRecord($taxid, $organism = NULL) {
  642. $adds_organism = $organism ? FALSE : TRUE;
  643. // Get the "rank" cvterm. It requires that the TAXRANK vocabulary is loaded.
  644. $rank_cvterm = chado_get_cvterm([
  645. 'name' => 'rank',
  646. 'cv_id' => ['name' => 'local'],
  647. ]);
  648. // Get the details for this taxonomy.
  649. $fetch_url = "https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" .
  650. "db=taxonomy" .
  651. "&id=$taxid";
  652. $api_key = variable_get('tripal_taxon_importer_ncbi_api_key', NULL);
  653. if (!empty($api_key)) {
  654. $fetch_url .= "&api_key=" . $api_key;
  655. }
  656. // Get the search response from NCBI.
  657. $xml = FALSE;
  658. $rfh = fopen($fetch_url, "r");
  659. if ($rfh) {
  660. $xml_text = '';
  661. while (!feof($rfh)) {
  662. $xml_text .= fread($rfh, 255);
  663. }
  664. fclose($rfh);
  665. $xml = new SimpleXMLElement($xml_text);
  666. }
  667. if ($xml) {
  668. $taxon = $xml->Taxon;
  669. // Get the genus and species from the xml.
  670. $parent = (string) $taxon->ParentTaxId;
  671. $rank = (string) $taxon->Rank;
  672. $sci_name = (string) $taxon->ScientificName;
  673. //$this->logMessage(' - Importing @sci_name', array('@sci_name' => $sci_name));
  674. // If we don't have an organism record provided then see if there
  675. // is one provided by Chado, if not, the try to add one.
  676. if (!$organism) {
  677. $organism = $this->findOrganism($taxid, $sci_name);
  678. if (!$organism) {
  679. $organism = $this->addOrganism($sci_name, $rank);
  680. if (!$organism) {
  681. throw new Exception(t('Cannot add organism: @sci_name', ['@sci_name' => $sci_name]));
  682. }
  683. }
  684. }
  685. // Associate the Dbxref with the organism.
  686. $this->addDbxref($organism->organism_id, $taxid);
  687. // Get properties for this organism.
  688. $lineage = (string) $taxon->Lineage;
  689. $genetic_code = (string) $taxon->GeneticCode->GCId;
  690. $genetic_code_name = (string) $taxon->GeneticCode->GCName;
  691. $mito_genetic_code = (string) $taxon->MitoGeneticCode->MGCId;
  692. $mito_genetic_code_name = (string) $taxon->MitoGeneticCode->MGCName;
  693. $division = (string) $taxon->Division;
  694. // Add in the organism properties.
  695. $this->addProperty($organism->organism_id, 'division', $division);
  696. $this->addProperty($organism->organism_id, 'mitochondrial_genetic_code_name', $mito_genetic_code_name);
  697. $this->addProperty($organism->organism_id, 'mitochondrial_genetic_code', $mito_genetic_code);
  698. $this->addProperty($organism->organism_id, 'genetic_code_name', $genetic_code_name);
  699. $this->addProperty($organism->organism_id, 'lineage', $lineage);
  700. $this->addProperty($organism->organism_id, 'genetic_code', $genetic_code);
  701. $name_ranks = [];
  702. if ($taxon->OtherNames->children) {
  703. foreach ($taxon->OtherNames->children() as $child) {
  704. $type = $child->getName();
  705. $name = (string) $child;
  706. if (!array_key_exists($type, $name_ranks)) {
  707. $name_ranks[$type] = 0;
  708. }
  709. switch ($type) {
  710. case 'GenbankCommonName':
  711. $this->addProperty($organism->organism_id, 'genbank_common_name', $name, $name_ranks[$type]);
  712. break;
  713. case 'Synonym':
  714. case 'GenbankSynonym':
  715. $this->addProperty($organism->organism_id, 'synonym', $name, $name_ranks[$type]);
  716. break;
  717. case 'CommonName':
  718. // If we had to add the organism then include the commone name too.
  719. if ($adds_organism) {
  720. $organism->common_name = $name;
  721. $values = ['organism_id' => $organism->id];
  722. chado_update_record('organism', $values, $organism);
  723. }
  724. case 'Includes':
  725. $this->addProperty($organism->organism_id, 'other_name', $name, $name_ranks[$type]);
  726. break;
  727. case 'EquivalentName':
  728. $this->addProperty($organism->organism_id, 'equivalent_name', $name, $name_ranks[$type]);
  729. break;
  730. case 'Anamorph':
  731. $this->addProperty($organism->organism_id, 'anamorph', $name, $name_ranks[$type]);
  732. break;
  733. case 'Name':
  734. // skip the Name stanza
  735. break;
  736. default:
  737. print "NOTICE: Skipping unrecognzed name type: $type\n";
  738. // do nothing for unrecognized types
  739. }
  740. $name_ranks[$type]++;
  741. }
  742. }
  743. // Generate a nested array structure that can be used for importing the tree.
  744. $lineage_depth = preg_split('/;\s*/', $lineage);
  745. $parent = $this->tree;
  746. $i = 1;
  747. foreach ($taxon->LineageEx->children() as $child) {
  748. $tid = (string) $child->TaxID;
  749. $name = (string) $child->ScientificName;
  750. $node_rank = (string) $child->Rank;
  751. $node = [
  752. 'name' => $name,
  753. 'depth' => $i,
  754. 'is_root' => 0,
  755. 'is_leaf' => 0,
  756. 'is_internal' => 1,
  757. 'left_index' => 0,
  758. 'right_index' => 0,
  759. 'parent' => $parent,
  760. 'branch_set' => [],
  761. 'parent' => $parent['name'],
  762. 'properties' => [
  763. $rank_cvterm->cvterm_id => $node_rank,
  764. ],
  765. ];
  766. $parent = $node;
  767. $this->addTaxonomyNode($this->tree, $node, $lineage_depth);
  768. $i++;
  769. }
  770. // Now add in the leaf node
  771. $node = [
  772. 'name' => $sci_name,
  773. 'depth' => $i,
  774. 'is_root' => 0,
  775. 'is_leaf' => 1,
  776. 'is_internal' => 0,
  777. 'left_index' => 0,
  778. 'right_index' => 0,
  779. 'parent' => $parent['name'],
  780. 'organism_id' => $organism->organism_id,
  781. 'properties' => [
  782. $rank_cvterm->cvterm_id => $rank,
  783. ],
  784. ];
  785. $this->addTaxonomyNode($this->tree, $node, $lineage_depth);
  786. // Set the indices for the tree.
  787. chado_assign_phylogeny_tree_indices($this->tree);
  788. return TRUE;
  789. }
  790. return FALSE;
  791. }
  /**
   * Places a node into the nested taxonomy tree at the depth it declares.
   *
   * Starting at the root's branch set, the tree is descended one level at a
   * time by following the lineage names. The node is appended to the last
   * branch set reached. If a node with the same name and the same depth is
   * encountered on the way down, the node is considered to be present already
   * and nothing is added.
   *
   * @param $tree
   *   The tree array, passed by reference. Its 'branch_set' element holds the
   *   child nodes, and each child may hold a 'branch_set' of its own.
   * @param $node
   *   The node to add. It must contain a 'name' and a 'depth' element, where
   *   a depth of 1 means directly below the root.
   * @param $lineage_depth
   *   The lineage names ordered from the root downwards. Element $i - 1 is the
   *   name expected at depth $i.
   */
  private function addTaxonomyNode(&$tree, $node, $lineage_depth) {
    // Treat a missing branch set as empty so that it can be iterated safely.
    if (!isset($tree['branch_set']) or !is_array($tree['branch_set'])) {
      $tree['branch_set'] = [];
    }

    // Get the branch set for the tree root.
    $branch_set = &$tree['branch_set'];

    // Iterate through the tree up until the depth where this node will
    // be placed.
    $node_depth = $node['depth'];
    for ($i = 1; $i <= $node_depth; $i++) {
      // The lineage may be shorter than the node depth. In that case there is
      // no name to follow at this level.
      $lineage_name = isset($lineage_depth[$i - 1]) ? $lineage_depth[$i - 1] : NULL;

      // Iterating by value walks a snapshot of the current branch set, so
      // re-binding the $branch_set reference below is safe.
      foreach ($branch_set as $j => $existing) {
        // If this node already exists in the tree then return. The depth must
        // be compared (not assigned): an assignment would overwrite the depth
        // of the existing node and report a duplicate for any same-named node.
        if ($existing['name'] == $node['name'] and
            $existing['depth'] == $node['depth']) {
          return;
        }

        // Otherwise, if this is the lineage member for the current depth then
        // descend into its branch set and continue with the next depth.
        if ($lineage_name !== NULL and $existing['name'] == $lineage_name) {
          if (!isset($branch_set[$j]['branch_set']) or !is_array($branch_set[$j]['branch_set'])) {
            $branch_set[$j]['branch_set'] = [];
          }
          $branch_set = &$branch_set[$j]['branch_set'];
          break;
        }
      }
    }

    // Add the node to the last branch set. This should be where this node goes.
    $branch_set[] = $node;
  }
  /**
   * Retrieves a property for a given organism.
   *
   * @param $organism_id
   *   The ID of the organism whose property is retrieved.
   * @param $term_name
   *   The name of the organism property term. This term must be
   *   present in the 'organism_property' cv.
   * @param $rank
   *   The order for this property. The first instance of this term for
   *   this organism should be zero. Defaults to zero.
   *
   * @return
   *   The property object, as returned by chado_get_property().
   */
  private function getProperty($organism_id, $term_name, $rank = 0) {
    $record = [
      'table' => 'organism',
      'id' => $organism_id,
    ];

    // The property is identified by its term and rank only. This method has
    // no value to match on, so no 'value' element is passed (the variable
    // formerly used for it was never defined here).
    $property = [
      'type_name' => $term_name,
      'cv_name' => 'organism_property',
      'rank' => $rank,
    ];

    return chado_get_property($record, $property);
  }
  /**
   * Stores a property value for an organism.
   *
   * Empty values are ignored. When the rank is zero,
   * chado_delete_property() is called with the same record and property
   * before the new value is inserted. The rank itself is not passed along to
   * chado_insert_property().
   *
   * @param $organism_id
   *   The ID of the organism that receives the property.
   * @param $term_name
   *   The name of the organism property term. This term must be
   *   present in the 'organism_property' cv.
   * @param $value
   *   The value of the property.
   * @param $rank
   *   The order for this property. The first instance of this term for
   *   this organism should be zero. Defaults to zero.
   */
  private function addProperty($organism_id, $term_name, $value, $rank = 0) {
    // Only non-empty values are stored.
    if ($value) {
      $organism_record = ['table' => 'organism', 'id' => $organism_id];
      $organism_property = [
        'type_name' => $term_name,
        'cv_name' => 'organism_property',
        'value' => $value,
      ];

      // A rank of zero marks the first instance of this term, so clear out
      // what is already stored before adding the new value.
      if ($rank == 0) {
        chado_delete_property($organism_record, $organism_property);
      }

      chado_insert_property($organism_record, $organism_property);
    }
  }
  /**
   * Links an organism to its NCBI taxonomy ID through a database reference.
   *
   * A dbxref for the given taxonomy ID is inserted in the 'NCBITaxon'
   * database via chado_insert_dbxref(). The organism is then linked to it in
   * the organism_dbxref table unless that link is already present.
   *
   * @param $organism_id
   *   The ID of the organism to link.
   * @param $taxId
   *   The NCBI taxonomy ID. It is used as the accession of the dbxref.
   */
  private function addDbxref($organism_id, $taxId) {
    $ncbi_db = chado_get_db(['name' => 'NCBITaxon']);
    $taxon_dbxref = chado_insert_dbxref([
      'db_id' => $ncbi_db->db_id,
      'accession' => $taxId,
    ]);

    $link = [
      'dbxref_id' => $taxon_dbxref->dbxref_id,
      'organism_id' => $organism_id,
    ];

    // Nothing more to do when the organism is already linked to this dbxref.
    $existing = chado_select_record('organism_dbxref', ['organism_dbxref_id'], $link);
    if ($existing) {
      return;
    }

    chado_insert_record('organism_dbxref', $link);
  }
  903. }
  /**
   * Ajax callback for the NCBI API key field of TaxonomyImporter::form().
   *
   * It runs when the user changes the NCBI API key field and then moves the
   * cursor out of it. The submitted key is passed through check_plain(),
   * saved in the 'tripal_taxon_importer_ncbi_api_key' variable and a
   * confirmation message is set.
   *
   * @param $form
   *   The form array. Its 'ncbi_api_key' element is returned.
   * @param $form_state
   *   The form state. The key is read from its 'values' element.
   *
   * @return array
   *   The api key field.
   */
  function tripal_taxon_importer_set_ncbi_api_key($form, $form_state) {
    // Sanitize the submitted key before it is stored.
    $submitted_key = $form_state['values']['ncbi_api_key'];
    $clean_key = check_plain($submitted_key);
    variable_set('tripal_taxon_importer_ncbi_api_key', $clean_key);

    drupal_set_message('NCBI API key has been saved successfully!');

    // Hand the field back so that it is re-rendered in place.
    $api_key_field = $form['ncbi_api_key'];
    return $api_key_field;
  }