GFF3ImporterTest.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. <?php
  2. namespace Tests;
  3. use StatonLab\TripalTestSuite\DBTransaction;
  4. use StatonLab\TripalTestSuite\TripalTestCase;
  5. class GFF3ImporterTest extends TripalTestCase {
  6. use DBTransaction;
  /**
   * Smoke test for the GFF3 importer.
   *
   * Loads landmark sequences, imports a small GFF3 file fetched over HTTP and
   * checks that one known exon can be found in chado.feature by uniquename.
   *
   * @group gff
   */
  public function testGFFImporter() {
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();
    $gff_file = [
      'file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff',
    ];

    $run_args = [
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names (unused here).
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional importer arguments, all left unset.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($run_args, $gff_file);

    // Look the exon up by its uniquename and compare the stored value.
    $expected_uniquename = 'FRAEX38873_v2_000000110.2.exon4';
    $select = db_select('chado.feature', 'f');
    $select->fields('f', ['uniquename']);
    $select->condition('f.uniquename', $expected_uniquename);
    $stored_uniquename = $select->execute()->fetchField();

    $this->assertEquals($expected_uniquename, $stored_uniquename);
  }
  /**
   * Imports the local small_gene.gff fixture and caches lookup values.
   *
   * Shared set-up for the testGFFImporterAttribute*() methods: it loads
   * short_scaffold.fasta as landmarks, runs the GFF loader on small_gene.gff,
   * and stores the organism, the analysis, three 'sequence' cvterm ids and the
   * uniquenames of the fixture features on $this.
   */
  private function initGFFImporterAttributes() {
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $fixture_dir = __DIR__ . '/../data/';
    $gff = ['file_local' => $fixture_dir . 'small_gene.gff'];
    $fasta = ['file_local' => $fixture_dir . 'short_scaffold.fasta'];

    $run_args = [
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names (unused here).
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional importer arguments; only the target organism is set.
      'target_organism_id' => $organism->organism_id,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism, $fasta);
    $this->runGFFLoader($run_args, $gff);

    $this->organism = $organism;
    $this->analysis = $analysis;

    // Resolve the 'sequence' vocabulary terms the attribute tests filter on.
    $sequence_terms = [
      'gene_cvt' => 'gene',
      'mrna_cvt' => 'mRNA',
      'supercontig_cvt' => 'supercontig',
    ];
    foreach ($sequence_terms as $property => $term_name) {
      $this->{$property} = chado_get_cvterm([
        'name' => $term_name,
        'cv_id' => ['name' => 'sequence'],
      ])->cvterm_id;
    }

    // Uniquenames the attribute tests look up.
    $this->gene_1_uname = 'test_gene_001';
    $this->gene_2_uname = 'test_gene_002';
    $this->scaffold_1_uname = 'scaffold1';
  }
  /**
   * Checks the chado.feature record created for the first fixture gene.
   *
   * @group gff
   */
  public function testGFFImporterAttributeFeature() {
    $this->initGFFImporterAttributes();

    // Fetch the gene by uniquename and type.
    $select = db_select('chado.feature', 'f');
    $select->fields('f');
    $select->condition('uniquename', $this->gene_1_uname);
    $select->condition('type_id', $this->gene_cvt);
    $feature = $select->execute()->fetchObject();

    $this->assertEquals('test_gene_001', $feature->uniquename);
    $this->assertEquals('test_gene_001', $feature->name);
    $this->assertEquals($this->organism->organism_id, $feature->organism_id);
    $this->assertEquals($this->gene_cvt, $feature->type_id);
  }
  /**
   * Checks that the alias of the first fixture gene is stored as a synonym.
   *
   * @group gff
   */
  public function testGFFImporterAttributeAlias() {
    $this->initGFFImporterAttributes();

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    // join() returns the table alias rather than the query, so it cannot be
    // part of the fluent chain.
    $synonym_select = db_select('chado.feature_synonym', 'fs');
    $synonym_select->join('chado.synonym', 's', 's.synonym_id = fs.synonym_id');
    $synonym = $synonym_select
      ->fields('s')
      ->condition('fs.feature_id', $gene->feature_id)
      ->execute()
      ->fetchObject();

    $this->assertEquals('first_test_gene', $synonym->name);
  }
  /**
   * Checks the dbxref records linked to the first fixture gene.
   *
   * Two cross references are expected: one in the TEST_DB database and one in
   * the GFF_source database whose accession is the gene's uniquename.
   *
   * @group gff
   */
  public function testGFFImporterAttributeDbxref() {
    $this->initGFFImporterAttributes();

    $test_db = chado_get_db(['name' => 'TEST_DB']);
    $gff_db = chado_get_db(['name' => 'GFF_source']);

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    // Query shared by both lookups: every dbxref attached to the gene.
    $base_select = db_select('chado.feature_dbxref', 'fdbx');
    $base_select->join('chado.dbxref', 'dbx', 'dbx.dbxref_id = fdbx.dbxref_id');
    $base_select->fields('dbx');
    $base_select->condition('fdbx.feature_id', $gene->feature_id);

    // Restrict one copy of the shared query to each database.
    $test_db_select = clone $base_select;
    $test_db_select->condition('dbx.db_id', $test_db->db_id);
    $test_db_xref = $test_db_select->execute()->fetchObject();

    $gff_db_select = clone $base_select;
    $gff_db_select->condition('dbx.db_id', $gff_db->db_id);
    $gff_db_xref = $gff_db_select->execute()->fetchObject();

    $this->assertEquals('test_gene_dbx_001', $test_db_xref->accession);
    $this->assertEquals($this->gene_1_uname, $gff_db_xref->accession);
  }
  /**
   * Checks that the ontology term of the first fixture gene is linked to it.
   *
   * @group gff
   */
  public function testGFFImporterAttributeOntology() {
    $this->initGFFImporterAttributes();

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    // Resolve the term through its database name and accession.
    $term = chado_get_cvterm([
      'dbxref_id' => [
        'accession' => '0000704',
        'db_id' => ['name' => 'SO'],
      ],
    ]);

    $link_select = db_select('chado.feature_cvterm', 'fcvt');
    $link_select->fields('fcvt');
    $link_select->condition('cvterm_id', $term->cvterm_id);
    $link_select->condition('feature_id', $gene->feature_id);
    $links = $link_select->execute();

    // Exactly one feature_cvterm row must connect the gene to the term.
    $this->assertEquals(1, $links->rowCount());
  }
  /**
   * Checks the part_of relationship between the fixture mRNA and its gene.
   *
   * @group gff
   */
  public function testGFFImporterAttributeParent() {
    $this->initGFFImporterAttributes();

    $part_of_id = chado_get_cvterm([
      'name' => 'part_of',
      'cv_id' => ['name' => 'sequence'],
    ])->cvterm_id;

    $mrna_select = db_select('chado.feature', 'f');
    $mrna_select->fields('f');
    $mrna_select->condition('uniquename', 'test_mrna_001.1');
    $mrna_select->condition('type_id', $this->mrna_cvt);
    $mrna = $mrna_select->execute()->fetchObject();

    // Follow the relationship from the mRNA (subject) to its object feature.
    $parent_select = db_select('chado.feature_relationship', 'fr');
    $parent_select->join('chado.feature', 'f', 'f.feature_id = fr.object_id');
    $parent = $parent_select
      ->fields('f')
      ->condition('fr.subject_id', $mrna->feature_id)
      ->condition('fr.type_id', $part_of_id)
      ->execute()
      ->fetchObject();

    $this->assertEquals('test_gene_001', $parent->uniquename);
    $this->assertEquals('test_gene_001', $parent->name);
    $this->assertEquals($this->gene_cvt, $parent->type_id);
    $this->assertEquals($this->organism->organism_id, $parent->organism_id);
  }
  /**
   * Checks the featureloc that places the first fixture gene on scaffold1.
   *
   * @group gff
   */
  public function testGFFImporterAttributeTarget() {
    $this->initGFFImporterAttributes();

    $expected_fmin = 99;
    $expected_fmax = 200;

    $supercontig_id = chado_get_cvterm([
      'name' => 'supercontig',
      'cv_id' => ['name' => 'sequence'],
    ])->cvterm_id;

    // The landmark feature the gene is located on.
    $scaffold_select = db_select('chado.feature', 'f');
    $scaffold_select->fields('f');
    $scaffold_select->condition('uniquename', 'scaffold1');
    $scaffold_select->condition('type_id', $supercontig_id);
    $scaffold = $scaffold_select->execute()->fetchObject();

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    $location_select = db_select('chado.featureloc', 'fl');
    $location_select->fields('fl');
    $location_select->condition('fl.feature_id', $gene->feature_id);
    $location_select->condition('fl.srcfeature_id', $scaffold->feature_id);
    $location = $location_select->execute()->fetchObject();

    $this->assertEquals($expected_fmin, $location->fmin);
    $this->assertEquals($expected_fmax, $location->fmax);
  }
  /**
   * Checks the Gap and Note properties stored for the first fixture gene.
   *
   * @group gff
   */
  public function testGFFImporterAttributeProperty() {
    $this->initGFFImporterAttributes();
    $gap_1 = 'test_gap_1';
    $note_val = 'test_gene_001_note';
    // NOTE(review): the original also declared an expected second gap value,
    // 'test_gap_2', but never asserted on it. The fixture file is not visible
    // from here, so no assertion is added for it; confirm and extend.

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()->fetchObject();

    $gap_cvt = chado_get_cvterm([
      'name' => 'Gap',
      'cv_id' => ['name' => 'feature_property'],
    ])->cvterm_id;
    $note_cvt = chado_get_cvterm([
      'name' => 'Note',
      'cv_id' => ['name' => 'feature_property'],
    ])->cvterm_id;

    // Assert gaps loaded correctly.
    $gaps_query = db_select('chado.featureprop', 'fp')
      ->fields('fp')
      ->condition('feature_id', $gene_1->feature_id)
      ->condition('type_id', $gap_cvt)
      ->execute();

    // Index the gap properties by value. Start from an empty array so that an
    // empty result set leads to a failed assertion rather than an
    // undefined-variable error.
    $gaps = [];
    while (($gap = $gaps_query->fetchObject())) {
      $gaps[$gap->value] = $gap;
    }
    $this->assertArrayHasKey($gap_1, $gaps, 'Gap property "' . $gap_1 . '" was not loaded.');
    $this->assertEquals($gap_1, $gaps[$gap_1]->value);
    $this->assertEquals(0, $gaps[$gap_1]->rank);

    // Assert note loaded correctly.
    $note = db_select('chado.featureprop', 'fp')
      ->fields('fp')
      ->condition('feature_id', $gene_1->feature_id)
      ->condition('type_id', $note_cvt)
      ->execute()->fetchObject();
    // fetchObject() yields FALSE when no row matches; fail clearly in that case.
    $this->assertNotEmpty($note, 'Note property was not loaded.');
    $this->assertEquals($note_val, $note->value);
    $this->assertEquals(0, $note->rank);
  }
  /**
   * Checks the derives_from relationship from the second gene to the first.
   *
   * @group gff
   */
  public function testGFFImporterAttributeDerivesFrom() {
    $this->initGFFImporterAttributes();

    $derived_select = db_select('chado.feature', 'f');
    $derived_select->fields('f');
    $derived_select->condition('uniquename', $this->gene_2_uname);
    $derived_select->condition('type_id', $this->gene_cvt);
    $derived_gene = $derived_select->execute()->fetchObject();

    $derives_from_id = chado_get_cvterm([
      'name' => 'derives_from',
      'cv_id' => ['name' => 'sequence'],
    ])->cvterm_id;

    // Find the object feature of the relationship whose subject is gene 2.
    $origin_select = db_select('chado.feature', 'f');
    $origin_select->join('chado.feature_relationship', 'fr', 'f.feature_id = fr.object_id');
    $origin = $origin_select
      ->fields('f')
      ->condition('fr.subject_id', $derived_gene->feature_id)
      ->condition('fr.type_id', $derives_from_id)
      ->execute()
      ->fetchObject();

    $this->assertEquals($this->gene_1_uname, $origin->uniquename);
    $this->assertEquals($this->gene_1_uname, $origin->name);
    $this->assertEquals($this->gene_cvt, $origin->type_id);
  }
  /**
   * Checks the sequence data stored for the scaffold landmark.
   *
   * @group gff
   */
  public function testGFFImporterAttributeFastas() {
    $this->initGFFImporterAttributes();

    $select = db_select('chado.feature', 'f');
    $select->fields('f');
    $select->condition('uniquename', $this->scaffold_1_uname);
    $select->condition('type_id', $this->supercontig_cvt);
    $scaffold = $select->execute()->fetchObject();

    // The recorded length, the actual residue length and the checksum must all
    // match the expected values.
    $expected_length = 720;
    $this->assertEquals($expected_length, $scaffold->seqlen);
    $this->assertEquals($expected_length, strlen($scaffold->residues));
    $this->assertEquals('83578d8afdaec399c682aa6c0ddd29c9', $scaffold->md5checksum);
  }
  /**
   * Checks the skip_protein importer option.
   *
   * With skip_protein set to 1 the protein feature for the tested mRNA must
   * not exist after the import. After the import is re-run with skip_protein
   * set to 0, it must exist.
   *
   * @group gff
   * @ticket 77
   */
  public function testGFFNoProteinOption() {
    $gff_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff'];
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();
    $run_args = [
      // The option under test.
      'skip_protein' => 1,
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names (unused here).
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional importer arguments, all left unset.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];
    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($run_args, $gff_file);

    // Use chado_get_cvterm() like every other term lookup in this class.
    $protein_term = chado_get_cvterm([
      'cv_id' => ['name' => 'sequence'],
      'name' => 'polypeptide',
    ]);

    // NOTE(review): the fixture GFF presumably contains no explicit protein
    // records, so this feature should only exist when the importer creates it
    // implicitly. Confirm against the fixture.
    $name = 'FRAEX38873_v2_000000110.1-protein';

    // Returns the stored uniquename of the protein, or FALSE when it is absent.
    $find_protein = function () use ($name, $protein_term) {
      return db_select('chado.feature', 'f')
        ->fields('f', ['uniquename'])
        ->condition('f.uniquename', $name)
        ->condition('f.type_id', $protein_term->cvterm_id)
        ->execute()
        ->fetchField();
    };

    // Proteins skipped: the feature must be absent.
    $this->assertFalse($find_protein());

    // Proteins enabled: re-run the import and expect the feature. Comparing
    // the fetched field (instead of dereferencing a fetched object) makes a
    // missing row a plain assertion failure.
    $run_args['skip_protein'] = 0;
    $this->runGFFLoader($run_args, $gff_file);
    $this->assertEquals($name, $find_protein());
  }
  /**
   * Checks that explicitly defined proteins are loaded despite skip_protein.
   *
   * Imports the local simpleGFF.gff fixture with skip_protein set to 1 and
   * expects the named protein feature to be present afterwards.
   *
   * @group gff
   * @ticket 77
   */
  public function testGFFImporterLoadsExplicitProteins() {
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();
    $gff_file = ['file_local' => __DIR__ . '/../data/simpleGFF.gff'];

    $run_args = [
      // Skip implicit protein creation.
      'skip_protein' => 1,
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names (unused here).
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional importer arguments, all left unset.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($run_args, $gff_file);

    $expected_uniquename = 'FRAEX38873_v2_000000010.1.3_test_protein';
    $select = db_select('chado.feature', 'f');
    $select->fields('f', ['uniquename']);
    $select->condition('f.uniquename', $expected_uniquename);
    $stored_uniquename = $select->execute()->fetchField();

    $this->assertEquals($expected_uniquename, $stored_uniquename);
  }
  /**
   * Runs the GFF3 importer with the given arguments on the given file.
   *
   * @param array $run_args
   *   Importer arguments passed to GFF3Importer::create().
   * @param array $file
   *   File descriptor passed to GFF3Importer::create(); callers in this class
   *   use either a 'file_local' or a 'file_remote' key.
   */
  private function runGFFLoader($run_args, $file) {
    // The importer class lives in an include file that must be loaded first.
    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/GFF3Importer');

    $gff_importer = new \GFF3Importer();
    $gff_importer->create($run_args, $file);
    $gff_importer->prepareFiles();
    $gff_importer->run();
  }
  /**
   * Loads landmark sequences with the FASTA importer.
   *
   * @param object $analysis
   *   Analysis record; its analysis_id is passed to the importer.
   * @param object $organism
   *   Organism record; its organism_id is passed to the importer.
   * @param array $landmark_file
   *   Optional file descriptor for the FASTA file. When empty, a remote
   *   landmark file is used instead.
   */
  private function loadLandmarks($analysis, $organism, $landmark_file = []) {
    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');

    // Fall back to the remote landmark file when the caller supplied none.
    if (!$landmark_file) {
      $landmark_file = [
        'file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/empty_landmarks.fasta',
      ];
    }

    $run_args = [
      'organism_id' => $organism->organism_id,
      'analysis_id' => $analysis->analysis_id,
      'seqtype' => 'supercontig',
      // Default method: insert and update.
      'method' => 2,
      // Default match type: unique name.
      'match_type' => 1,
      // Optional importer arguments, all left unset.
      're_name' => NULL,
      're_uname' => NULL,
      're_accession' => NULL,
      'db_id' => NULL,
      'rel_type' => NULL,
      're_subject' => NULL,
      'parent_type' => NULL,
    ];

    $fasta_importer = new \FASTAImporter();
    $fasta_importer->create($run_args, $landmark_file);
    $fasta_importer->prepareFiles();
    $fasta_importer->run();
  }
  493. }