GFF3ImporterTest.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. <?php
  2. namespace Tests;
  3. use StatonLab\TripalTestSuite\DBTransaction;
  4. use StatonLab\TripalTestSuite\TripalTestCase;
  5. class GFF3ImporterTest extends TripalTestCase {
  6. use DBTransaction;
  /**
   * Confirm basic GFF importer functionality.
   *
   * Creates a fresh analysis and organism, loads the default landmark
   * sequences, imports the remote filtered.gff file and then verifies that a
   * known exon uniquename is present in chado.feature.
   *
   * @group gff
   */
  public function testGFFImporter() {
    $remote_gff = [
      'file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff',
    ];
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $importer_args = [
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($importer_args, $remote_gff);

    // Look the expected exon up by its uniquename.
    $expected_uniquename = 'FRAEX38873_v2_000000110.2.exon4';
    $select = db_select('chado.feature', 'f');
    $select->fields('f', ['uniquename']);
    $select->condition('f.uniquename', $expected_uniquename);
    $found_uniquename = $select->execute()->fetchField();

    $this->assertEquals($expected_uniquename, $found_uniquename);
  }
  /**
   * Run the GFF loader on small_gene.gff for testing.
   *
   * This gff has many attributes that we would like to test in the
   * testGFFImporterAttribute*() methods. Besides running the FASTA and GFF
   * importers, this stores on the test instance the organism, the analysis,
   * the cvterm ids for gene, mRNA and supercontig (sequence cv), and the
   * uniquenames the attribute tests query for.
   */
  private function initGFFImporterAttributes() {
    $data_dir = __DIR__ . '/../data';
    $gff_source = ['file_local' => $data_dir . '/small_gene.gff'];
    $fasta_source = ['file_local' => $data_dir . '/short_scaffold.fasta'];

    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $importer_args = [
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => $organism->organism_id,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    // Landmarks are loaded before the GFF is imported.
    $this->loadLandmarks($analysis, $organism, $fasta_source);
    $this->runGFFLoader($importer_args, $gff_source);

    $this->organism = $organism;
    $this->analysis = $analysis;

    // Resolves a term name in the 'sequence' cv to its cvterm_id.
    $sequence_term_id = function ($term_name) {
      $term = chado_get_cvterm([
        'name' => $term_name,
        'cv_id' => [
          'name' => 'sequence',
        ],
      ]);
      return $term->cvterm_id;
    };

    $this->gene_cvt = $sequence_term_id('gene');
    $this->mrna_cvt = $sequence_term_id('mRNA');
    $this->supercontig_cvt = $sequence_term_id('supercontig');

    $this->gene_1_uname = 'test_gene_001';
    $this->gene_2_uname = 'test_gene_002';
    $this->scaffold_1_uname = 'scaffold1';
  }
  /**
   * Ensures that the feature record is loaded correctly into chado.
   *
   * Fetches the first test gene by uniquename and gene type, then checks its
   * uniquename, name, organism and type.
   *
   * @group gff
   */
  public function testGFFImporterAttributeFeature() {
    $this->initGFFImporterAttributes();

    $select = db_select('chado.feature', 'f');
    $select->fields('f');
    $select->condition('uniquename', $this->gene_1_uname);
    $select->condition('type_id', $this->gene_cvt);
    $gene = $select->execute()->fetchObject();

    $this->assertEquals('test_gene_001', $gene->uniquename);
    $this->assertEquals('test_gene_001', $gene->name);
    $this->assertEquals($this->organism->organism_id, $gene->organism_id);
    $this->assertEquals($this->gene_cvt, $gene->type_id);
  }
  /**
   * Ensures the feature alias is loaded correctly into chado.
   *
   * Reads the synonym linked to the first test gene through
   * chado.feature_synonym and compares its name with the expected alias.
   *
   * @group gff
   */
  public function testGFFImporterAttributeAlias() {
    $this->initGFFImporterAttributes();
    $expected_alias = 'first_test_gene';

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    // join() returns the table alias, so it cannot be chained.
    $synonym_select = db_select('chado.feature_synonym', 'fs');
    $synonym_select->join('chado.synonym', 's', 's.synonym_id = fs.synonym_id');
    $synonym_select
      ->fields('s')
      ->condition('fs.feature_id', $gene->feature_id);
    $synonym = $synonym_select->execute()->fetchObject();

    $this->assertEquals($expected_alias, $synonym->name);
  }
  /**
   * Ensures that the dbxref records are loaded correctly into chado.
   *
   * The first test gene is expected to have one dbxref in TEST_DB with the
   * accession from the GFF file, and one in GFF_source whose accession is the
   * gene's uniquename.
   *
   * @group gff
   */
  public function testGFFImporterAttributeDbxref() {
    $this->initGFFImporterAttributes();

    $expected_accession = 'test_gene_dbx_001';
    $test_db = chado_get_db(['name' => 'TEST_DB']);
    $gff_db = chado_get_db(['name' => 'GFF_source']);

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    // Builds and executes the dbxref lookup for the gene, limited to one db.
    $run_dbxref_query = function ($db_id) use ($gene) {
      $select = db_select('chado.feature_dbxref', 'fdbx');
      $select->join('chado.dbxref', 'dbx', 'dbx.dbxref_id = fdbx.dbxref_id');
      $select->fields('dbx');
      $select->condition('fdbx.feature_id', $gene->feature_id);
      $select->condition('dbx.db_id', $db_id);
      return $select->execute();
    };

    $test_db_result = $run_dbxref_query($test_db->db_id);
    $gff_db_result = $run_dbxref_query($gff_db->db_id);

    $test_dbxref = $test_db_result->fetchObject();
    $gff_dbxref = $gff_db_result->fetchObject();

    $this->assertEquals($expected_accession, $test_dbxref->accession);
    $this->assertEquals($this->gene_1_uname, $gff_dbxref->accession);
  }
  /**
   * Ensures ontology term records loaded correctly into chado.
   *
   * The first test gene must be linked to the term SO:0000704 through exactly
   * one chado.feature_cvterm record.
   *
   * @group gff
   */
  public function testGFFImporterAttributeOntology() {
    $this->initGFFImporterAttributes();
    $ontology_db = 'SO';
    $ontology_accession = '0000704';

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // Fail with a readable message instead of dereferencing FALSE below.
    $this->assertNotEmpty($gene_1, 'The test gene was not loaded into chado.feature.');

    $term = chado_get_cvterm([
      'dbxref_id' => [
        'accession' => $ontology_accession,
        'db_id' => [
          'name' => $ontology_db,
        ],
      ],
    ]);
    $this->assertNotEmpty($term, 'The ontology term SO:0000704 could not be found.');

    // Count with a COUNT query: the row count reported by a statement for a
    // SELECT is not guaranteed to be the number of matching rows on every
    // database driver.
    $link_count = db_select('chado.feature_cvterm', 'fcvt')
      ->fields('fcvt')
      ->condition('cvterm_id', $term->cvterm_id)
      ->condition('feature_id', $gene_1->feature_id)
      ->countQuery()
      ->execute()
      ->fetchField();

    $this->assertEquals(1, $link_count);
  }
  /**
   * Ensures feature parent record loaded correctly into chado.
   *
   * Follows the part_of relationship from the test mRNA to its object
   * feature and checks that this parent is the first test gene.
   *
   * @group gff
   */
  public function testGFFImporterAttributeParent() {
    $this->initGFFImporterAttributes();

    $part_of_term = chado_get_cvterm([
      'name' => 'part_of',
      'cv_id' => [
        'name' => 'sequence',
      ],
    ]);
    $part_of_id = $part_of_term->cvterm_id;

    $mrna_select = db_select('chado.feature', 'f');
    $mrna_select->fields('f');
    $mrna_select->condition('uniquename', 'test_mrna_001.1');
    $mrna_select->condition('type_id', $this->mrna_cvt);
    $mrna = $mrna_select->execute()->fetchObject();

    // The mRNA is the subject of the relationship; the parent is the object.
    $parent_select = db_select('chado.feature_relationship', 'fr');
    $parent_select->join('chado.feature', 'f', 'f.feature_id = fr.object_id');
    $parent_select
      ->fields('f')
      ->condition('fr.subject_id', $mrna->feature_id)
      ->condition('fr.type_id', $part_of_id);
    $parent = $parent_select->execute()->fetchObject();

    $this->assertEquals('test_gene_001', $parent->uniquename);
    $this->assertEquals('test_gene_001', $parent->name);
    $this->assertEquals($this->gene_cvt, $parent->type_id);
    $this->assertEquals($this->organism->organism_id, $parent->organism_id);
  }
  /**
   * Ensure target record loaded correctly into chado.
   *
   * Looks up the featureloc that places the first test gene on the scaffold1
   * supercontig and checks its fmin and fmax values.
   *
   * @group gff
   */
  public function testGFFImporterAttributeTarget() {
    $this->initGFFImporterAttributes();

    $expected_fmin = 99;
    $expected_fmax = 200;

    $landmark_term = chado_get_cvterm([
      'name' => 'supercontig',
      'cv_id' => [
        'name' => 'sequence',
      ],
    ]);
    $landmark_type_id = $landmark_term->cvterm_id;

    $landmark_select = db_select('chado.feature', 'f');
    $landmark_select->fields('f');
    $landmark_select->condition('uniquename', 'scaffold1');
    $landmark_select->condition('type_id', $landmark_type_id);
    $landmark = $landmark_select->execute()->fetchObject();

    $gene_select = db_select('chado.feature', 'f');
    $gene_select->fields('f');
    $gene_select->condition('uniquename', $this->gene_1_uname);
    $gene_select->condition('type_id', $this->gene_cvt);
    $gene = $gene_select->execute()->fetchObject();

    $location_select = db_select('chado.featureloc', 'fl');
    $location_select->fields('fl');
    $location_select->condition('fl.feature_id', $gene->feature_id);
    $location_select->condition('fl.srcfeature_id', $landmark->feature_id);
    $location = $location_select->execute()->fetchObject();

    $this->assertEquals($expected_fmin, $location->fmin);
    $this->assertEquals($expected_fmax, $location->fmax);
  }
  /**
   * Ensure properties loaded correctly into chado.
   *
   * The first test gene must carry two Gap properties (ranks 0 and 1) and a
   * single Note property (rank 0) in chado.featureprop.
   *
   * @group gff
   */
  public function testGFFImporterAttributeProperty() {
    $this->initGFFImporterAttributes();
    $gap_1 = 'test_gap_1';
    $gap_2 = 'test_gap_2';
    $note_val = 'test_gene_001_note';

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // Fail with a readable message instead of dereferencing FALSE below.
    $this->assertNotEmpty($gene_1, 'The test gene was not loaded into chado.feature.');

    $gap_cvt = chado_get_cvterm([
      'name' => 'Gap',
      'cv_id' => [
        'name' => 'feature_property',
      ],
    ])->cvterm_id;
    $note_cvt = chado_get_cvterm([
      'name' => 'Note',
      'cv_id' => [
        'name' => 'feature_property',
      ],
    ])->cvterm_id;

    // Assert gaps loaded correctly.
    $gap_rows = db_select('chado.featureprop', 'fp')
      ->fields('fp')
      ->condition('feature_id', $gene_1->feature_id)
      ->condition('type_id', $gap_cvt)
      ->execute();

    // Start from an empty array so that a result set without rows leads to a
    // failed assertion rather than an undefined variable.
    $gaps = [];
    foreach ($gap_rows as $gap) {
      $gaps[$gap->value] = $gap;
    }

    $this->assertArrayHasKey($gap_1, $gaps, 'The first Gap property was not loaded.');
    $this->assertArrayHasKey($gap_2, $gaps, 'The second Gap property was not loaded.');
    $this->assertEquals($gap_1, $gaps[$gap_1]->value);
    $this->assertEquals(0, $gaps[$gap_1]->rank);
    $this->assertEquals($gap_2, $gaps[$gap_2]->value);
    $this->assertEquals(1, $gaps[$gap_2]->rank);

    // Assert note loaded correctly.
    $note = db_select('chado.featureprop', 'fp')
      ->fields('fp')
      ->condition('feature_id', $gene_1->feature_id)
      ->condition('type_id', $note_cvt)
      ->execute()
      ->fetchObject();

    $this->assertNotEmpty($note, 'The Note property was not loaded.');
    $this->assertEquals($note_val, $note->value);
    $this->assertEquals(0, $note->rank);
  }
  /**
   * Ensure derives from information loaded correctly into chado.
   *
   * Follows the derives_from relationship whose subject is the second test
   * gene and checks that the object feature is the first test gene.
   *
   * @group gff
   */
  public function testGFFImporterAttributeDerivesFrom() {
    $this->initGFFImporterAttributes();

    $derived_select = db_select('chado.feature', 'f');
    $derived_select->fields('f');
    $derived_select->condition('uniquename', $this->gene_2_uname);
    $derived_select->condition('type_id', $this->gene_cvt);
    $derived_gene = $derived_select->execute()->fetchObject();

    $derives_from_term = chado_get_cvterm([
      'name' => 'derives_from',
      'cv_id' => [
        'name' => 'sequence',
      ],
    ]);
    $derives_from_id = $derives_from_term->cvterm_id;

    // The derived gene is the subject; the feature it derives from is the
    // object of the relationship.
    $origin_select = db_select('chado.feature', 'f');
    $origin_select->join('chado.feature_relationship', 'fr', 'f.feature_id = fr.object_id');
    $origin_select
      ->fields('f')
      ->condition('fr.subject_id', $derived_gene->feature_id)
      ->condition('fr.type_id', $derives_from_id);
    $origin = $origin_select->execute()->fetchObject();

    $this->assertEquals($this->gene_1_uname, $origin->uniquename);
    $this->assertEquals($this->gene_1_uname, $origin->name);
    $this->assertEquals($this->gene_cvt, $origin->type_id);
  }
  /**
   * Ensure FASTA information loaded correctly into chado.
   *
   * Checks the sequence length, the length of the stored residues and the
   * md5 checksum of the scaffold1 supercontig feature.
   *
   * @group gff
   */
  public function testGFFImporterAttributeFastas() {
    $this->initGFFImporterAttributes();

    $select = db_select('chado.feature', 'f');
    $select->fields('f');
    $select->condition('uniquename', $this->scaffold_1_uname);
    $select->condition('type_id', $this->supercontig_cvt);
    $scaffold = $select->execute()->fetchObject();

    $residue_count = strlen($scaffold->residues);

    $this->assertEquals(1000, $scaffold->seqlen);
    $this->assertEquals(1000, $residue_count);
    $this->assertEquals('0154424abe69dd64cd428c330d480ba0', $scaffold->md5checksum);
  }
  /**
   * Tests the skip_protein option of the GFF importer.
   *
   * When skip_protein is set, implicit proteins must not be created; when it
   * is unset and the same file is imported again, they must be created.
   *
   * @group gff
   * @ticket 77
   */
  public function testGFFNoProteinOption() {
    $gff_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff'];
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();
    $run_args = [
      // The option under test.
      'skip_protein' => 1,
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($run_args, $gff_file);

    // Use chado_get_cvterm() like every other lookup in this class.
    $protein_type = chado_get_cvterm([
      'cv_id' => ['name' => 'sequence'],
      'name' => 'polypeptide',
    ]);
    $this->assertNotEmpty($protein_type, 'The polypeptide term could not be found in the sequence cv.');

    $name = 'FRAEX38873_v2_000000110.1-protein';

    // Returns the uniquename of the implicit protein, or FALSE when no such
    // polypeptide feature exists.
    $find_protein = function () use ($name, $protein_type) {
      return db_select('chado.feature', 'f')
        ->fields('f', ['uniquename'])
        ->condition('f.uniquename', $name)
        ->condition('f.type_id', $protein_type->cvterm_id)
        ->execute()
        ->fetchField();
    };

    // With skip_protein enabled the implicit protein must be absent.
    $this->assertFalse($find_protein(), 'An implicit protein was created although skip_protein was set.');

    // Importing again with skip_protein disabled must create it.
    $run_args['skip_protein'] = 0;
    $this->runGFFLoader($run_args, $gff_file);

    $this->assertEquals($name, $find_protein(), 'The implicit protein was not created although skip_protein was unset.');
  }
  /**
   * The GFF importer should still create explicitly defined proteins if
   * skip_protein is true.
   *
   * Imports the local simpleGFF.gff file with skip_protein set and checks
   * that the protein feature named in the test is present in chado.feature.
   *
   * @group gff
   * @ticket 77
   */
  public function testGFFImporterLoadsExplicitProteins() {
    $local_gff = ['file_local' => __DIR__ . '/../data/simpleGFF.gff'];
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $importer_args = [
      // Implicit proteins are skipped; explicit ones must still load.
      'skip_protein' => 1,
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($importer_args, $local_gff);

    $expected_uniquename = 'FRAEX38873_v2_000000010.1.3_test_protein';
    $select = db_select('chado.feature', 'f');
    $select->fields('f', ['uniquename']);
    $select->condition('f.uniquename', $expected_uniquename);
    $found_uniquename = $select->execute()->fetchField();

    $this->assertEquals($expected_uniquename, $found_uniquename);
  }
  /**
   * Runs the GFF3 importer with the given arguments and file.
   *
   * Loads the GFF3Importer include of the tripal_chado module, then calls
   * create(), prepareFiles() and run() on a new importer instance.
   *
   * @param array $run_args
   *   The importer arguments handed to GFF3Importer::create().
   * @param array $file
   *   The file description handed to create(); the tests in this class use a
   *   'file_local' or 'file_remote' key.
   */
  private function runGFFLoader($run_args, $file) {
    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/GFF3Importer');

    $gff_importer = new \GFF3Importer();
    $gff_importer->create($run_args, $file);
    $gff_importer->prepareFiles();
    $gff_importer->run();
  }
  /**
   * Loads landmark sequences with the FASTA importer.
   *
   * Sequences are imported with the 'supercontig' sequence type. When no
   * landmark file is given, the remote empty_landmarks.fasta file is used.
   *
   * @param object $analysis
   *   The analysis record; its analysis_id is passed to the importer.
   * @param object $organism
   *   The organism record; its organism_id is passed to the importer.
   * @param array $landmark_file
   *   Optional file description handed to FASTAImporter::create().
   */
  private function loadLandmarks($analysis, $organism, $landmark_file = array()) {
    if (empty($landmark_file)) {
      $landmark_file = [
        'file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/empty_landmarks.fasta',
      ];
    }

    $fasta_args = [
      'organism_id' => $organism->organism_id,
      'analysis_id' => $analysis->analysis_id,
      'seqtype' => 'supercontig',
      // Default: insert and update.
      'method' => 2,
      // Default: match on unique name.
      'match_type' => 1,
      // Optional arguments.
      're_name' => NULL,
      're_uname' => NULL,
      're_accession' => NULL,
      'db_id' => NULL,
      'rel_type' => NULL,
      're_subject' => NULL,
      'parent_type' => NULL,
    ];

    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');

    $fasta_importer = new \FASTAImporter();
    $fasta_importer->create($fasta_args, $landmark_file);
    $fasta_importer->prepareFiles();
    $fasta_importer->run();
  }
  495. }