GFF3ImporterTest.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. <?php
  2. namespace Tests;
  3. use StatonLab\TripalTestSuite\DBTransaction;
  4. use StatonLab\TripalTestSuite\TripalTestCase;
  5. class GFF3ImporterTest extends TripalTestCase {
  6. use DBTransaction;
  /**
   * Confirm basic GFF importer functionality.
   *
   * Landmark sequences are loaded first, then the remote filtered.gff file is
   * imported for a factory-created analysis and organism. The test passes
   * when a feature with the expected exon uniquename can be read back from
   * chado.feature.
   *
   * @group gff
   */
  public function testGFFImporter() {
    $remote_gff = [
      'file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff',
    ];

    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $importer_args = [
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($importer_args, $remote_gff);

    $expected_uniquename = 'FRAEX38873_v2_000000110.2.exon4';

    $select = db_select('chado.feature', 'f');
    $select->fields('f', ['uniquename']);
    $select->condition('f.uniquename', $expected_uniquename);
    $found_uniquename = $select->execute()->fetchField();

    $this->assertEquals($expected_uniquename, $found_uniquename);
  }
  /**
   * Imports the small_gene.gff fixture shared by the attribute tests.
   *
   * The short_scaffold.fasta landmarks are loaded before the GFF itself.
   * Afterwards the organism, the analysis, the cvterm IDs for 'gene' and
   * 'mRNA' (sequence vocabulary) and the two gene uniquenames are stored on
   * the test instance for the testGFFImporterAttribute*() methods to read.
   */
  private function initGFFImporterAttributes() {
    $gff_fixture = ['file_local' => __DIR__ . '/../data/small_gene.gff'];
    $fasta_fixture = ['file_local' => __DIR__ . '/../data/short_scaffold.fasta'];

    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $importer_args = [
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => $organism->organism_id,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism, $fasta_fixture);
    $this->runGFFLoader($importer_args, $gff_fixture);

    // Keep the records the individual tests compare against.
    $this->organism = $organism;
    $this->analysis = $analysis;

    $gene_term = chado_get_cvterm([
      'name' => 'gene',
      'cv_id' => ['name' => 'sequence'],
    ]);
    $this->gene_cvt = $gene_term->cvterm_id;

    $mrna_term = chado_get_cvterm([
      'name' => 'mRNA',
      'cv_id' => ['name' => 'sequence'],
    ]);
    $this->mrna_cvt = $mrna_term->cvterm_id;

    $this->gene_1_uname = 'test_gene_001';
    $this->gene_2_uname = 'test_gene_002';
  }
  /**
   * Ensures that the feature record is loaded correctly into chado.
   *
   * Reads the first gene back by uniquename and type, then checks its
   * uniquename, name, organism and type columns.
   *
   * @group gff
   */
  public function testGFFImporterAttributeFeature() {
    $this->initGFFImporterAttributes();

    $select = db_select('chado.feature', 'f');
    $select->fields('f');
    $select->condition('uniquename', $this->gene_1_uname);
    $select->condition('type_id', $this->gene_cvt);
    $feature = $select->execute()->fetchObject();

    $this->assertEquals('test_gene_001', $feature->uniquename);
    $this->assertEquals('test_gene_001', $feature->name);
    $this->assertEquals($this->organism->organism_id, $feature->organism_id);
    $this->assertEquals($this->gene_cvt, $feature->type_id);
  }
  /**
   * Ensures the feature alias is loaded correctly into chado.
   *
   * Looks up the first gene, then follows chado.feature_synonym to
   * chado.synonym and checks the synonym name. Each lookup is asserted to
   * have returned a row before it is dereferenced, so a failed import is
   * reported as an assertion failure instead of a PHP notice.
   *
   * @group gff
   */
  public function testGFFImporterAttributeAlias() {
    $this->initGFFImporterAttributes();
    $alias = 'first_test_gene';

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($gene_1, 'The gene ' . $this->gene_1_uname . ' was not found in chado.feature.');

    $query = db_select('chado.feature_synonym', 'fs');
    $query->join('chado.synonym', 's', 's.synonym_id = fs.synonym_id');
    $query->fields('s');
    $query->condition('fs.feature_id', $gene_1->feature_id);
    $synonym = $query->execute()->fetchObject();

    $this->assertNotEmpty($synonym, 'No synonym is linked to the gene ' . $this->gene_1_uname . '.');
    $this->assertEquals($alias, $synonym->name);
  }
  /**
   * Ensures that the dbxref records are loaded correctly into chado.
   *
   * Two cross references are expected on the first gene: accession
   * 'test_gene_dbx_001' in the TEST_DB database, and the gene's own
   * uniquename in the GFF_source database. Every lookup is asserted to have
   * returned a record before it is dereferenced, so a missing record fails
   * with a descriptive message instead of a PHP notice.
   *
   * @group gff
   */
  public function testGFFImporterAttributeDbxref() {
    $this->initGFFImporterAttributes();
    $test_db_name = 'TEST_DB';
    $dbx_accession = 'test_gene_dbx_001';

    $test_db = chado_get_db(['name' => $test_db_name]);
    $this->assertNotEmpty($test_db, 'The database ' . $test_db_name . ' was not found in chado.db.');
    $gff_db = chado_get_db(['name' => 'GFF_source']);
    $this->assertNotEmpty($gff_db, 'The database GFF_source was not found in chado.db.');

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($gene_1, 'The gene ' . $this->gene_1_uname . ' was not found in chado.feature.');

    // Base query: every dbxref attached to the gene.
    $dbx_query = db_select('chado.feature_dbxref', 'fdbx');
    $dbx_query->join('chado.dbxref', 'dbx', 'dbx.dbxref_id = fdbx.dbxref_id');
    $dbx_query->fields('dbx');
    $dbx_query->condition('fdbx.feature_id', $gene_1->feature_id);

    // Clone before adding the per-database filter so each query gets its own.
    $gff_query = clone $dbx_query;

    $dbx_query->condition('dbx.db_id', $test_db->db_id);
    $dbxref = $dbx_query->execute()->fetchObject();

    $gff_query->condition('dbx.db_id', $gff_db->db_id);
    $gff_dbxref = $gff_query->execute()->fetchObject();

    $this->assertNotEmpty($dbxref, 'The gene has no dbxref in the ' . $test_db_name . ' database.');
    $this->assertEquals($dbx_accession, $dbxref->accession);

    $this->assertNotEmpty($gff_dbxref, 'The gene has no dbxref in the GFF_source database.');
    $this->assertEquals($this->gene_1_uname, $gff_dbxref->accession);
  }
  /**
   * Ensures ontology term records loaded correctly into chado.
   *
   * Expects exactly one chado.feature_cvterm row linking the first gene to
   * the term with accession SO:0000704.
   *
   * @group gff
   */
  public function testGFFImporterAttributeOntology() {
    $this->initGFFImporterAttributes();
    $ontology_db = 'SO';
    $ontology_accession = '0000704';

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($gene_1, 'The gene ' . $this->gene_1_uname . ' was not found in chado.feature.');

    $term = chado_get_cvterm([
      'dbxref_id' => [
        'accession' => $ontology_accession,
        'db_id' => [
          'name' => $ontology_db,
        ],
      ],
    ]);
    $this->assertNotEmpty($term, 'The term ' . $ontology_db . ':' . $ontology_accession . ' was not found.');

    // Count with a COUNT(*) query rather than rowCount(): the PDO
    // documentation leaves rowCount() on SELECT statements driver-dependent.
    $link_count = db_select('chado.feature_cvterm', 'fcvt')
      ->fields('fcvt')
      ->condition('cvterm_id', $term->cvterm_id)
      ->condition('feature_id', $gene_1->feature_id)
      ->countQuery()
      ->execute()
      ->fetchField();

    $this->assertEquals(1, $link_count);
  }
  /**
   * Ensures feature parent record loaded correctly into chado.
   *
   * Looks up the mRNA 'test_mrna_001.1', follows its 'part_of' relationship
   * (as subject) to the object feature, and checks that the object is the
   * first gene. Both lookups are asserted to have returned a row before
   * they are dereferenced.
   *
   * @group gff
   */
  public function testGFFImporterAttributeParent() {
    $this->initGFFImporterAttributes();
    $mrna_uname = 'test_mrna_001.1';

    $rel_cvt = chado_get_cvterm([
      'name' => 'part_of',
      'cv_id' => [
        'name' => 'sequence',
      ],
    ])->cvterm_id;

    $mrna = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $mrna_uname)
      ->condition('type_id', $this->mrna_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($mrna, 'The mRNA ' . $mrna_uname . ' was not found in chado.feature.');

    $query = db_select('chado.feature_relationship', 'fr');
    $query->join('chado.feature', 'f', 'f.feature_id = fr.object_id');
    $query->fields('f');
    $query->condition('fr.subject_id', $mrna->feature_id);
    $query->condition('fr.type_id', $rel_cvt);
    $parent = $query->execute()->fetchObject();

    $this->assertNotEmpty($parent, 'The mRNA ' . $mrna_uname . ' has no part_of relationship.');
    $this->assertEquals('test_gene_001', $parent->uniquename);
    $this->assertEquals('test_gene_001', $parent->name);
    $this->assertEquals($this->gene_cvt, $parent->type_id);
    $this->assertEquals($this->organism->organism_id, $parent->organism_id);
  }
  /**
   * Ensure target record loaded correctly into chado.
   *
   * Expects a chado.featureloc row placing the first gene on the
   * supercontig 'scaffold1' with fmin 99 and fmax 200. The landmark, gene
   * and featureloc lookups are each asserted to have returned a row before
   * they are dereferenced.
   *
   * @group gff
   */
  public function testGFFImporterAttributeTarget() {
    $this->initGFFImporterAttributes();
    $target_feature = 'scaffold1';
    $start = 99;
    $end = 200;
    $target_type = 'supercontig';

    $target_cvt = chado_get_cvterm([
      'name' => $target_type,
      'cv_id' => [
        'name' => 'sequence',
      ],
    ])->cvterm_id;

    $source_feature = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $target_feature)
      ->condition('type_id', $target_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($source_feature, 'The landmark ' . $target_feature . ' was not found in chado.feature.');

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    $this->assertNotEmpty($gene_1, 'The gene ' . $this->gene_1_uname . ' was not found in chado.feature.');

    $featureloc = db_select('chado.featureloc', 'fl')
      ->fields('fl')
      ->condition('fl.feature_id', $gene_1->feature_id)
      ->condition('fl.srcfeature_id', $source_feature->feature_id)
      ->execute()
      ->fetchObject();
    $this->assertNotEmpty($featureloc, 'The gene has no featureloc on ' . $target_feature . '.');

    $this->assertEquals($start, $featureloc->fmin);
    $this->assertEquals($end, $featureloc->fmax);
  }
  /**
   * Ensure properties loaded correctly into chado.
   *
   * Expects two 'Gap' properties on the first gene ('test_gap_1' at rank 0
   * and 'test_gap_2' at rank 1) and one 'Note' property
   * ('test_gene_001_note' at rank 0).
   *
   * @group gff
   */
  public function testGFFImporterAttributeProperty() {
    $this->initGFFImporterAttributes();
    $gap_1 = 'test_gap_1';
    $gap_2 = 'test_gap_2';
    $note_val = 'test_gene_001_note';

    $gene_1 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_1_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($gene_1, 'The gene ' . $this->gene_1_uname . ' was not found in chado.feature.');

    $gap_cvt = chado_get_cvterm([
      'name' => 'Gap',
      'cv_id' => [
        'name' => 'feature_property',
      ],
    ])->cvterm_id;
    $note_cvt = chado_get_cvterm([
      'name' => 'Note',
      'cv_id' => [
        'name' => 'feature_property',
      ],
    ])->cvterm_id;

    // Assert gaps loaded correctly. Rows are keyed by their value so each
    // expected gap can be looked up directly; the result is an empty array
    // (not an undefined variable) when no gap property was loaded.
    $gaps = db_select('chado.featureprop', 'fp')
      ->fields('fp')
      ->condition('feature_id', $gene_1->feature_id)
      ->condition('type_id', $gap_cvt)
      ->execute()
      ->fetchAllAssoc('value');

    $this->assertArrayHasKey($gap_1, $gaps, 'The Gap property ' . $gap_1 . ' was not loaded.');
    $this->assertEquals($gap_1, $gaps[$gap_1]->value);
    $this->assertEquals(0, $gaps[$gap_1]->rank);

    $this->assertArrayHasKey($gap_2, $gaps, 'The Gap property ' . $gap_2 . ' was not loaded.');
    $this->assertEquals($gap_2, $gaps[$gap_2]->value);
    $this->assertEquals(1, $gaps[$gap_2]->rank);

    // Assert note loaded correctly.
    $note = db_select('chado.featureprop', 'fp')
      ->fields('fp')
      ->condition('feature_id', $gene_1->feature_id)
      ->condition('type_id', $note_cvt)
      ->execute()
      ->fetchObject();

    $this->assertNotEmpty($note, 'The Note property was not loaded.');
    $this->assertEquals($note_val, $note->value);
    $this->assertEquals(0, $note->rank);
  }
  /**
   * Ensure derives from information loaded correctly into chado.
   *
   * Looks up the second gene, follows its 'derives_from' relationship (as
   * subject) to the object feature, and checks that the object is the first
   * gene. Both lookups are asserted to have returned a row before they are
   * dereferenced.
   *
   * @group gff
   */
  public function testGFFImporterAttributeDerivesFrom() {
    $this->initGFFImporterAttributes();

    $gene_2 = db_select('chado.feature', 'f')
      ->fields('f')
      ->condition('uniquename', $this->gene_2_uname)
      ->condition('type_id', $this->gene_cvt)
      ->execute()
      ->fetchObject();
    // fetchObject() yields FALSE when no row matched.
    $this->assertNotEmpty($gene_2, 'The gene ' . $this->gene_2_uname . ' was not found in chado.feature.');

    $derivesfrom_cvt = chado_get_cvterm([
      'name' => 'derives_from',
      'cv_id' => [
        'name' => 'sequence',
      ],
    ])->cvterm_id;

    $query = db_select('chado.feature', 'f');
    $query->join('chado.feature_relationship', 'fr', 'f.feature_id = fr.object_id');
    $query->fields('f');
    $query->condition('fr.subject_id', $gene_2->feature_id);
    $query->condition('fr.type_id', $derivesfrom_cvt);
    $derivesfrom_feature = $query->execute()->fetchObject();

    $this->assertNotEmpty($derivesfrom_feature, 'The gene ' . $this->gene_2_uname . ' has no derives_from relationship.');
    $this->assertEquals($this->gene_1_uname, $derivesfrom_feature->uniquename);
    $this->assertEquals($this->gene_1_uname, $derivesfrom_feature->name);
    $this->assertEquals($this->gene_cvt, $derivesfrom_feature->type_id);
  }
  /**
   * Tests the skip_protein importer option.
   *
   * The same GFF is imported twice. With skip_protein set to 1 the
   * polypeptide feature 'FRAEX38873_v2_000000110.1-protein' must be absent;
   * after a second import with skip_protein set to 0 it must be present.
   *
   * @group gff
   * @ticket 77
   */
  public function testGFFNoProteinOption() {
    $gff_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff'];
    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $run_args = [
      // The option under test.
      'skip_protein' => 1,
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($run_args, $gff_file);

    // Use chado_get_cvterm() like every other lookup in this class.
    $protein_term = chado_get_cvterm([
      'cv_id' => ['name' => 'sequence'],
      'name' => 'polypeptide',
    ]);
    $this->assertNotEmpty($protein_term, 'The polypeptide term was not found in the sequence vocabulary.');

    $name = 'FRAEX38873_v2_000000110.1-protein';

    // First pass: proteins are skipped, so no row may match. fetchField()
    // yields FALSE in that case.
    $found = db_select('chado.feature', 'f')
      ->fields('f', ['uniquename'])
      ->condition('f.uniquename', $name)
      ->condition('f.type_id', $protein_term->cvterm_id)
      ->execute()
      ->fetchField();
    $this->assertFalse($found, 'A protein feature was created although skip_protein was set.');

    // Second pass: same file with the option turned off.
    $run_args['skip_protein'] = 0;
    $this->runGFFLoader($run_args, $gff_file);

    // Reading the single column with fetchField() gives a plain assertion
    // failure, not a property access on FALSE, when the protein is missing.
    $found = db_select('chado.feature', 'f')
      ->fields('f', ['uniquename'])
      ->condition('f.uniquename', $name)
      ->condition('f.type_id', $protein_term->cvterm_id)
      ->execute()
      ->fetchField();
    $this->assertEquals($name, $found, 'The protein feature was not created with skip_protein turned off.');
  }
  /**
   * Proteins that the GFF defines explicitly must load despite skip_protein.
   *
   * Imports the local simpleGFF.gff fixture with skip_protein set to 1 and
   * expects the feature 'FRAEX38873_v2_000000010.1.3_test_protein' to be
   * present in chado.feature afterwards.
   *
   * @group gff
   * @ticket 77
   */
  public function testGFFImporterLoadsExplicitProteins() {
    $local_gff = ['file_local' => __DIR__ . '/../data/simpleGFF.gff'];

    $analysis = factory('chado.analysis')->create();
    $organism = factory('chado.organism')->create();

    $importer_args = [
      // The option under test.
      'skip_protein' => 1,
      'analysis_id' => $analysis->analysis_id,
      'organism_id' => $organism->organism_id,
      'use_transaction' => 1,
      'add_only' => 0,
      'update' => 1,
      'create_organism' => 0,
      'create_target' => 0,
      // Regular expressions for mRNA and protein names.
      're_mrna' => NULL,
      're_protein' => NULL,
      // Optional arguments.
      'target_organism_id' => NULL,
      'target_type' => NULL,
      'start_line' => NULL,
      'landmark_type' => NULL,
      'alt_id_attr' => NULL,
    ];

    $this->loadLandmarks($analysis, $organism);
    $this->runGFFLoader($importer_args, $local_gff);

    $expected_uniquename = 'FRAEX38873_v2_000000010.1.3_test_protein';

    $select = db_select('chado.feature', 'f');
    $select->fields('f', ['uniquename']);
    $select->condition('f.uniquename', $expected_uniquename);
    $found_uniquename = $select->execute()->fetchField();

    $this->assertEquals($expected_uniquename, $found_uniquename);
  }
  /**
   * Runs the GFF3 importer once with the given arguments and file.
   *
   * The importer class file is loaded from the tripal_chado module, then the
   * importer is created, its files are prepared and it is run. Importer
   * output is not suppressed.
   *
   * @param array $run_args
   *   Arguments handed to GFF3Importer::create().
   * @param array $file
   *   File descriptor handed to GFF3Importer::create(); callers in this class
   *   pass a single 'file_local' or 'file_remote' entry.
   */
  private function runGFFLoader($run_args, $file) {
    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/GFF3Importer');

    $gff_importer = new \GFF3Importer();
    $gff_importer->create($run_args, $file);
    $gff_importer->prepareFiles();
    $gff_importer->run();
  }
  /**
   * Loads landmark sequences with the FASTA importer.
   *
   * When no landmark file is given, the remote empty_landmarks.fasta file is
   * used. Sequences are imported with seqtype 'supercontig' for the given
   * organism and analysis. Importer output is not suppressed.
   *
   * @param object $analysis
   *   Record providing the analysis_id.
   * @param object $organism
   *   Record providing the organism_id.
   * @param array $landmark_file
   *   Optional file descriptor handed to FASTAImporter::create().
   */
  private function loadLandmarks($analysis, $organism, $landmark_file = []) {
    // Fall back to the default remote landmarks when nothing usable was given.
    $landmark_file = $landmark_file ?: [
      'file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/empty_landmarks.fasta',
    ];

    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');

    $fasta_args = [
      'organism_id' => $organism->organism_id,
      'analysis_id' => $analysis->analysis_id,
      'seqtype' => 'supercontig',
      // Default method: insert and update.
      'method' => 2,
      // Default match type: unique name.
      'match_type' => 1,
      // Optional arguments.
      're_name' => NULL,
      're_uname' => NULL,
      're_accession' => NULL,
      'db_id' => NULL,
      'rel_type' => NULL,
      're_subject' => NULL,
      'parent_type' => NULL,
    ];

    $fasta_importer = new \FASTAImporter();
    $fasta_importer->create($fasta_args, $landmark_file);
    $fasta_importer->prepareFiles();
    $fasta_importer->run();
  }
  472. }