Browse Source

Merge pull request #1134 from risharde/1040-tv3-gff3_performance

Fix for GFF if phase value is invalid + unit tests for the gff phase + gff test files
Stephen Ficklin 5 years ago
parent
commit
4b60f9b9e6

+ 6 - 0
tests/tripal_chado/data/gff_phase.gff

@@ -0,0 +1,6 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	five_prime_UTR	16315	16557	.	+	.	ID=FRAEX38873_v2_000000010.1.5utr1;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	exon	16315	16967	.	+	.	ID=FRAEX38873_v2_000000010.1.exon1;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	CDS	16558	16967	.	+	1	ID=FRAEX38873_v2_000000010.1.cds1;Parent=FRAEX38873_v2_000000010.1

+ 6 - 0
tests/tripal_chado/data/gff_phase_invalid_character.gff

@@ -0,0 +1,6 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	five_prime_UTR	16315	16557	.	+	.	ID=FRAEX38873_v2_000000010.1.5utr1;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	exon	16315	16967	.	+	.	ID=FRAEX38873_v2_000000010.1.exon1;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	CDS	16558	16967	.	+	a	ID=FRAEX38873_v2_000000010.1.cds1;Parent=FRAEX38873_v2_000000010.1

+ 6 - 0
tests/tripal_chado/data/gff_phase_invalid_number.gff

@@ -0,0 +1,6 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	five_prime_UTR	16315	16557	.	+	.	ID=FRAEX38873_v2_000000010.1.5utr1;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	exon	16315	16967	.	+	.	ID=FRAEX38873_v2_000000010.1.exon1;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	CDS	16558	16967	.	+	3	ID=FRAEX38873_v2_000000010.1.cds1;Parent=FRAEX38873_v2_000000010.1

+ 139 - 0
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -415,6 +415,145 @@ class GFF3ImporterTest extends TripalTestCase {
     }
   }
 
+  /**
+   * Run the GFF loader on gff_phase.gff for testing.
+   *
+   * This tests whether the GFF loader interprets the phase values correctly
+   * for CDS rows.
+   */  
+  public function testGFFImporterPhaseTest() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_phase.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+   
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_query("SELECT * FROM chado.feature 
+      WHERE uniquename = :uniquename LIMIT 1", array(
+      ':uniquename' => 'FRAEX38873_v2_000000010.1.cds1'
+      )
+    );
+
+    // Check to make sure it returns a single row (implying a match)
+    // by the uniquename specified
+    $this->assertEquals($results->rowCount(), 1);
+
+    $results = db_query("SELECT * FROM chado.featureloc 
+      WHERE phase = 1 LIMIT 1", array(
+      )
+    );
+
+    // Check to make sure it returns a single row (implying a match)
+    // by phase value 1
+    $this->assertEquals($results->rowCount(), 1);    
+  }
+
+  /**
+   * Run the GFF loader on gff_phase_invalid_number.gff for testing.
+   *
+   * This tests whether the GFF loader interprets the phase values correctly
+   * for CDS rows when a number outside of the range 0,1,2 is specified.
+   */  
+  public function testGFFImporterInvalidPhaseNumberTest() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_phase_invalid_number.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $this->loadLandmarks($analysis, $organism);
+    $hasException = false;
+    try {
+      $this->runGFFLoader($run_args, $gff_file);
+    }
+    catch (\Exception $ex) {
+      $hasException = true;
+    }
+
+    // An exception should have been thrown since the phase number is invalid
+    $this->assertEquals($hasException, true);
+  }
+
+  /**
+   * Run the GFF loader on gff_phase_invalid_character.gff for testing.
+   *
+   * This tests whether the GFF loader interprets the phase values correctly
+   * for CDS rows when a character outside of the range 0,1,2 is specified.
+   */  
+  public function testGFFImporterInvalidPhaseCharacterTest() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_phase_invalid_character.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $this->loadLandmarks($analysis, $organism);
+    $hasException = false;
+    try {
+      $this->runGFFLoader($run_args, $gff_file);
+    }
+    catch (\Exception $ex) {
+      $hasException = true;
+    }
+
+    // An exception should have been thrown since the phase number is invalid
+    $this->assertEquals($hasException, true);
+  }
+
+
+
   /**
    * Run the GFF loader on small_gene.gff for testing.
    *

+ 8 - 1
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -950,7 +950,6 @@ class GFF3Importer extends TripalImporter {
 
     // Check to make sure strand has a valid character
     if (preg_match('/[\+-\?\.]/',$ret['strand']) == false) {
-      print_r($ret['strand']);
       throw new Exception(t('Invalid strand detected on line !line, 
         strand can only be +-?.',['!line' => $line]));      
     }    
@@ -968,6 +967,14 @@ class GFF3Importer extends TripalImporter {
     elseif (strcmp($ret['strand'], '-') == 0) {
       $ret['strand'] = -1;
     }
+
+    
+    if (preg_match('/[012\.]/',$ret['phase']) == false) {
+      throw new Exception(t('Invalid phase detected on line !line, 
+        phase can only be 0,1,2 or . (period)',['!line' => $line]));      
+    }
+    
+
     if (strcmp($ret['phase'], '.') == 0) {
       if ($ret['type'] == 'cds') {
         $ret['phase'] = '0';