Przeglądaj źródła

Merge pull request #1124 from risharde/1040-tv3-gff3_performance

Added extra ID validation within GFF3 loader based on GFF3 specs, 2 unit tests for ID, 1 unit test for start end reversal
Stephen Ficklin 4 lat temu
rodzic
commit
172b018e30

+ 4 - 0
tests/tripal_chado/data/gff_invalidstartend.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	44054	16315	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 4 - 0
tests/tripal_chado/data/gff_rightarrow_id.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=>FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 4 - 0
tests/tripal_chado/data/gff_unescaped_ids.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010 SPACED;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 137 - 4
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -27,10 +27,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'create_organism' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
@@ -51,6 +51,95 @@ class GFF3ImporterTest extends TripalTestCase {
     $this->assertEquals($name, $query);
   }
 
+  /**
+   * Run the GFF loader on gff_unescaped_ids.gff for testing.
+   *
+   * This tests whether the GFF loader detects invalid ID that contains  
+   * unescaped whitespaces. The GFF loader should throw an exception which this
+   * unit test detects.
+   */  
+  public function testGFFImporterUnescapedWhitespaceID() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_unescaped_ids.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $hasException = false;
+    try {    
+      $this->loadLandmarks($analysis, $organism);
+      // This will produce an exception due to unescaped whitespace in ID
+      $this->runGFFLoader($run_args, $gff_file);
+    }
+    catch(\Exception $ex) {
+      $hasException = true;
+    }
+
+    // We expect an exception to happen so we are looking for a return of true
+    $this->assertEquals($hasException, true);
+  }
+
+  /**
+   * Run the GFF loader on gff_rightarrow_ids.gff for testing.
+   *
+   * This tests whether the GFF loader detects invalid ID that contains  
+   * beginning arrow >. The GFF loader should throw an exception which this
+   * unit detects.
+   */  
+  public function testGFFImporterRightArrowID() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_rightarrow_id.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $hasException = false;
+    try {    
+      $this->loadLandmarks($analysis, $organism);
+      // This will produce an exception due to right arrow in ID
+      $this->runGFFLoader($run_args, $gff_file);
+    }
+    catch(\Exception $ex) {
+      $hasException = true;
+    }
+
+    // We expect an exception to happen so we are looking for a return of true
+    $this->assertEquals($hasException, true);
+  }
+
+
   /**
    * Run the GFF loader on gff_duplicate_ids.gff for testing.
    *
@@ -70,10 +159,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'create_organism' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
@@ -95,6 +184,50 @@ class GFF3ImporterTest extends TripalTestCase {
     $this->assertEquals($hasException, true);
   }
 
+  /**
+   * Run the GFF loader on gff_invalidstartend.gff for testing.
+   *
+   * This tests whether the GFF loader fixes start end values 
+   */  
+  public function testGFFImporterInvalidStartEnd() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_invalidstartend.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+   
+    $this->loadLandmarks($analysis, $organism);
+    // This will produce an exception of duplicate feature ID
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_select('chado.feature', 'f')
+      ->fields('f', ['uniquename'])
+      ->condition('f.uniquename', 'FRAEX38873_v2_000000010')
+      ->execute()
+      ->fetchAll();    
+
+    // We expect the feature to still be added to the database
+    // since the GFF Loader caters for reversing backward numbers
+    $this->assertEquals(count($results), 1);
+  }
+
   /**
    * Run the GFF loader on small_gene.gff for testing.
    *

+ 9 - 0
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -2765,6 +2765,15 @@ class GFF3Importer extends TripalImporter {
     }
     else {
       $uniquename = $attrs['ID'][0];
+
+      // Run ID validation checks based on GFF3 specifications 
+      preg_match('/[a-zA-Z0-9\.:\^\*\$@!\+_\?-\|]*/', $uniquename, $matches);
+      if($matches[0] != $uniquename) {
+        throw new Exception(t("ID !uniquename contains invalid characters. Only 
+          characters included in this regular expression is allowed 
+          [a-zA-Z0-9.:^*$@!+_?-|]", ['!uniquename' => $uniquename]));
+      }
+
       $name = $attrs['Name'][0];
     }