Browse Source

Fixed the ID regular-expression check in the GFF loader; added a new test for invalid start/end values and the corresponding test GFF file

Risharde Ramnath 4 years ago
parent
commit
da939901ea

+ 4 - 0
tests/tripal_chado/data/gff_invalidstartend.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	44054	16315	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 53 - 9
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -27,10 +27,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'create_organism' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
@@ -70,10 +70,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'create_organism' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
@@ -114,10 +114,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'create_organism' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
@@ -128,7 +128,7 @@ class GFF3ImporterTest extends TripalTestCase {
     $hasException = false;
     try {    
       $this->loadLandmarks($analysis, $organism);
-      // This will produce an exception due to unescaped whitespace in ID
+      // This will produce an exception due to right arrow in ID
       $this->runGFFLoader($run_args, $gff_file);
     }
     catch(\Exception $ex) {
@@ -159,10 +159,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'create_organism' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
@@ -184,6 +184,50 @@ class GFF3ImporterTest extends TripalTestCase {
     $this->assertEquals($hasException, true);
   }
 
+  /**
+   * Run the GFF loader on gff_invalidstartend.gff for testing.
+   *
+   * The gene row in that file lists its start (44054) after its end (16315);
+   * the import should still succeed and store the gene feature.
+   */
+  public function testGFFImporterInvalidStartEnd() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_invalidstartend.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $this->loadLandmarks($analysis, $organism);
+    // No exception is expected here: the loader should accept the reversed
+    // start/end pair (presumably by swapping the values) and import the row.
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_select('chado.feature', 'f')
+      ->fields('f', ['uniquename'])
+      ->condition('f.uniquename', 'FRAEX38873_v2_000000010')
+      ->execute()
+      ->fetchAll();
+
+    // The gene must still be stored exactly once despite its reversed coordinates.
+    $this->assertCount(1, $results);
+  }
+
   /**
    * Run the GFF loader on small_gene.gff for testing.
    *

+ 4 - 1
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -2765,12 +2765,15 @@ class GFF3Importer extends TripalImporter {
     }
     else {
       $uniquename = $attrs['ID'][0];
-      preg_match('[a-zA-Z0-9.:^*$@!+_?-|]', $uniquename, $matches);
+
+      // Run ID validation checks based on GFF3 specifications 
+      preg_match('/[a-zA-Z0-9\.:\^\*\$@!\+_\?-\|]*/', $uniquename, $matches);
       if($matches[0] != $uniquename) {
         throw new Exception(t("ID !uniquename contains invalid characters. Only 
           characters included in this regular expression is allowed 
           [a-zA-Z0-9.:^*$@!+_?-|]", ['!uniquename' => $uniquename]));
       }
+
       $name = $attrs['Name'][0];
     }