Browse Source

Fixed ID regular expression check in GFF loader, added new test for invalid start end values and corresponding test gff file

Risharde Ramnath 4 years ago
parent
commit
da939901ea

+ 4 - 0
tests/tripal_chado/data/gff_invalidstartend.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	44054	16315	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 53 - 9
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -27,10 +27,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'update' => 1,
       'create_organism' => 0,
       'create_organism' => 0,
       'create_target' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_mrna' => NULL,
       're_protein' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
       'start_line' => NULL,
@@ -70,10 +70,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'update' => 1,
       'create_organism' => 0,
       'create_organism' => 0,
       'create_target' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_mrna' => NULL,
       're_protein' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
       'start_line' => NULL,
@@ -114,10 +114,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'update' => 1,
       'create_organism' => 0,
       'create_organism' => 0,
       'create_target' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_mrna' => NULL,
       're_protein' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
       'start_line' => NULL,
@@ -128,7 +128,7 @@ class GFF3ImporterTest extends TripalTestCase {
     $hasException = false;
     $hasException = false;
     try {    
     try {    
       $this->loadLandmarks($analysis, $organism);
       $this->loadLandmarks($analysis, $organism);
-      // This will produce an exception due to unescaped whitespace in ID
+      // This will produce an exception due to right arrow in ID
       $this->runGFFLoader($run_args, $gff_file);
       $this->runGFFLoader($run_args, $gff_file);
     }
     }
     catch(\Exception $ex) {
     catch(\Exception $ex) {
@@ -159,10 +159,10 @@ class GFF3ImporterTest extends TripalTestCase {
       'update' => 1,
       'update' => 1,
       'create_organism' => 0,
       'create_organism' => 0,
       'create_target' => 0,
       'create_target' => 0,
-      ///regexps for mRNA and protein.
+      // regexps for mRNA and protein.
       're_mrna' => NULL,
       're_mrna' => NULL,
       're_protein' => NULL,
       're_protein' => NULL,
-      //optional
+      // optional
       'target_organism_id' => NULL,
       'target_organism_id' => NULL,
       'target_type' => NULL,
       'target_type' => NULL,
       'start_line' => NULL,
       'start_line' => NULL,
@@ -184,6 +184,50 @@ class GFF3ImporterTest extends TripalTestCase {
     $this->assertEquals($hasException, true);
     $this->assertEquals($hasException, true);
   }
   }
 
 
+  /**
+   * Run the GFF loader on gff_invalidstartend.gff for testing.
+   *
+   * The gene row in that file has start > end (44054 vs 16315); it must still load.
+   */  
+  public function testGFFImporterInvalidStartEnd() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_invalidstartend.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // Regular expressions for mRNA and protein (unused in this test).
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // Optional arguments, all left unset.
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+   
+    $this->loadLandmarks($analysis, $organism);
+    // Not wrapped in try/catch: any exception thrown here fails the test.
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_select('chado.feature', 'f')
+      ->fields('f', ['uniquename'])
+      ->condition('f.uniquename', 'FRAEX38873_v2_000000010')
+      ->execute()
+      ->fetchAll();    
+
+    // Exactly one gene feature with this uniquename should exist. Per the
+    // original author, the loader reverses backward start/end values.
+    $this->assertEquals(count($results), 1);
+  }
+
   /**
   /**
    * Run the GFF loader on small_gene.gff for testing.
    * Run the GFF loader on small_gene.gff for testing.
    *
    *

+ 4 - 1
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -2765,12 +2765,15 @@ class GFF3Importer extends TripalImporter {
     }
     }
     else {
     else {
       $uniquename = $attrs['ID'][0];
       $uniquename = $attrs['ID'][0];
-      preg_match('[a-zA-Z0-9.:^*$@!+_?-|]', $uniquename, $matches);
+
+      // Run ID validation checks based on GFF3 specifications 
+      preg_match('/[a-zA-Z0-9\.:\^\*\$@!\+_\?-\|]*/', $uniquename, $matches);
       if($matches[0] != $uniquename) {
       if($matches[0] != $uniquename) {
         throw new Exception(t("ID !uniquename contains invalid characters. Only 
         throw new Exception(t("ID !uniquename contains invalid characters. Only 
           characters included in this regular expression is allowed 
           characters included in this regular expression is allowed 
           [a-zA-Z0-9.:^*$@!+_?-|]", ['!uniquename' => $uniquename]));
           [a-zA-Z0-9.:^*$@!+_?-|]", ['!uniquename' => $uniquename]));
       }
       }
+
       $name = $attrs['Name'][0];
       $name = $attrs['Name'][0];
     }
     }