Parcourir la source

Merge pull request #1137 from risharde/1040-tv3-gff3_performance

Numerous updates to GFF3 testing + merge fix to GFF3 code
Stephen Ficklin il y a 4 ans
Parent
commit
c17ededb49

+ 5 - 0
tests/tripal_chado/data/gff_seqid_invalid_character.gff

@@ -0,0 +1,5 @@
+##gff-version 3
+Contig'0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000011;Name=FRAEX38873_v2_000000010;biotype=protein_coding

+ 5 - 0
tests/tripal_chado/data/gff_tag_parent_verification.gff

@@ -0,0 +1,5 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010_20;Name=FRAEX38873_v2_000000010%2C20;biotype=protein_coding

+ 4 - 0
tests/tripal_chado/data/gff_tag_unescaped_character.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;TES,T=TEST;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 5 - 0
tests/tripal_chado/data/gff_tagvalue_encoded_character.gff

@@ -0,0 +1,5 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010%2C20;Name=FRAEX38873_v2_000000010%2C20;biotype=protein_coding

+ 4 - 0
tests/tripal_chado/data/gff_tagvalue_unescaped_character.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;TEST=T,EST;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 274 - 28
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -52,14 +52,13 @@ class GFF3ImporterTest extends TripalTestCase {
   }
 
   /**
-   * Run the GFF loader on gff_unescaped_ids.gff for testing.
+   * Run the GFF loader on gff_tag_unescaped_character.gff for testing.
    *
-   * This tests whether the GFF loader detects invalid ID that contains  
-   * unescaped whitespaces. The GFF loader should throw an exception which this
-   * unit test detects.
+   * This tests whether the GFF loader adds IDs that contain a comma. 
+   * The GFF loader should allow it
    */  
-  public function testGFFImporterUnescapedWhitespaceID() {
-    $gff_file = ['file_local' => __DIR__ . '/../data/gff_unescaped_ids.gff'];
+  public function testGFFImporterUnescapedTagWithComma() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_tag_unescaped_character.gff'];
     $analysis = factory('chado.analysis')->create();
     $organism = factory('chado.organism')->create();
     $run_args = [
@@ -81,29 +80,156 @@ class GFF3ImporterTest extends TripalTestCase {
       'alt_id_attr' => NULL,
     ];
 
+  
+    $this->loadLandmarks($analysis, $organism);
+    // This should throw an error based on the tag name having the comma
     $hasException = false;
-    try {    
-      $this->loadLandmarks($analysis, $organism);
-      // This will produce an exception due to unescaped whitespace in ID
+    try {
       $this->runGFFLoader($run_args, $gff_file);
     }
-    catch(\Exception $ex) {
+    catch (\Exception $ex) {
       $hasException = true;
     }
 
-    // We expect an exception to happen so we are looking for a return of true
-    $this->assertEquals($hasException, true);
+    $this->assertEquals($hasException, false);
   }
 
   /**
-   * Run the GFF loader on gff_rightarrow_ids.gff for testing.
+   * Run the GFF loader on small_gene.gff for testing.
    *
-   * This tests whether the GFF loader detects invalid ID that contains  
-   * beginning arrow >. The GFF loader should throw an exception which this
-   * unit detects.
+   * This tests whether the GFF loader adds Alias attributes
+   * The GFF loader should allow it
    */  
-  public function testGFFImporterRightArrowID() {
-    $gff_file = ['file_local' => __DIR__ . '/../data/gff_rightarrow_id.gff'];
+  public function testGFFImporterTagAliasVerification() {
+    $gff_file = ['file_local' =>
+      __DIR__ . '/../data/small_gene.gff'];
+    $fasta = ['file_local' => __DIR__ . '/../data/short_scaffold.fasta'];      
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+  
+    $this->loadLandmarks($analysis, $organism, $fasta);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_query("SELECT * FROM chado.feature_synonym fs
+      LEFT JOIN chado.synonym s ON (fs.feature_synonym_id = s.synonym_id)
+      WHERE name = 'first_test_gene'
+    ;",array());
+
+    $this->assertEquals($results->rowCount(), 1);
+  }
+
+
+  /**
+   * Run the GFF loader on gff_tag_parent_verification.gff for testing.
+   *
+   * This tests whether the GFF loader adds Parent attributes
+   * The GFF loader should allow it
+   */  
+  public function testGFFImporterTagParentVerification() {
+    $gff_file = ['file_local' =>
+      __DIR__ . '/../data/gff_tag_parent_verification.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+  
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_query("SELECT * FROM chado.feature_relationship fr
+      LEFT JOIN chado.feature f ON (fr.object_id = f.feature_id)
+      WHERE f.uniquename = 'FRAEX38873_v2_000000010'
+    ;",array());
+
+    // Found parent via object_id FRAEX38873_v2_000000010
+    $this->assertEquals($results->rowCount(), 1);
+  }
+
+  /**
+   * Run the GFF loader on gff_tagvalue_unescaped_character.gff for testing.
+   *
+   * This tests whether the GFF loader adds IDs that contain a comma. 
+   * The GFF loader should allow it
+   */  
+  public function testGFFImporterEscapedTagValueWithEncodedCharacter() {
+    $gff_file = ['file_local' => 
+      __DIR__ . '/../data/gff_tagvalue_encoded_character.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+  
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_query("SELECT * FROM chado.feature 
+      WHERE uniquename = 'FRAEX38873_v2_000000010,20';",array());
+
+    $this->assertEquals($results->rowCount(), 1);
+  }
+
+
+  /**
+   * Run the GFF loader on gff_tagvalue_unescaped_character.gff for testing.
+   *
+   * This tests whether the GFF loader adds IDs that contain a comma. 
+   * The GFF loader should allow it
+   */  
+  public function testGFFImporterUnescapedTagValueWithComma() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_tagvalue_unescaped_character.gff'];
     $analysis = factory('chado.analysis')->create();
     $organism = factory('chado.organism')->create();
     $run_args = [
@@ -125,18 +251,143 @@ class GFF3ImporterTest extends TripalTestCase {
       'alt_id_attr' => NULL,
     ];
 
+  
+    $this->loadLandmarks($analysis, $organism);
+    // This should throw an error based on the tag name having the comma
     $hasException = false;
-    try {    
-      $this->loadLandmarks($analysis, $organism);
-      // This will produce an exception due to right arrow in ID
+    try {
       $this->runGFFLoader($run_args, $gff_file);
     }
-    catch(\Exception $ex) {
+    catch (\Exception $ex) {
+      $hasException = true;
+    }
+
+    $this->assertEquals($hasException, false);
+  }
+
+  /**
+   * Run the GFF loader on gff_seqid_invalid_character.gff for testing.
+   * Seqids seem to also be called landmarks within GFF loader.
+   * This tests whether the GFF loader has any issues with characters like  
+   * single quotes.
+   */  
+  public function testGFFImporterSeqidWithInvalidCharacter() {
+    $gff_file = ['file_local' => 
+      __DIR__ . '/../data/gff_seqid_invalid_character.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+  
+    $this->loadLandmarks($analysis, $organism);
+    // This will produce an exception due to quote character in Seqid
+    $hasException = false;
+    try {
+      $this->runGFFLoader($run_args, $gff_file);
+    }
+    catch (\Exception $ex) {
       $hasException = true;
     }
 
-    // We expect an exception to happen so we are looking for a return of true
     $this->assertEquals($hasException, true);
+
+  }
+
+  /**
+   * Run the GFF loader on gff_unescaped_ids.gff for testing.
+   *
+   * This tests whether the GFF loader adds IDs that contain whitespaces. 
+   * The GFF loader should allow it
+   */  
+  public function testGFFImporterUnescapedWhitespaceID() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_unescaped_ids.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+  
+    $this->loadLandmarks($analysis, $organism);
+    // This should go through just fine
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_query("SELECT * FROM chado.feature WHERE uniquename = 
+      'FRAEX38873_v2_000000010 SPACED';");
+    $this->assertEquals($results->rowCount(), 1);
+  }
+
+  /**
+   * Run the GFF loader on gff_rightarrow_ids.gff for testing.
+   *
+   * This tests whether the GFF loader fails if ID contains  
+   * arrow >. It should not fail.
+   */  
+  public function testGFFImporterRightArrowID() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_rightarrow_id.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+ 
+    $this->loadLandmarks($analysis, $organism);
+    // This will produce an exception due to right arrow in ID
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $results = db_query("SELECT * FROM chado.feature 
+      WHERE uniquename = '>FRAEX38873_v2_000000010';");
+
+    // We expect this record to get inserted.
+    $this->assertEquals($results->rowCount(), 1);
   }
 
 
@@ -408,11 +659,6 @@ class GFF3ImporterTest extends TripalTestCase {
       $this->assertEquals($row->strand, 0); // .
     }     
 
-    // This GFF should create 5 featureloc records
-    $results = db_query('SELECT COUNT(*) as c FROM chado.featureloc;');
-    foreach ($results as $row) {
-      $this->assertEquals($row->c, 5);
-    }
   }
 
   /**

+ 0 - 1
tripal/includes/TripalImporter.inc

@@ -579,7 +579,6 @@ class TripalImporter {
     // Generate a translated message.
     $tmessage = t($message, $variables);
 
-
     // If we have a job then pass along the messaging to the job.
     if ($this->job) {
       $this->job->logMessage($message, $variables, $severity);

+ 11 - 14
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -480,7 +480,6 @@ class GFF3Importer extends TripalImporter {
    * @see TripalImporter::run()
    */
   public function run() {
-
     $arguments = $this->arguments['run_args'];
     $this->gff_file = $this->arguments['files'][0]['file_path'];
 
@@ -578,7 +577,6 @@ class GFF3Importer extends TripalImporter {
 
     // Create the cache file for storing parsed GFF entries.
     $this->openCacheFile();
-
     // Load the GFF3.
     try {
       $this->logMessage("Step  1 of 26: Caching GFF3 file...                             ");
@@ -939,6 +937,15 @@ class GFF3Importer extends TripalImporter {
     $ret['start'] = $fmin;
     $ret['stop'] = $fmax;
 
+    // Landmark (seqid) validation checks based on GFF3 specifications 
+    preg_match('/[a-zA-Z0-9\.:\^\*\$@!\+_\?-\|]*/', $ret['landmark'], $matches);
+    if ($matches[0] != $ret['landmark']) {
+      throw new Exception(t("Landmark/seqid !landmark contains invalid 
+        characters. Only characters included in this regular expression is 
+        allowed [a-zA-Z0-9.:^*$@!+_?-|]", 
+        ['!landmark' => $ret['landmark']]));
+    }
+
     // Check to make sure strand has a valid character
     if (preg_match('/[\+-\?\.]/',$ret['strand']) == false) {
       throw new Exception(t('Invalid strand detected on line !line,
@@ -1000,7 +1007,7 @@ class GFF3Importer extends TripalImporter {
 
       // Break apart each attribute into key/value pairs.
       $tag = preg_split("/=/", $attr, 2);
-
+      
       // Multiple values of an attribute are separated by commas
       $tag_name = $tag[0];
       if (!array_key_exists($tag_name, $tags)) {
@@ -1017,7 +1024,7 @@ class GFF3Importer extends TripalImporter {
         $attr_aliases = array_merge($attr_aliases, $tags[$tag_name]);
       }
       elseif (strcmp($tag_name, 'Parent') == 0) {
-        $attr_parent = urldecode($tag[1]);
+        $attr_parent = $tag[1];
       }
       elseif (strcmp($tag_name, 'Dbxref') == 0) {
         $attr_dbxref = array_merge($attr_dbxref, $tags[$tag_name]);
@@ -2765,7 +2772,6 @@ class GFF3Importer extends TripalImporter {
     elseif (!array_key_exists('Name', $attrs)) {
       $uniquename = $attrs['ID'][0];
       $name = $attrs['ID'][0];
-
     }
     elseif (!array_key_exists('ID', $attrs)) {
       $uniquename = $attrs['Name'][0];
@@ -2773,15 +2779,6 @@ class GFF3Importer extends TripalImporter {
     }
     else {
       $uniquename = $attrs['ID'][0];
-
-      // Run ID validation checks based on GFF3 specifications
-      preg_match('/[a-zA-Z0-9\.:\^\*\$@!\+_\?-\|]*/', $uniquename, $matches);
-      if($matches[0] != $uniquename) {
-        throw new Exception(t("ID !uniquename contains invalid characters. Only
-          characters included in this regular expression is allowed
-          [a-zA-Z0-9.:^*$@!+_?-|]", ['!uniquename' => $uniquename]));
-      }
-
       $name = $attrs['Name'][0];
     }