Browse Source

Merge pull request #1133 from risharde/1040-tv3-gff3_performance

Score php unit test + GFF fix for strands specified by question marks + Strand php unit test + Remove empty files
Stephen Ficklin 4 years ago
parent
commit
842dc83253

+ 5 - 0
tests/tripal_chado/data/gff_score.gff

@@ -0,0 +1,5 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	2	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	2.5	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	mRNA	16315	44054	-2.5	+	.	ID=FRAEX38873_v2_000000010.2;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.2;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 6 - 0
tests/tripal_chado/data/gff_strand.gff

@@ -0,0 +1,6 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	1	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	2	-	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	mRNA	16315	44054	3	.	.	ID=FRAEX38873_v2_000000010.2;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.2;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	mRNA	16315	44054	4	?	.	ID=FRAEX38873_v2_000000010.3;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.3;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 6 - 0
tests/tripal_chado/data/gff_strand_invalid.gff

@@ -0,0 +1,6 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	1	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	2	-	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	mRNA	16315	44054	3	.	.	ID=FRAEX38873_v2_000000010.2;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.2;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	mRNA	16315	44054	4	a	.	ID=FRAEX38873_v2_000000010.3;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.3;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 187 - 0
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -228,6 +228,193 @@ class GFF3ImporterTest extends TripalTestCase {
     $this->assertEquals(count($results), 1);
   }
 
+  /**
+   * Run the GFF loader on gff_score.gff for testing.
+   *
+   * This tests whether the GFF loader stores the values of the score column
+   * (integer, decimal and negative) in chado.analysisfeature.significance.
+   */
+  public function testGFFImporterScoreTest() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_score.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    // Every check below fetches the row explicitly and asserts that it
+    // exists. Looping over the result set instead would skip the assertion
+    // (and let the test pass) whenever the score was not stored at all.
+
+    // Test that integer values get placed in the db
+    $row = db_query('SELECT * FROM chado.analysisfeature WHERE significance = 2 LIMIT 1')->fetchObject();
+    $this->assertNotEmpty($row, 'No analysisfeature record has a significance of 2.');
+    $this->assertEquals(2, $row->significance);
+
+    // Test that decimal/float values get placed in the db
+    $row = db_query('SELECT * FROM chado.analysisfeature WHERE significance = 2.5 LIMIT 1')->fetchObject();
+    $this->assertNotEmpty($row, 'No analysisfeature record has a significance of 2.5.');
+    $this->assertEquals(2.5, $row->significance);
+
+    // Test that negative score values get placed in the db
+    $row = db_query('SELECT * FROM chado.analysisfeature WHERE significance = -2.5 LIMIT 1')->fetchObject();
+    $this->assertNotEmpty($row, 'No analysisfeature record has a significance of -2.5.');
+    $this->assertEquals(-2.5, $row->significance);
+  }
+
+  /**
+   * Run the GFF loader on gff_strand_invalid.gff for testing.
+   *
+   * The fixture contains an mRNA whose strand column is 'a'. The test passes
+   * only if running the loader on it raises an exception.
+   */
+  public function testGFFImporterInvalidStrandTest() {
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_strand_invalid.gff'];
+
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $this->loadLandmarks($analysis, $organism);
+
+    // Record whether the loader rejected the file.
+    $exception_thrown = FALSE;
+    try {
+      $this->runGFFLoader($run_args, $gff_file);
+    }
+    catch (\Exception $e) {
+      $exception_thrown = TRUE;
+    }
+
+    $this->assertTrue($exception_thrown);
+  }
+
+  /**
+   * Run the GFF loader on gff_strand.gff for testing.
+   *
+   * This tests whether the GFF loader converts the strand column
+   * ('+', '-', '.' and '?') into the integer stored in chado.featureloc.
+   */
+  public function testGFFImporterStrandTest() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/gff_strand.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = [
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // regexps for mRNA and protein.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // optional
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    // Strand data gets saved in chado.featureloc. Map every feature of the
+    // fixture to the strand value expected in the db.
+    $expected_strands = [
+      'FRAEX38873_v2_000000010' => 1, // +
+      'FRAEX38873_v2_000000010.1' => -1, // -
+      'FRAEX38873_v2_000000010.2' => 0, // .
+      'FRAEX38873_v2_000000010.3' => 0, // ?
+    ];
+
+    foreach ($expected_strands as $uniquename => $expected_strand) {
+      $row = db_query('SELECT * FROM chado.featureloc fl
+        LEFT JOIN chado.feature f ON (fl.feature_id = f.feature_id)
+        WHERE uniquename = :uniquename LIMIT 1',
+        [':uniquename' => $uniquename]
+      )->fetchObject();
+
+      // Fail when the record is missing; iterating the result set instead
+      // would silently skip the strand assertion.
+      $this->assertNotEmpty($row, "No featureloc record found for $uniquename.");
+      $this->assertEquals($expected_strand, $row->strand, "Unexpected strand for $uniquename.");
+    }
+
+    // This GFF should create 5 featureloc records. The query counts every
+    // row of chado.featureloc, so it presumes the table holds no other
+    // records while the test runs.
+    $count = db_query('SELECT COUNT(*) as c FROM chado.featureloc;')->fetchField();
+    $this->assertEquals(5, $count);
+  }
+
   /**
    * Run the GFF loader on small_gene.gff for testing.
    *

+ 10 - 0
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -948,10 +948,20 @@ class GFF3Importer extends TripalImporter {
     $ret['start'] = $fmin;
     $ret['stop'] = $fmax;
 
+    // Check to make sure strand is exactly one of the valid characters
+    // (+, -, ? or .). The '-' comes first in the class so it is not a range.
+    if (preg_match('/\A[-+?.]\z/', $ret['strand']) !== 1) {
+      throw new Exception(t('Invalid strand detected on line !line, 
+        strand can only be +-?.',['!line' => $line]));
+    }
+
     // Format the strand for chado
     if (strcmp($ret['strand'], '.') == 0) {
       $ret['strand'] = 0;
     }
+    elseif (strcmp($ret['strand'], '?') == 0) {
+      $ret['strand'] = 0;
+    }
     elseif (strcmp($ret['strand'], '+') == 0) {
       $ret['strand'] = 1;
     }