Browse Source

Merge pull request #529 from tripal/77-gff_optional_proteins

add checkbox for proteins to be created GFF loader
Stephen Ficklin 6 years ago
parent
commit
85b3c2988e

+ 1 - 0
composer.lock

@@ -1657,6 +1657,7 @@
         },
         {
             "name": "statonlab/tripal-test-suite",
+
             "version": "1.4.0",
             "source": {
                 "type": "git",

+ 14 - 0
tests/DataFactory.php

@@ -72,6 +72,20 @@ Factory::define('chado.feature', function (Faker\Generator $faker) {
     ];
 });
 
+
+/**
+ * Factory for chado.analysis records filled with Faker values.
+ *
+ * program, programversion and sourcename are drawn through unique() so that
+ * repeated calls do not hand out the same combination (presumably to respect
+ * the unique key on the analysis table -- confirm against the Chado schema).
+ * A second, non-unique 'sourcename' entry used to follow 'algorithm'; PHP
+ * keeps the last duplicate key, so it silently replaced the unique value and
+ * has been removed.
+ *
+ * @see StatonLab\TripalTestSuite\Database\Factory::define()
+ */
+Factory::define('chado.analysis', function (Faker\Generator $faker) {
+  return [
+    'name' => $faker->name,
+    'description' => $faker->name,
+    'program' => $faker->unique()->name,
+    'programversion' => $faker->unique()->name,
+    'sourcename' => $faker->unique()->name,
+    'algorithm' => $faker->name,
+    'sourceversion' => $faker->name,
+    'sourceuri' => $faker->name,
+  ];
+});
 /** @see  StatonLab\TripalTestSuite\Database\Factory::define() */
 Factory::define('tripal_jobs', function (Faker\Generator $faker) {
   return [

+ 1 - 0
tests/README.md

@@ -50,6 +50,7 @@ Tests must end with `Test.php` to be recognized by PHPUnit.  The tests themselve
 * fields
 * entities
 * admin
+* loaders
 
 So for example, tests for the file `tripal/api/tripal.jobs.api.inc` should go in `tests/tripal/api/TripalJobsAPITest.php`. tests that don't fit in any of these categories should be placed in `tests/[submodule]/`.
 

+ 4 - 0
tests/tripal_chado/data/simpleGFF.gff

@@ -0,0 +1,4 @@
+##gff-version 3
+Contig0	FRAEX38873_v2	gene	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010;biotype=protein_coding
+Contig0	FRAEX38873_v2	mRNA	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1;Parent=FRAEX38873_v2_000000010;Name=FRAEX38873_v2_000000010.1;biotype=protein_coding;AED=0.05
+Contig0	FRAEX38873_v2	polypeptide	16315	44054	.	+	.	ID=FRAEX38873_v2_000000010.1.3_test_protein;Parent=FRAEX38873_v2_000000010.1

+ 205 - 0
tests/tripal_chado/loaders/GFF3ImporterTest.php

@@ -0,0 +1,205 @@
+<?php
+
+namespace Tests;
+
+use StatonLab\TripalTestSuite\DBTransaction;
+use StatonLab\TripalTestSuite\TripalTestCase;
+
+/**
+ * Tests for the Chado GFF3 importer, including the skip_protein option.
+ *
+ * Each test creates a throw-away analysis and organism, loads the landmark
+ * sequences with the FASTA importer and then runs the GFF3 importer against
+ * either a remote or a local GFF3 file.
+ */
+class GFF3ImporterTest extends TripalTestCase {
+
+  use DBTransaction;
+
+  /**
+   * Confirm basic GFF importer functionality.
+   *
+   * @group gff
+   */
+  public function testGFFImporter() {
+    $gff_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($this->buildRunArgs($analysis, $organism), $gff_file);
+
+    $name = 'FRAEX38873_v2_000000110.2.exon4';
+    $this->assertEquals($name, $this->findFeatureUniquename($name));
+  }
+
+  /**
+   * Add a skip protein option.  Test that when checked, implicit proteins are
+   * not created, but that they are created when unchecked.
+   *
+   * @group gff
+   * @ticket 77
+   */
+  public function testGFFNoProteinOption() {
+    $gff_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = $this->buildRunArgs($analysis, $organism, 1);
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $protein_type_id = tripal_get_cvterm([
+      'cv_id' => ['name' => 'sequence'],
+      'name' => 'polypeptide',
+    ]);
+
+    // This protein is expected to be an implicit one (not declared in the
+    // GFF file itself), so it must be absent while skip_protein is set.
+    $name = 'FRAEX38873_v2_000000110.1-protein';
+    $this->assertFalse($this->findFeatureUniquename($name, $protein_type_id->cvterm_id));
+
+    // Re-run with automatic protein creation enabled: the protein must now
+    // exist.
+    $run_args['skip_protein'] = 0;
+    $this->runGFFLoader($run_args, $gff_file);
+
+    // fetchField() is used for both checks so a missing row is reported as a
+    // plain assertion failure rather than a property access on FALSE.
+    $this->assertEquals($name, $this->findFeatureUniquename($name, $protein_type_id->cvterm_id));
+  }
+
+  /**
+   * The GFF importer should still create explicitly defined proteins if
+   * skip_protein is true.
+   *
+   * @group gff
+   * @ticket 77
+   */
+  public function testGFFImporterLoadsExplicitProteins() {
+    $gff_file = ['file_local' => __DIR__ . '/../data/simpleGFF.gff'];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($this->buildRunArgs($analysis, $organism, 1), $gff_file);
+
+    $name = 'FRAEX38873_v2_000000010.1.3_test_protein';
+    $this->assertEquals($name, $this->findFeatureUniquename($name));
+  }
+
+  /**
+   * Builds the argument array handed to the GFF3 importer.
+   *
+   * skip_protein is always present because the importer's run() method reads
+   * $arguments['skip_protein'] unconditionally; leaving it out raises an
+   * undefined index notice.
+   *
+   * @param object $analysis
+   *   Record returned by the chado.analysis factory.
+   * @param object $organism
+   *   Record returned by the chado.organism factory.
+   * @param int $skip_protein
+   *   1 to skip automatic protein creation, 0 (default) to allow it.
+   *
+   * @return array
+   *   Run arguments for GFF3Importer::create().
+   */
+  private function buildRunArgs($analysis, $organism, $skip_protein = 0) {
+    return [
+      'skip_protein' => $skip_protein,
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // Regular expressions for mRNA and protein names.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // Optional arguments.
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+  }
+
+  /**
+   * Looks up a feature by uniquename, optionally restricted to one type.
+   *
+   * @param string $uniquename
+   *   The chado.feature uniquename to search for.
+   * @param int|null $type_id
+   *   When given, only features with this type_id are matched.
+   *
+   * @return string|false
+   *   The uniquename of the first matching feature, or FALSE when no row
+   *   matches.
+   */
+  private function findFeatureUniquename($uniquename, $type_id = NULL) {
+    $select = db_select('chado.feature', 'f')
+      ->fields('f', ['uniquename'])
+      ->condition('f.uniquename', $uniquename);
+    if ($type_id !== NULL) {
+      $select->condition('f.type_id', $type_id);
+    }
+    return $select->execute()->fetchField();
+  }
+
+  /**
+   * Creates a GFF3 importer for the given arguments and file and runs it.
+   *
+   * @param array $run_args
+   *   Importer arguments, see buildRunArgs().
+   * @param array $file
+   *   File descriptor, keyed by 'file_remote' or 'file_local'.
+   */
+  private function runGFFLoader($run_args, $file) {
+    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/GFF3Importer');
+    $importer = new \GFF3Importer();
+    $importer->create($run_args, $file);
+    $importer->prepareFiles();
+    $importer->run();
+  }
+
+  /**
+   * Loads the landmark sequences the GFF files refer to via the FASTA
+   * importer.
+   *
+   * @param object $analysis
+   *   Record returned by the chado.analysis factory.
+   * @param object $organism
+   *   Record returned by the chado.organism factory.
+   */
+  private function loadLandmarks($analysis, $organism) {
+    $landmark_file = ['file_remote' => 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/empty_landmarks.fasta'];
+
+    $run_args = [
+      'organism_id' => $organism->organism_id,
+      'analysis_id' => $analysis->analysis_id,
+      'seqtype' => 'scaffold',
+      'method' => 2, //default insert and update
+      'match_type' => 1, //unique name default
+      // Optional arguments.
+      're_name' => NULL,
+      're_uname' => NULL,
+      're_accession' => NULL,
+      'db_id' => NULL,
+      'rel_type' => NULL,
+      're_subject' => NULL,
+      'parent_type' => NULL,
+    ];
+    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');
+    $importer = new \FASTAImporter();
+    $importer->create($run_args, $landmark_file);
+    $importer->prepareFiles();
+    $importer->run();
+  }
+
+}

+ 10 - 5
tests/tripal_chado/TaxonomyImporterTest.php → tests/tripal_chado/loaders/TaxonomyImporterTest.php

@@ -5,27 +5,29 @@ namespace Tests\tripal_chado;
 use StatonLab\TripalTestSuite\DBTransaction;
 use StatonLab\TripalTestSuite\TripalTestCase;
 
-require_once(__DIR__ . '/../../tripal_chado/includes/TripalImporter/TaxonomyImporter.inc');
-
 
 class TaxonomyImporterTest extends TripalTestCase {
+
   use DBTransaction;
 
 
   /*
    * Adds an organism and checks that the importer runs and adds some properties to it.
+   *
    */
   public function testImportExistingTaxonomyLoader() {
+    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/TaxonomyImporter');
+
     $org = [
       'genus' => 'Armadillo',
       'species' => 'officinalis',
       'abbreviation' => 'A. officinalis',
       'common_name' => 'pillbug',
-      'type_id' => null
+      'type_id' => NULL,
     ];
 
     $organism = factory('chado.organism')->create($org);
-  //  $this->publish('organism');
+    //  $this->publish('organism');
     $file = [];
     $run_args = ['import_existing' => TRUE];
     $importer = new \TaxonomyImporter();
@@ -45,7 +47,8 @@ class TaxonomyImporterTest extends TripalTestCase {
   }
 
   /**
-   * the importer can also load an array of pubmed ids.  We use the pillbug again.
+   * The importer can also load organisms from NCBI taxonomy IDs.  We use the
+   * pillbug again.
    *
    * https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=96821
    *
@@ -53,6 +56,8 @@ class TaxonomyImporterTest extends TripalTestCase {
    */
   public function testImportOrganismFromTaxID() {
 
+    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/TaxonomyImporter');
+
     $file = [];
     $run_args = ['taxonomy_ids' => '96821']; //its the pillbug again!
     $importer = new \TaxonomyImporter();

+ 26 - 11
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -101,6 +101,16 @@ class GFF3Importer extends TripalImporter {
       you may specify the name of the attribute to use for the name."),
     );
 
+
+    $form['advanced']['skip_protein'] = array(
+      '#type' => 'checkbox',
+      '#title' => t('Skip automatic protein creation'),
+      '#required' => FALSE,
+      '#description' => t('The GFF loader will automatically create a protein feature for each transcript in the GFF file if a protein feature is missing in the GFF file. Check this box to disable this functionality. Protein features that are specifically present in the GFF will always be created.'),
+      '#default_value' => 0
+    );
+
+
     $form['advanced']['protein_names'] = array(
       '#type' => 'fieldset',
       '#title' => t('Protein Names'),
@@ -135,7 +145,6 @@ class GFF3Importer extends TripalImporter {
        expression would be "$1-P$2".')
     );
 
-
     $form['advanced']['use_transaction']= array(
       '#type' => 'checkbox',
       '#title' => t('Use a transaction'),
@@ -315,12 +324,14 @@ class GFF3Importer extends TripalImporter {
     $create_organism = $arguments['create_organism'];
     $re_mrna = $arguments['re_mrna'];
     $re_protein = $arguments['re_protein'];
+    $skip_protein = $arguments['skip_protein'];
+
 
     $this->loadGFF3($file_path, $organism_id, $analysis_id,
         $add_only, $update, $refresh, $remove, $use_transaction,
         $target_organism_id, $target_type,  $create_target,
         $start_line, $landmark_type, $alt_id_attr,  $create_organism,
-        $re_mrna, $re_protein);
+        $re_mrna, $re_protein, $skip_protein);
   }
 
   /**
@@ -394,6 +405,8 @@ class GFF3Importer extends TripalImporter {
    *          regular expression to extract portions from mRNA id
    * @param $re_protein A
    *          replacement string to generate the protein id
+   * @param $skip_protein
+   *          Set to 1 to skip creating the proteins inferred from the CDS; with
+   *          the default of 0 the loader creates them.
    *
    * @ingroup gff3_loader
    */
@@ -401,7 +414,7 @@ class GFF3Importer extends TripalImporter {
       $add_only = 0, $update = 1, $refresh = 0, $remove = 0, $use_transaction = 1,
       $target_organism_id = NULL, $target_type = NULL,  $create_target = 0,
       $start_line = 1, $landmark_type = '', $alt_id_attr = '',  $create_organism = FALSE,
-      $re_mrna = '', $re_protein = '') {
+      $re_mrna = '', $re_protein = '', $skip_protein = 0) {
 
     $ret = array();
     $date = getdate();
@@ -948,17 +961,19 @@ class GFF3Importer extends TripalImporter {
               $pfmin += $min_phase[0]->phase;
             }
 
-            // Add the new protein record.
-            $feature = $this->loadFeature($organism, $analysis_id,
+            if ($skip_protein == 0){
+              // Add the new protein record.
+              $feature = $this->loadFeature($organism, $analysis_id,
                 $protein_cvterm, $uname, $name, '', 'f', 'f', 1, 0);
-            // Add the derives_from relationship.
-            $cvterm = chado_get_cvterm(array('cvterm_id' => $result->cvterm_id));
-            $this->loadDerivesFrom($feature, $cvterm,
+              // Add the derives_from relationship.
+              $cvterm = chado_get_cvterm(array('cvterm_id' => $result->cvterm_id));
+              $this->loadDerivesFrom($feature, $cvterm,
                 $result->uniquename, $organism, $pfmin, $pfmax);
-            // Add the featureloc record. Set the start of the protein to
-            // be the start of the coding sequence minus the phase.
-            $this->loadFeatureLoc($feature, $organism, $result->landmark,
+              // Add the featureloc record. Set the start of the protein to
+              // be the start of the coding sequence minus the phase.
+              $this->loadFeatureLoc($feature, $organism, $result->landmark,
                 $pfmin, $pfmax, $result->strand, '', 'f', 'f', '', 0);
+            }
           }
         }
       }