Browse Source

Add a checkbox for whether proteins should be created, plus simple tests for a successful GFF run and for GFF import with and without proteins

Bradford Condon 6 years ago
parent
commit
b521059ebc

+ 15 - 0
tests/DataFactory.php

@@ -71,3 +71,18 @@ Factory::define('chado.feature', function (Faker\Generator $faker) {
         'type_id' => factory('chado.cvterm')->create()->cvterm_id,
     ];
 });
+
+
+// Factory for fake chado.analysis rows used by the test suite.
+//
+// Every column is filled with a Faker-generated name. The program,
+// programversion and sourcename values come from Faker's unique() pool.
+// NOTE(review): presumably these three are unique because chado enforces a
+// uniqueness constraint across them - confirm against the chado schema.
+//
+// Each key is defined exactly once: a repeated key in a PHP array literal
+// silently overrides the earlier value, which would discard the unique()
+// sourcename.
+Factory::define('chado.analysis', function (Faker\Generator $faker) {
+  return [
+    'name' => $faker->name,
+    'description' => $faker->name,
+    'program' => $faker->unique()->name,
+    'programversion' => $faker->unique()->name,
+    'sourcename' => $faker->unique()->name,
+    'algorithm' => $faker->name,
+    'sourceversion' => $faker->name,
+    'sourceuri' => $faker->name,
+  ];
+});

+ 154 - 0
tests/tripal_chado/GFF3ImporterTest.php

@@ -0,0 +1,154 @@
+<?php
+
+namespace Tests;
+
+use StatonLab\TripalTestSuite\DBTransaction;
+use StatonLab\TripalTestSuite\TripalTestCase;
+
+/**
+ * Tests for the GFF3 importer, including the create_proteins option.
+ *
+ * Both tests download fixture files from the tripal_dev_seed repository, so
+ * they need network access.
+ */
+class GFF3ImporterTest extends TripalTestCase {
+
+  // Start a database transaction before each test method and roll it back
+  // afterwards.
+  use DBTransaction;
+
+  /**
+   * Remote GFF3 fixture loaded by every test in this class.
+   */
+  const GFF_URL = 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/gff/filtered.gff';
+
+  /**
+   * Remote FASTA fixture holding the landmark sequences.
+   */
+  const LANDMARK_URL = 'https://raw.githubusercontent.com/statonlab/tripal_dev_seed/master/Fexcel_mini/sequences/empty_landmarks.fasta';
+
+  /**
+   * Confirm GFF loads.
+   *
+   * Loads the landmarks and the GFF fixture, then checks that one known exon
+   * feature is present in chado.feature.
+   *
+   * @group gff
+   */
+  public function testGFFImporter() {
+    $gff_file = ['file_remote' => self::GFF_URL];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+
+    // Pass create_proteins explicitly (0 mirrors the form default) so the
+    // importer never reads a missing run argument.
+    $run_args = $this->buildGffRunArgs($analysis, $organism, 0);
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $name = 'FRAEX38873_v2_000000110.2.exon4';
+    $found = db_select('chado.feature', 'f')
+      ->fields('f', ['uniquename'])
+      ->condition('f.uniquename', $name)
+      ->execute()
+      ->fetchField();
+    $this->assertEquals($name, $found);
+  }
+
+  /**
+   * Confirm the create_proteins option controls protein creation.
+   *
+   * Runs the loader with create_proteins off and expects no polypeptide
+   * feature, then runs it again with the option on and expects the
+   * polypeptide feature to exist.
+   *
+   * @group gff
+   */
+  public function testGFFNoProteinOption() {
+    $gff_file = ['file_remote' => self::GFF_URL];
+    $analysis = factory('chado.analysis')->create();
+    $organism = factory('chado.organism')->create();
+    $run_args = $this->buildGffRunArgs($analysis, $organism, 0);
+
+    $this->loadLandmarks($analysis, $organism);
+    $this->runGFFLoader($run_args, $gff_file);
+
+    // NOTE(review): the original author was unsure whether the fixture GFF
+    // describes proteins at all; confirm the second assertion below really
+    // exercises protein creation.
+    $name = 'FRAEX38873_v2_000000110.1-protein';
+    $this->assertFalse($this->findProteinUniquename($name));
+
+    // Same file again, this time asking the loader to create proteins.
+    $run_args['create_proteins'] = 1;
+    $this->runGFFLoader($run_args, $gff_file);
+
+    $this->assertEquals($name, $this->findProteinUniquename($name));
+  }
+
+  /**
+   * Builds the run arguments handed to the GFF3 importer.
+   *
+   * @param object $analysis
+   *   Record whose analysis_id is used for the import.
+   * @param object $organism
+   *   Record whose organism_id is used for the import.
+   * @param int $create_proteins
+   *   1 to ask the loader to create proteins, 0 otherwise.
+   *
+   * @return array
+   *   The run arguments, keyed by argument name.
+   */
+  private function buildGffRunArgs($analysis, $organism, $create_proteins) {
+    return [
+      'create_proteins' => $create_proteins,
+      'analysis_id' => $analysis->analysis_id,
+      'organism_id' => $organism->organism_id,
+      'use_transaction' => 1,
+      'add_only' => 0,
+      'update' => 1,
+      'create_organism' => 0,
+      'create_target' => 0,
+      // Regular expressions for mRNA and protein names.
+      're_mrna' => NULL,
+      're_protein' => NULL,
+      // Optional arguments.
+      'target_organism_id' => NULL,
+      'target_type' => NULL,
+      'start_line' => NULL,
+      'landmark_type' => NULL,
+      'alt_id_attr' => NULL,
+    ];
+  }
+
+  /**
+   * Looks up a polypeptide feature by its unique name.
+   *
+   * @param string $uniquename
+   *   The feature uniquename to search for.
+   *
+   * @return string|false
+   *   The matching uniquename, or FALSE when no such polypeptide exists.
+   */
+  private function findProteinUniquename($uniquename) {
+    $protein_term = tripal_get_cvterm([
+      'cv_id' => ['name' => 'sequence'],
+      'name' => 'polypeptide',
+    ]);
+    $this->assertNotEmpty($protein_term, 'The sequence:polypeptide cvterm could not be found.');
+
+    // The lookup returns a cvterm record; the query condition needs its
+    // numeric id, not the record itself.
+    return db_select('chado.feature', 'f')
+      ->fields('f', ['uniquename'])
+      ->condition('f.uniquename', $uniquename)
+      ->condition('f.type_id', $protein_term->cvterm_id)
+      ->execute()
+      ->fetchField();
+  }
+
+  /**
+   * Creates, prepares and runs a GFF3 importer.
+   *
+   * @param array $run_args
+   *   Run arguments for the importer.
+   * @param array $file
+   *   File details array, e.g. ['file_remote' => $url].
+   */
+  private function runGFFLoader($run_args, $file) {
+    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/GFF3Importer');
+    $importer = new \GFF3Importer();
+    $importer->create($run_args, $file);
+    $importer->prepareFiles();
+    $importer->run();
+  }
+
+  /**
+   * Loads the landmark sequences with the FASTA importer.
+   *
+   * @param object $analysis
+   *   Record whose analysis_id is used for the import.
+   * @param object $organism
+   *   Record whose organism_id is used for the import.
+   */
+  private function loadLandmarks($analysis, $organism) {
+    $landmark_file = ['file_remote' => self::LANDMARK_URL];
+
+    $run_args = [
+      'organism_id' => $organism->organism_id,
+      'analysis_id' => $analysis->analysis_id,
+      'seqtype' => 'scaffold',
+      // Default method: insert and update.
+      'method' => 2,
+      // Default match type: unique name.
+      'match_type' => 1,
+      // Optional arguments.
+      're_name' => NULL,
+      're_uname' => NULL,
+      're_accession' => NULL,
+      'db_id' => NULL,
+      'rel_type' => NULL,
+      're_subject' => NULL,
+      'parent_type' => NULL,
+    ];
+    module_load_include('inc', 'tripal_chado', 'includes/TripalImporter/FASTAImporter');
+    $importer = new \FASTAImporter();
+    $importer->create($run_args, $landmark_file);
+    $importer->prepareFiles();
+    $importer->run();
+  }
+
+}

+ 27 - 11
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -101,6 +101,16 @@ class GFF3Importer extends TripalImporter {
       you may specify the name of the attribute to use for the name."),
     );
 
+
+    $form['advanced']['create_proteins'] = array(
+      '#type' => 'checkbox',
+      '#title' => t('Create proteins'),
+      '#required' => FALSE,
+      '#description' => t('Check the box if you would like to create the proteins described in the GFF file.'),
+      '#default_value' => FALSE
+    );
+
+
     $form['advanced']['protein_names'] = array(
       '#type' => 'fieldset',
       '#title' => t('Protein Names'),
@@ -135,7 +145,6 @@ class GFF3Importer extends TripalImporter {
        expression would be "$1-P$2".')
     );
 
-
     $form['advanced']['use_transaction']= array(
       '#type' => 'checkbox',
       '#title' => t('Use a transaction'),
@@ -316,11 +325,14 @@ class GFF3Importer extends TripalImporter {
     $re_mrna = $arguments['re_mrna'];
     $re_protein = $arguments['re_protein'];
 
+    $create_protein = $arguments['create_proteins'];
+
+
     $this->loadGFF3($file_path, $organism_id, $analysis_id,
         $add_only, $update, $refresh, $remove, $use_transaction,
         $target_organism_id, $target_type,  $create_target,
         $start_line, $landmark_type, $alt_id_attr,  $create_organism,
-        $re_mrna, $re_protein);
+        $re_mrna, $re_protein, $create_protein);
   }
 
   /**
@@ -394,6 +406,8 @@ class GFF3Importer extends TripalImporter {
    *          regular expression to extract portions from mRNA id
    * @param $re_protein A
    *          replacement string to generate the protein id
+   * @param $create_protein
+   *        BOOL: Will the loader create the described proteins?
    *
    * @ingroup gff3_loader
    */
@@ -401,7 +415,7 @@ class GFF3Importer extends TripalImporter {
       $add_only = 0, $update = 1, $refresh = 0, $remove = 0, $use_transaction = 1,
       $target_organism_id = NULL, $target_type = NULL,  $create_target = 0,
       $start_line = 1, $landmark_type = '', $alt_id_attr = '',  $create_organism = FALSE,
-      $re_mrna = '', $re_protein = '') {
+      $re_mrna = '', $re_protein = '', $create_protein = FALSE) {
 
     $ret = array();
     $date = getdate();
@@ -948,17 +962,19 @@ class GFF3Importer extends TripalImporter {
               $pfmin += $min_phase[0]->phase;
             }
 
-            // Add the new protein record.
-            $feature = $this->loadFeature($organism, $analysis_id,
+            if ($create_protein){
+              // Add the new protein record.
+              $feature = $this->loadFeature($organism, $analysis_id,
                 $protein_cvterm, $uname, $name, '', 'f', 'f', 1, 0);
-            // Add the derives_from relationship.
-            $cvterm = chado_get_cvterm(array('cvterm_id' => $result->cvterm_id));
-            $this->loadDerivesFrom($feature, $cvterm,
+              // Add the derives_from relationship.
+              $cvterm = chado_get_cvterm(array('cvterm_id' => $result->cvterm_id));
+              $this->loadDerivesFrom($feature, $cvterm,
                 $result->uniquename, $organism, $pfmin, $pfmax);
-            // Add the featureloc record. Set the start of the protein to
-            // be the start of the coding sequence minus the phase.
-            $this->loadFeatureLoc($feature, $organism, $result->landmark,
+              // Add the featureloc record. Set the start of the protein to
+              // be the start of the coding sequence minus the phase.
+              $this->loadFeatureLoc($feature, $organism, $result->landmark,
                 $pfmin, $pfmax, $result->strand, '', 'f', 'f', '', 0);
+            }
           }
         }
       }