Browse Source

Fixed several bugs in the FASTA loader and updated the alignment template

spficklin 13 years ago
parent
commit
7ea86c8786

+ 3 - 6
theme_tripal/tripal_feature/tripal_feature_featurelocs.tpl.php

@@ -44,8 +44,8 @@ if (!$sfeaturelocs) {
             $class = 'tripal_feature-table-odd-row tripal-table-even-row';
          } 
          $location = $featureloc->srcfeature_id->name .":". ($featureloc->fmin + 1) . ".." . $featureloc->fmax;
-         if($location->srcfeature_id->nid){
-           $location = "<a href=\"" . url("node/".$location->srcfeature_id->nid) . "\">".$featureloc->srcfeature_id->name .":".($featureloc->fmin + 1) . ".." . $featureloc->fmax ."</a> ";
+         if($featureloc->srcfeature_id->nid){
+           $location = "<a href=\"" . url("node/".$featureloc->srcfeature_id->nid) . "\">".$featureloc->srcfeature_id->name ."</a>:".($featureloc->fmin + 1) . ".." . $featureloc->fmax ."";
          }
          ?>
          <tr class="<?php print $class ?>">
@@ -96,14 +96,11 @@ if (!$sfeaturelocs) {
             $class = 'tripal_feature-table-odd-row tripal-table-even-row';
          } 
          $location = $featureloc->srcfeature_id->name .":". ($featureloc->fmin + 1) . ".." . $featureloc->fmax;
-         if($location->srcfeature_id->nid){
-           $location = "<a href=\"" . url("node/$location->srcfeature_id->nid") . "\">".$featureloc->srcfeature_id->name .":".($featureloc->fmin + 1) . ".." . $featureloc->fmax ."</a> ";
-         }
          ?>
          <tr class="<?php print $class ?>">
            <td><?php 
               if($featureloc->feature_id->nid){
-                 print "<a href=\"" . url("node/".$featureloc->feature_id->name) . "\">".$featureloc->feature_id->name."</a>";
+                 print "<a href=\"" . url("node/".$featureloc->feature_id->nid) . "\">".$featureloc->feature_id->name."</a>";
               } else {
                  print $featureloc->feature_id->name;
               }?>

+ 172 - 65
tripal_feature/fasta_loader.php

@@ -22,8 +22,8 @@ function tripal_feature_fasta_load_form (){
                              installation (e.g. /sites/default/files/xyz.obo).  The path must be accessible to the
                              server on which this Drupal instance is running.'),
       '#required' => TRUE,
-      '#weight'        => 1
    );
+
    // get the list of organisms
    $sql = "SELECT * FROM {organism} ORDER BY genus, species";
    $previous_db = tripal_db_set_active('chado');  // use chado database
@@ -40,14 +40,13 @@ function tripal_feature_fasta_load_form (){
      '#description' => t("Choose the organism to which these sequences are associated "),
      '#required'    => TRUE,
      '#options'     => $organisms,
-     '#weight'      => 2,
    );
-   $form['type']= array(
+
+   $form['seqtype']= array(
       '#type' => 'textfield',
       '#title' => t('Sequence Type'),
       '#required' => TRUE,
       '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the FASTA file.'),
-      '#weight' => 3
    );
 
 
@@ -72,20 +71,50 @@ function tripal_feature_fasta_load_form (){
 //     '#options'     => $libraries,
 //     '#weight'      => 5,
 //   );
-   $form['update']= array(
-      '#type' => 'checkbox',
-      '#title' => t('Insert and update'),
-      '#required' => FALSE,
-      '#description' => t('By default only new features are inserted.  Select this checkbox to update
-                           features that already exists with the contents from the FASTA file.'),
-      '#weight' => 6
+   $form['method']= array(
+      '#type' => 'radios',
+      '#title' => 'Method',
+      '#required' => TRUE,
+      '#options' => array(
+         t('Insert only'),
+         t('Update only'),
+         t('Insert and update'),
+      ),
+      '#description' => t('Select how features in the FASTA file are handled.  
+         Select "Insert only" to insert the new features. If a feature already 
+         exists with the same name or unique name and type then it is skipped.
+         Select "Update only" to only update featues that already exist in the
+         database.  Select "Insert and Update" to insert features that do
+         not exist and upate those that do.'),
+      '#default_value' => 2,
    );
 
+$form['match_type']= array(
+      '#type' => 'radios',
+      '#title' => 'Name Match Type',
+      '#required' => TRUE,
+      '#options' => array(
+         t('Name'),
+         t('Unique name'),
+      ),
+      '#description' => t('Feature data is stored in Chado with both a human-readable
+        name and a unique name. If the features in your FASTA file are identified using
+        a human-readable name then select the "Name" button. If your features are
+        identified using the unique name then select the "Unique name" button.  If you 
+        loaded your features first using the GFF loader then the unique name of each
+        features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
+        By default, the FASTA loader will use the first word (character string
+        before the first space) as  the name for your feature. If 
+        this does not uniquely identify your feature consider specifying a regular expression in the advanced section below. 
+        Additionally, you may import both a name and a unique name for each sequence using the advanced options. 
+        When updating a sequence, the value selected here will be used to identify the sequence in the 
+        database in combination with any regular expression provided below.'),
+      '#default_value' => 1,
+   );
 
    $form['analysis'] = array(
       '#type' => 'fieldset',
       '#title' => t('Analysis Used to Derive Features'),
-      '#weight'=> 6,
       '#collapsed' => TRUE
    ); 
    $form['analysis']['desc'] = array(
@@ -118,31 +147,39 @@ function tripal_feature_fasta_load_form (){
    $form['advanced'] = array(
       '#type' => 'fieldset',
       '#title' => t('Advanced Options'),
-      '#weight'=> 7,
       '#collapsed' => TRUE
    );
    $form['advanced']['re_help']= array(
       '#type' => 'item',
       '#value' => t('A regular expression is an advanced method for extracting information from a string of text.  
-                     By default, this loader will use the first word in the definition line for each sequence in the FASTA file
-                     as the uniquename for the sequences.  If this is not desired, you may use the following regular 
-                     expressions to define the postions of the unique name.'),
-      '#weight' => 0
+                     Your FASTA file may contain both a human-readable name and a unique name for each sequence.  
+                     If you want to import
+                     both the name and unique name for all sequences, then you must provide regular expressions 
+                     so that the loader knows how to separate them.  
+                     Otherwise the name and uniquename will be the same.  
+                     By default, this loader will use the first word in the definition 
+                     lines of the FASTA file
+                     as the name or unique name of the feature.'),
    );
    $form['advanced']['re_name']= array(
       '#type' => 'textfield',
       '#title' => t('Regular expression for the name'),
       '#required' => FALSE,
-      '#description' => t('Enter the regular expression that will extract the feature name from the FASTA definition line. For example, for a defintion line with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^(.*?)\|.*$"'),
-      '#weight' => 1
-   );   
+      '#description' => t('Enter the regular expression that will extract the 
+         feature name from the FASTA definition line. For example, for a 
+         defintion line with a name and unique name separated by a bar \'|\' (>seqname|uniquename), 
+         the regular expression for the name would be, "^(.*?)\|.*$".'),
+   );  
    $form['advanced']['re_uname']= array(
       '#type' => 'textfield',
       '#title' => t('Regular expression for the unique name'),
       '#required' => FALSE,
-      '#description' => t('Enter the regular expression that will extract the unique feature name for each feature from the FASTA definition line.  This name must be unique for the organism.'),
-      '#weight' => 2
+      '#description' => t('Enter the regular expression that will extract the 
+         feature name from the FASTA definition line. For example, for a 
+         defintion line with a name and unique name separated by a bar \'|\' (>seqname|uniquename), 
+         the regular expression for the unique name would be "^.*?\|(.*)$").'),
    );   
+ 
 
    // Advanced database cross-reference options
    $form['advanced']['db'] = array(
@@ -238,8 +275,9 @@ function tripal_feature_fasta_load_form (){
 function tripal_feature_fasta_load_form_validate($form, &$form_state){
    $fasta_file = trim($form_state['values']['fasta_file']);
    $organism_id  = $form_state['values']['organism_id'];
-   $type         = trim($form_state['values']['type']);
-   $update       = trim($form_state['values']['update']);
+   $type         = trim($form_state['values']['seqtype']);
+   $method       = trim($form_state['values']['method']);
+   $match_type   = trim($form_state['values']['match_type']);
    $library_id   = $form_state['values']['library_id'];
    $re_name      = trim($form_state['values']['re_name']);
    $re_uname     = trim($form_state['values']['re_uname']);
@@ -249,6 +287,33 @@ function tripal_feature_fasta_load_form_validate($form, &$form_state){
    $re_subject   = trim($form_state['values']['re_subject']);
    $parent_type   = trim($form_state['values']['parent_type']);
 
+   if($method == 0){
+      $method = 'Insert only';
+   }
+   if($method == 1){
+      $method = 'Update only';
+   }
+   if($method == 2){
+      $method = 'Insert and update';
+   }
+
+   if($match_type == 0){
+      $match_type = 'Name';
+   }
+
+   if($match_type == 1){
+      $match_type = 'Unique name';
+   }
+
+
+   if ($re_name and !$re_uname and strcmp($match_type,'Unique name')==0){
+      form_set_error('re_uname',t("You must provide a regular expression to identify the sequence unique name"));     
+   }
+
+   if (!$re_name and $re_uname and strcmp($match_type,'Name')==0){
+      form_set_error('re_name',t("You must provide a regular expression to identify the sequence name"));     
+   }
+
    // check to see if the file is located local to Drupal
    $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file; 
    if(!file_exists($dfile)){
@@ -311,8 +376,9 @@ function tripal_feature_fasta_load_form_submit ($form, &$form_state){
 
    $dfile        = $form_state['storage']['dfile'];
    $organism_id  = $form_state['values']['organism_id'];
-   $type         = trim($form_state['values']['type']);
-   $update       = trim($form_state['values']['update']);
+   $type         = trim($form_state['values']['seqtype']);
+   $method       = trim($form_state['values']['method']);
+   $match_type   = trim($form_state['values']['match_type']);
    $library_id   = $form_state['values']['library_id'];
    $re_name      = trim($form_state['values']['re_name']);
    $re_uname     = trim($form_state['values']['re_uname']);
@@ -323,9 +389,27 @@ function tripal_feature_fasta_load_form_submit ($form, &$form_state){
    $parent_type   = trim($form_state['values']['parent_type']);
    $analysis_id = $form_state['values']['analysis_id'];
 
+   if($method == 0){
+      $method = 'Insert only';
+   }
+   if($method == 1){
+      $method = 'Update only';
+   }
+   if($method == 2){
+      $method = 'Insert and update';
+   }
+
+   if($match_type == 0){
+      $match_type = 'Name';
+   }
+
+   if($match_type == 1){
+      $match_type = 'Unique name';
+   }
+
    $args = array($dfile,$organism_id,$type,$library_id,$re_name,$re_uname,
-            $re_accession,$db_id,$rel_type,$re_subject,$parent_type,$update,
-            $user->uid,$analysis_id);
+            $re_accession,$db_id,$rel_type,$re_subject,$parent_type,$method,
+            $user->uid,$analysis_id,$match_type);
 
    tripal_add_job("Import FASTA file: $dfile",'tripal_feature',
       'tripal_feature_load_fasta',$args,$user->uid);
@@ -338,7 +422,8 @@ function tripal_feature_fasta_load_form_submit ($form, &$form_state){
  */
 function tripal_feature_load_fasta($dfile, $organism_id, $type,
    $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
-   $re_subject, $parent_type, $update,$uid, $analysis_id, $job = NULL)
+   $re_subject, $parent_type, $method, $uid, $analysis_id, 
+   $match_type,$job = NULL)
 {
 
    print "Opening FASTA file $dfile\n";
@@ -367,46 +452,50 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
       // the definition line
       if(preg_match('/^>/',$line)){
          // if we have a feature name then we are starting a new sequence
-         // and we need to insert this one
+         // so let's handle the previous one before moving on
          if($name){
-           tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,
+           tripal_feature_fasta_loader_handle_feature($name,$uname,$db_id,
               $accession,$subject,$rel_type,$parent_type,$library_id,$organism_id,$type,
-              $source,$residues,$update,$re_name);
+              $source,$residues,$method,$re_name,$match_type);
            $residues = '';
            $name = '';
          }
 
          $line = preg_replace("/^>/",'',$line);
+         // get the feature name
          if($re_name){
             if(!preg_match("/$re_name/",$line,$matches)){
-               print "Regular expression for the feature name finds nothing\n";
+               print "WARNING: Regular expression for the feature name finds nothing\n";
             }
             $name = trim($matches[1]);
          } else {
             preg_match("/^\s*(.*?)[\s\|].*$/",$line,$matches);
             $name = trim($matches[1]);
-         }
+         } 
+         // get the feature unique name
          if($re_uname){
-            preg_match("/$re_uname/",$line,$matches);
+            if(!preg_match("/$re_uname/",$line,$matches)){
+               print "WARNING: Regular expression for the feature unique name finds nothing\n";
+            }
             $uname = trim($matches[1]);
          } else {
             preg_match("/^\s*(.*?)[\s\|].*$/",$line,$matches);
             $uname = trim($matches[1]);
-         }         
+         } 
+               
          preg_match("/$re_accession/",$line,$matches);
          $accession = trim($matches[1]);
          preg_match("/$re_subject/",$line,$matches);
          $subject = trim($matches[1]);
-//         print "Name: $name, UName: $uname, Accession: $accession, Subject: $subject\n";
       }
       else {
          $residues .= trim($line);
       }
    }
    // now load the last sequence in the file
-   tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,
+   tripal_feature_fasta_loader_handle_feature($name,$uname,$db_id,
       $accession,$subject,$rel_type,$parent_type,$library_id,$organism_id,$type,
-      $source,$residues,$update,$re_name);
+      $source,$residues,$method,$re_name,$match_type);
    return '';
 }
 
@@ -415,9 +504,9 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
  *
  * @ingroup fasta_loader
  */
-function tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,$accession,
+function tripal_feature_fasta_loader_handle_feature($name,$uname,$db_id,$accession,
               $parent,$rel_type,$parent_type,$library_id,$organism_id,$type, 
-              $source,$residues,$update,$re_name) 
+              $source,$residues,$method,$re_name,$match_type) 
 {
    $previous_db = tripal_db_set_active('chado');
 
@@ -434,38 +523,56 @@ function tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,$accessi
    }
 
    // check to see if this feature already exists
-   $feature_sql = "SELECT * FROM {feature} 
-                   WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
-   $feature = db_fetch_object(db_query($feature_sql,$organism_id,$uname,$cvterm->cvterm_id));
-   if(!$feature){
-      // now insert the feature
-      $sql = "INSERT INTO {feature} (organism_id, name, uniquename, residues, seqlen, md5checksum,type_id,is_analysis,is_obsolete)
-              VALUES(%d,'%s','%s','%s',%d, '%s', %d, %s, %s)";
-      $result = db_query($sql,$organism_id,$name,$uname,$residues,strlen($residues),
-                  md5($residues),$cvterm->cvterm_id,'false','false');
-      if(!$result){
-         print "ERROR: failed to insert feature '$name ($uname)'\n";
+   if(strcmp($match_type,'Name')==0){
+      $cnt_sql = "SELECT count(*) as cnt FROM {feature} 
+                      WHERE organism_id = %d and name = '%s' and type_id = %d";
+      $cnt = db_fetch_object(db_query($cnt_sql,$organism_id,$name,$cvterm->cvterm_id));
+      if($cnt->cnt > 1){
+         print "ERROR: multiple features exist with the name '$name' of type '$type' for the organism.  skipping\n";
          return 0;
       } else {
-         print "Inserted feature $name ($uname)\n";
+         $feature_sql = "SELECT * FROM {feature} 
+                      WHERE organism_id = %d and name = '%s' and type_id = %d";
+         $feature = db_fetch_object(db_query($feature_sql,$organism_id,$name,$cvterm->cvterm_id));
       }
    } else {
-       if($update){
+      $feature_sql = "SELECT * FROM {feature} 
+                      WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
+      $feature = db_fetch_object(db_query($feature_sql,$organism_id,$uname,$cvterm->cvterm_id));
+   }
 
-         // we do not want to wipe out the name if the user did not intend for this to
-         // happen.  The uniquename must match the sequence but the name may not.  
-         // so, we'll only update the name if the users specified an 're_name' regular
-         // expression.
-         if($re_name){
+   if(!$feature){
+       if(strcmp($method,'Insert only')==0 or strcmp($method,'Insert and update')==0){
+         // now insert the feature
+         $sql = "INSERT INTO {feature} 
+                    (organism_id, name, uniquename, residues, seqlen, 
+                     md5checksum,type_id,is_analysis,is_obsolete)
+                 VALUES(%d,'%s','%s','%s',%d, '%s', %d, %s, %s)";
+         $result = db_query($sql,$organism_id,$name,$uname,$residues,strlen($residues),
+                     md5($residues),$cvterm->cvterm_id,'false','false');
+         if(!$result){
+            print "ERROR: failed to insert feature '$name ($uname)'\n";
+            return 0;
+         } else {
+            print "Inserted feature $name ($uname)\n";
+         }
+      } 
+      else {
+         print "WARNING: failed to find feature '$name' ('$uname') while matching on " . strtolower($match_type) . ". Skipping\n";
+         return 0;
+      }
+   } else {
+       if(strcmp($method,'Update only')==0 or strcmp($method,'Insert and update')==0){
+         if(strcmp($match_type,'Name')==0){
             $sql = "UPDATE {feature} 
-                     SET name = '%s', residues = '%s', seqlen = '%s', md5checksum = '%s'
-                     WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
-            $result = db_query($sql,$name,$residues,strlen($residues),md5($residues),$organism_id,$uname,$cvterm->cvterm_id);
+                     SET uniquename = '%s', residues = '%s', seqlen = '%s', md5checksum = '%s'
+                     WHERE organism_id = %d and name = '%s' and type_id = %d";
+            $result = db_query($sql,$uname,$residues,strlen($residues),md5($residues),$organism_id,$name,$cvterm->cvterm_id);
          } else {
             $sql = "UPDATE {feature} 
-                     SET residues = '%s', seqlen = '%s', md5checksum = '%s'
+                     SET name = '%s', residues = '%s', seqlen = '%s', md5checksum = '%s'
                      WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
-            $result = db_query($sql,$residues,strlen($residues),md5($residues),$organism_id,$uname,$cvterm->cvterm_id);
+            $result = db_query($sql,$name,$residues,strlen($residues),md5($residues),$organism_id,$uname,$cvterm->cvterm_id);
          }
          if(!$result){
             print "ERROR: failed to update feature '$name ($uname)'\n";
@@ -474,7 +581,7 @@ function tripal_feature_fasta_loader_insert_feature($name,$uname,$db_id,$accessi
             print "Updated feature $name ($uname)\n";
          }
       } else {
-         print "WARNING: feature already exists, skipping: '$name ($uname)'\n";
+         print "WARNING: feature already exists: '$name' ('$uname'). Skipping\n";
       }
    }
    // now get the feature

+ 3 - 6
tripal_feature/tripal_feature.module

@@ -2127,9 +2127,11 @@ function tripal_feature_job_describe_args($callback,$args){
       $organism = tripal_core_chado_select('organism',array('genus','species'),array('organism_id' => $args[1]));
       $new_args['Organism'] = $organism[0]->genus." ". $organism[0]->species;
       $new_args['Sequence Type'] = $args[2];
+      $new_args['Name Match Type'] = $args[14];
       $new_args['Name RE'] = $args[4];
       $new_args['Unique Name RE'] = $args[5];
 
+
       // add in the relationship arguments
       $new_args['Relationship Type'] = $args[8];
       $new_args['Relationship Parent RE'] = $args[9];
@@ -2141,12 +2143,7 @@ function tripal_feature_job_describe_args($callback,$args){
       }
       $new_args['Database Reference'] = $db[0]->name;
       $new_args['Accession RE'] = $args[6];
-      if($args[11]){
-         $new_args['Update and Insert'] = 'Yes';
-      }
-      else {
-         $new_args['Insert Only New Features'] = 'Yes';
-      }
+      $new_args['Method'] = $args[11];
 
       // add in the analysis 
       if($args[13]){