Browse Source

Fixed the get sequence function to work with Postgres 8.4 and fixed the annotated feature so that it can properly display overlapping parts and show the sequence in reverse complement if on the reverse strand

spficklin 12 years ago
parent
commit
91184c553b

+ 69 - 61
tripal_feature/api/tripal_feature.api.inc

@@ -508,7 +508,7 @@ function tripal_feature_delete_property_by_id($featureprop_id) {
  *
  * @ingroup tripal_feature_api
  */
-function trpial_feature_reverse_complement($sequence) {
+function tripal_feature_reverse_complement($sequence) {
 
   $seq = strtoupper($sequence);
   $seq = strrev($seq);
@@ -566,7 +566,7 @@ function trpial_feature_reverse_complement($sequence) {
  */
 function trpial_feature_get_formatted_sequence($feature_id, $feature_name, 
   $num_bases_per_line, $derive_from_parent, $aggregate, $output_format,
-  $upstream, $downstream) {
+  $upstream, $downstream, $sub_features = array()) {
   
   // to speed things up we need to make sure we have a persistent connection
   $connection = tripal_db_persistent_chado(); 
@@ -578,18 +578,18 @@ function trpial_feature_get_formatted_sequence($feature_id, $feature_name,
      $downstream = 0;
   }
   
-  // if we need to get the sequence from the parent but there is no aggregation
-  // then do so now.
-  if ($derive_from_parent) {             
-    
-    // execute our prepared statement
-    if (!tripal_core_is_sql_prepared('sequence_by_parent')) {
-      // prepare the queries we're going to use later during the render phase
-      // This SQL statement uses conditionals in the select clause to handle
-      // cases cases where the alignment is in the reverse direction and when
-      // the upstream and downstream extensions go beyond the lenght of the 
-      // parent sequence.
-      $psql ='PREPARE sequence_by_parent (int, int, int) AS 
+  // prepare statements we'll need to use later
+  if (!tripal_core_is_sql_prepared('sequence_by_parent')) {
+    // prepare the queries we're going to use later during the render phase
+    // This SQL statement uses conditionals in the select clause to handle
+    // cases cases where the alignment is in the reverse direction and when
+    // the upstream and downstream extensions go beyond the lenght of the 
+    // parent sequence.
+    $psql ='PREPARE sequence_by_parent (int, int, int) AS 
+            SELECT srcname, srcfeature_id, strand, srctypename, typename,
+              fmin, fmax, upstream, downstream, adjfmin, adjfmax, 
+              substring(residues from (adjfmin + 1) for (upstream + (fmax - fmin) + downstream))  as residues
+            FROM (
               SELECT
                 OF.name srcname, FL.srcfeature_id, FL.strand, 
                 OCVT.name as srctypename, SCVT.name as typename,
@@ -641,59 +641,60 @@ function trpial_feature_get_formatted_sequence($feature_id, $feature_name,
                        WHEN FL.fmin - $2 <= 0 THEN FL.fmin
                        ELSE $2
                     END                   
-                END as downstream,                                          
-                substring(OF.residues from (adjfmin + 1) for (upstream + (FL.fmax - FL.fmin) + downstream))  as residues
+                END as downstream,  
+                OF.residues                                                     
               FROM featureloc FL 
                 INNER JOIN feature SF on FL.feature_id = SF.feature_id
                 INNER JOIN cvterm SCVT on SF.type_id = SCVT.cvterm_id
                 INNER JOIN feature OF on FL.srcfeature_id = OF.feature_id                
                 INNER JOIN cvterm OCVT on OF.type_id = OCVT.cvterm_id
-              WHERE SF.feature_id = $3';
-              
-      $status = chado_query($psql);
-exit;    
-      if (!$status) {
-        watchdog('tripal_views_handler_field_sequence', 
-          "init: not able to prepare SQL statement '%name'", 
-          array('%name' => 'sequence_by_parent'), 'WATCHDOG ERROR');
-      }
-      // this query is meant to get all of the sub features of any given
-      // feature (arg #1) and order them as they appear on the reference
-      // feature (arg #2).
-      $psql ='PREPARE sub_features (int, int) AS 
-              SELECT SF.feature_id, CVT.name as type_name, SF.type_id
-              FROM feature_relationship FR
-                INNER JOIN feature SF on SF.feature_id = FR.subject_id
-                INNER JOIN cvterm CVT on CVT.cvterm_id = SF.type_id
-                INNER JOIN featureloc FL on FL.feature_id = FR.subject_id
-                INNER JOIN feature PF on PF.feature_id = FL.srcfeature_id
-              WHERE FR.object_id = $1 and PF.feature_id = $2
-              ORDER BY FL.fmin ASC';
-              
-      $status = chado_query($psql);
-      if (!$status) {
-        watchdog('tripal_views_handler_field_sequence', 
-          "init: not able to prepare SQL statement '%name'", 
-          array('%name' => 'ssub_features'), 'WATCHDOG ERROR');
-      }
-      $psql ='PREPARE count_sub_features (int, int) AS 
-              SELECT count(*) as num_children
-              FROM feature_relationship FR
-                INNER JOIN feature SF on SF.feature_id = FR.subject_id
-                INNER JOIN cvterm CVT on CVT.cvterm_id = SF.type_id
-                INNER JOIN featureloc FL on FL.feature_id = FR.subject_id
-                INNER JOIN feature PF on PF.feature_id = FL.srcfeature_id
-              WHERE FR.object_id = $1 and PF.feature_id = $2';
-              
-      $status = chado_query($psql);
-      if (!$status) {
-        watchdog('tripal_views_handler_field_sequence', 
-          "init: not able to prepare SQL statement '%name'", 
-          array('%name' => 'count_sub_features'), 'WATCHDOG ERROR');
-      }
+              WHERE SF.feature_id = $3) as tbl1
+    ';              
+    $status = chado_query($psql);
+    if (!$status) {
+      watchdog('tripal_views_handler_field_sequence', 
+        "init: not able to prepare SQL statement '%name'", 
+        array('%name' => 'sequence_by_parent'), 'WATCHDOG ERROR');
     }
     
-    // execute the query
+    // this query is meant to get all of the sub features of any given
+    // feature (arg #1) and order them as they appear on the reference
+    // feature (arg #2).
+    $psql ='PREPARE sub_features (int, int) AS 
+            SELECT SF.feature_id, CVT.name as type_name, SF.type_id
+            FROM feature_relationship FR
+              INNER JOIN feature SF on SF.feature_id = FR.subject_id
+              INNER JOIN cvterm CVT on CVT.cvterm_id = SF.type_id
+              INNER JOIN featureloc FL on FL.feature_id = FR.subject_id
+              INNER JOIN feature PF on PF.feature_id = FL.srcfeature_id
+            WHERE FR.object_id = $1 and PF.feature_id = $2
+            ORDER BY FL.fmin ASC';            
+    $status = chado_query($psql);
+    if (!$status) {
+      watchdog('tripal_views_handler_field_sequence', 
+        "init: not able to prepare SQL statement '%name'", 
+        array('%name' => 'ssub_features'), 'WATCHDOG ERROR');
+    }
+    $psql ='PREPARE count_sub_features (int, int) AS 
+            SELECT count(*) as num_children
+            FROM feature_relationship FR
+              INNER JOIN feature SF on SF.feature_id = FR.subject_id
+              INNER JOIN cvterm CVT on CVT.cvterm_id = SF.type_id
+              INNER JOIN featureloc FL on FL.feature_id = FR.subject_id
+              INNER JOIN feature PF on PF.feature_id = FL.srcfeature_id
+            WHERE FR.object_id = $1 and PF.feature_id = $2';            
+    $status = chado_query($psql);
+    if (!$status) {
+      watchdog('tripal_views_handler_field_sequence', 
+        "init: not able to prepare SQL statement '%name'", 
+        array('%name' => 'count_sub_features'), 'WATCHDOG ERROR');
+    }
+  }
+    
+  // if we need to get the sequence from the parent then do so now.
+  if ($derive_from_parent) {                  
+    
+    // execute the query to get the sequence from the parent
     $sql = "EXECUTE sequence_by_parent (%d, %d, %d)";
     $parents = chado_query($sql, $upstream, $downstream, $feature_id);
 
@@ -715,6 +716,13 @@ exit;
         $types = array();
         $i = 0;
         while ($child = db_fetch_object($children)) {
+          // if the callee has specified that only certain sub features should be
+          // included then continue of this child is not one of those allowed
+          // subfeatures
+          if (count($sub_features) > 0 and !in_array($child->type_name, $sub_features)){
+             continue;
+          }
+        
           // keep up with the types
           if (!in_array($child->type_name, $types)) {
             $types[] = $child->type_name;
@@ -772,7 +780,7 @@ exit;
       // get the reverse compliment if feature is on the reverse strand
       $dir = 'forward';
       if ($parent->strand < 0) {
-        $seq = trpial_feature_reverse_complement($seq);
+        $seq = tripal_feature_reverse_complement($seq);
         $dir = 'reverse';
       }
       

+ 5 - 1
tripal_feature/tripal_feature.drush.inc

@@ -37,6 +37,7 @@ function tripal_feature_drush_command() {
       'out'    => dt('The output format. Valid options are "fasta_html", "fasta_txt" and raw.'),
       'parent' => dt('Set this argument to 1 to retrieve the sequence from the parent in an alignment rather than the residues column of the feature itself.'),
       'agg'    => dt('Set this argument to 1 to aggregate sub features into a single sequence.  This is useful, for example, for obtaining CDS sequence from an mRNA'),
+      'child'  => dt('Set this argument to the sequence ontology term for the children to aggregate.  This is useful in the case where a gene has exons as well as CDSs and UTRs.  You may sepcify as many feature types as desired by separating each with a single comma (no spaces).'),
     ),
     'examples' => array(
       'Standard example' => 'drush tripal-current-job',
@@ -61,7 +62,10 @@ function drush_tripal_feature_tripal_get_sequence() {
   $output_format = drush_get_option('out');
   $derive_from_parent = drush_get_option('parent');
   $aggregate = drush_get_option('agg');
+  $child = drush_get_option('child');
   
+  $sub_features = explode(',',$child);
+    
   if (!$output_format) {
      $output_format = 'fasta_txt';
   }
@@ -98,7 +102,7 @@ function drush_tripal_feature_tripal_get_sequence() {
       
     $sequence = trpial_feature_get_formatted_sequence($feature_id, $feature_name, 
       $num_bases_per_line, $derive_from_parent, $aggregate, $output_format,
-      $upstream, $downstream);
+      $upstream, $downstream, $sub_features);
     print $sequence;
   }
 }

+ 139 - 60
tripal_feature/tripal_feature.module

@@ -331,15 +331,15 @@ function tripal_feature_block($op = 'list', $delta = 0, $edit=array()) {
       $blocks['sequence']['info'] = t('Tripal Feature Sequence');
       $blocks['sequence']['cache'] = BLOCK_NO_CACHE;
 
+      $blocks['featureloc_sequences']['info'] = t('Tripal Feature Annotated Sequence');
+      $blocks['featureloc_sequences']['cache'] = BLOCK_NO_CACHE;
+
       $blocks['synonyms']['info'] = t('Tripal Feature Synonyms');
       $blocks['synonyms']['cache'] = BLOCK_NO_CACHE;
 
       $blocks['properties']['info'] = t('Tripal Feature Properties');
       $blocks['properties']['cache'] = BLOCK_NO_CACHE;;
 
-      $blocks['featureloc_sequences']['info'] = t('Tripal Formatted Sequence');
-      $blocks['featureloc_sequences']['cache'] = BLOCK_NO_CACHE;
-
       $blocks['alignments']['info'] = t('Tripal Feature Alignments');
       $blocks['alignments']['cache'] = BLOCK_NO_CACHE;
 
@@ -1302,39 +1302,75 @@ function tripal_feature_load_featureloc_sequences($feature_id, $featurelocs) {
       // keep track of the individual parts for each relationship
       $start = $rel->featurelocs->$src->fmin;
       $end   = $rel->featurelocs->$src->fmax;
-      $rel_locs[$src]['parts'][$start]['type']  = $rel->subject_type;
-      $rel_locs[$src]['parts'][$start]['start'] = $start;
-      $rel_locs[$src]['parts'][$start]['end']   = $end;
+      $type  = $rel->subject_type;
+      $rel_locs[$src]['parts'][$start][$type]['start'] = $start;
+      $rel_locs[$src]['parts'][$start][$type]['end']   = $end;
+      $rel_locs[$src]['parts'][$start][$type]['type']  = $type;
     }
   }
 
-   // the featurelocs array provided to the function contains the locations
-   // where this feature is found.   We want to get the sequence for each
-   // location and then annotate it with the parts found from the relationships
-   // locations determiend above.
-  $sql = "SELECT residues FROM {feature} WHERE feature_id = %d";
+  // the featurelocs array provided to the function contains the locations
+  // where this feature is found.   We want to get the sequence for each
+  // location and then annotate it with the parts found from the relationships
+  // locations determiend above.
+  $sql = "SELECT substring(residues from %d for %d) as residues ".
+         "FROM {feature} ".
+         "WHERE feature_id = %d";
   $floc_sequences = array();
   foreach ($featurelocs as $featureloc) {
-    // get the residues for this feature
-    $previous_db = tripal_db_set_active('chado');  // use chado database
-    $feature = db_fetch_object(db_query($sql, $featureloc->srcfeature_id->feature_id));
-    tripal_db_set_active($previous_db);  // now use drupal database
-
+   
+    // build the src name so we can keep track of the different parts for each feature
     $src = $featureloc->srcfeature_id->feature_id ."-". $featureloc->srcfeature_id->type_id->cvterm_id;
 
     // orient the parts to the beginning of the feature sequence
     if (!empty($rel_locs[$src]['parts'])) {
       $parts = $rel_locs[$src]['parts'];
-      usort($parts, 'tripal_feature_sort_rel_parts');
-      foreach ($parts as $start => $attrs) {
-        $parts[$start]['start'] = $parts[$start]['start'] - $featureloc->fmin;
-        $parts[$start]['end']   = $parts[$start]['end'] - $featureloc->fmin;
+      usort($parts, 'tripal_feature_sort_rel_parts_by_start');
+      foreach ($parts as $start => $types) {
+        foreach ($types as $type_name => $type){ 
+          if ($featureloc->strand >= 0) {
+             // this is on the forward strand.  We need to convert the start on the src feature to the 
+             // start on this feature's sequence
+             $parts[$start][$type_name]['start'] = $parts[$start][$type_name]['start'] - $featureloc->fmin;
+             $parts[$start][$type_name]['end']   = $parts[$start][$type_name]['end'] - $featureloc->fmin;          
+          } 
+          else {
+             // this is on the reverse strand.  We need to swap the start and stop and calculate from the 
+             // begining of the reverse sequence
+             $size = ($featureloc->fmax - $featureloc->fmin);
+             $start_orig = $parts[$start][$type_name]['start'];
+             $end_orig = $parts[$start][$type_name]['end'];
+             $parts[$start][$type_name]['start'] = $size - ($end_orig - $featureloc->fmin);
+             $parts[$start][$type_name]['end']   = $size - ($start_orig - $featureloc->fmin);
+          }
+        }
+      }
+      // if we're on the reverse strand we need to reverse sort our parts
+      if ($featureloc->strand < 0) {
+         $new = array();
+         $keys = array_keys($parts); 
+         for ($x = count($keys) - 1; $x >= 0 ; $x--) {
+            $new[] = $parts[$x];
+         }
+         $parts = $new;
       }
+      
       $floc_sequences[$src]['src'] = $src;
       $floc_sequences[$src]['type'] = $featureloc->feature_id->type_id->name;
-      $sequence = drupal_substr($feature->residues, $featureloc->fmin-1, ($featureloc->fmax - $featureloc->fmin)+1);
-      $floc_sequences[$src]['formatted_seq'] =  tripal_feature_color_sequence(
-        $sequence, $parts);
+      $sequence = db_fetch_object(chado_query($sql, $featureloc->fmin + 1, ($featureloc->fmax - $featureloc->fmin), $featureloc->srcfeature_id->feature_id));
+      $residues = $sequence->residues;
+      if ($featureloc->strand < 0) {
+         $residues = tripal_feature_reverse_complement($residues);
+      }
+      $strand = '.';
+      if ($featureloc->strand == 1) {
+        $strand = '+';
+      }
+      elseif ($featureloc->strand == -1) {
+        $strand = '-';
+      }
+      $defline = $featureloc->feature_id->name . " " . $featureloc->srcfeature_id->name . ":" . ($featureloc->fmin + 1) . ".." . $featureloc->fmax . " " . $strand;
+      $floc_sequences[$src]['formatted_seq'] =  tripal_feature_color_sequence($residues, $parts, $defline);
     }
   }
   return $floc_sequences;
@@ -1687,8 +1723,28 @@ function tripal_feature_sort_rel_objects($a, $b) {
  *
  * @ingroup tripal_feature
  */
-function tripal_feature_sort_rel_parts($a, $b) {
-  return strnatcmp($a['start'], $b['start']);
+function tripal_feature_sort_rel_parts_by_start($a, $b) {
+  foreach ($a as $type_name => $details){
+     $astart = $a[$type_name]['start'];
+     break;
+  }
+  foreach ($b as $type_name => $details){
+     $bstart = $b[$type_name]['start'];
+     break;
+  }
+  return strnatcmp($astart, $bstart);
+}
+
+/**
+ *  used to sort the list of relationship parts by start position
+ *
+ * @ingroup tripal_feature
+ */
+function tripal_feature_sort_rel_parts_by_end($a, $b) {
+  if ($a['end'] == $b['end']) {
+     return strcmp($a['type'], $b['type']);
+  }
+  return strnatcmp($a['end'], $b['end']);
 }
 
 /**
@@ -1696,21 +1752,21 @@ function tripal_feature_sort_rel_parts($a, $b) {
  *
  * @ingroup tripal_feature
  */
-function tripal_feature_color_sequence($sequence, $parts) {
+function tripal_feature_color_sequence($sequence, $parts, $defline) {
 
   $types = array();
-
   // first get the list of types so we can create a color legend
-  foreach ($parts as $index => $child) {
-    $type = $child['type'];
-    if (!in_array($type, $types)) {
-      $types[] = $type;
+  foreach ($parts as $index => $types) {
+    foreach ($types as $type_name => $details){
+      //if (!in_array($type_name, $types)) {      
+        $types[$type_name] = 1;
+      //}
     }
   }
 
   $newseq .= "<div id=\"tripal_feature-featureloc_sequence-legend\">Legend: ";
-  foreach ($types as $type) {
-    $newseq .= "<span class=\"tripal_feature-featureloc_sequence-$type\">$type</span>";
+  foreach ($types as $type_name => $present) {
+    $newseq .= "<span class=\"tripal_feature-featureloc_sequence-$type_name\">$type_name</span>";
   }
   $newseq .= "</div>";
 
@@ -1718,33 +1774,56 @@ function tripal_feature_color_sequence($sequence, $parts) {
   // set the background color of the rows based on the type
   $pos = 0;
   $newseq .= "<pre id=\"tripal_feature-featureloc_sequence\">";
-  foreach ($parts as $index => $child) {
-    $type = $child['type'];
-    $start = $child['start'];
-    $end = $child['end']+1;
-
-    $class = "class=\"tripal_feature-featureloc_sequence-$type\"";
-
-    // iterate through the sequence up to the end of the child
-    for ($i = $pos; $i < $end; $i++) {
-
-      // if we're at the beginning of the child sequence then set the
-      // appropriate text color
-      if ($pos == $start) {
-        $newseq .= "<span $class>";
-        $func = 'uc';  // nucleotides within the child should be uppercase
-      }
-      $newseq .= $sequence{$pos};
-      $seqcount++;
-
-      if ($seqcount % 60 == 0) {
-        $newseq .= "\n";
-      }
-      $pos++;
-      if ($pos == $end) {
-        $newseq .= "</span>";
-        $func = 'lc';
-      }
+  $newseq .= ">$defline\n";
+  
+  // iterate through the parts. They should be in order.
+  foreach ($parts as $index => $types) {
+  
+    // get the start for this part.  All types in this part start at the 
+    // same position so we only need the first record
+    foreach ($types as $type => $child) {
+      $start = $child['start'];
+      break;
+    }
+  
+    // add in the sequence up to the start of this part
+    for ($i = $pos; $i < $start; $i++) {    
+       $newseq .= $sequence{$pos};
+       $seqcount++;
+       if ($seqcount % 50 == 0) {
+         $newseq .= "\n";
+       }
+       $pos++;
+    }
+
+    // we want to sort the parts by their end. We want the span tag to
+    // to be added in the order the parts end. 
+    usort($types,'tripal_feature_sort_rel_parts_by_end');
+    
+    // now add the child span for all types that start at this position  
+    foreach ($types as $type) {
+      $class = "class=\"tripal_feature-featureloc_sequence-" . $type['type'] ."\"";
+      $newseq .= "<span $class>";
+    }     
+
+    // now continue adding sequence until we reach the end of the parts
+    // if we reach the end then close the span
+    $parts_done = 0;    
+
+    while($parts_done < count($types)){
+       $newseq .= $sequence{$pos};
+       $seqcount++;
+       $pos++;
+       if ($seqcount % 50 == 0) {
+         $newseq .= "\n";
+       }
+       foreach ($types as $type => $child) {
+         $end = $child['end'];
+         if($pos == $end){
+           $newseq .= "</span>";
+           $parts_done++;
+         }
+       }
     }
   }