Browse Source

Merge pull request #1152 from tripal/1151-tv3-sequence_deflines

Missing sequence definition line
Lacey-Anne Sanderson 4 years ago
parent
commit
a453a4510b

+ 8 - 8
tripal_chado/api/modules/tripal_chado.feature.api.inc

@@ -785,21 +785,21 @@ function chado_get_fasta_defline($feature, $notes = '', $featureloc = NULL, $typ
 
   // Construct the definition line.
   $defline = $feature->uniquename . " " .
-    'ID=' . $feature->uniquename . "|" .
-    'Name=' . $feature->name . "|" .
-    'organism=' . $feature->organism_id->genus . " " . $feature->organism_id->species . "|" .
-    'type=' . $type . '|';
+    'ID=' . $feature->uniquename . "; " .
+    'Name=' . $feature->name . "; " .
+    'organism=' . $feature->organism_id->genus . " " . $feature->organism_id->species . "; " .
+    'type=' . $type . '; ';
   if ($length > 0) {
-    $defline .= "length=" . $length . "bp|";
+    $defline .= "length=" . $length . "bp; ";
   }
   if ($featureloc) {
-    $defline .= "location=Sequence derived from alignment at " . chado_get_location_string($featureloc);
+    $defline .= "location=Sequence derived from: " . chado_get_location_string($featureloc);
     $defline .= " (" . $featureloc->srcfeature_id->organism_id->genus . " " . $featureloc->srcfeature_id->organism_id->species . ")|";
   }
   if ($notes) {
-    $defline .= "Notes=$notes|";
+    $defline .= "Notes=$notes; ";
   }
-  $defline = substr($defline, 0, -1); // remove the trailing |
+  $defline = substr($defline, 0, -2); // remove the trailing "; "
   return $defline;
 }
 

+ 39 - 42
tripal_chado/includes/TripalFields/data__sequence_record/data__sequence_record.inc

@@ -75,6 +75,9 @@ class data__sequence_record extends ChadoField {
     $field_name = $this->field['field_name'];
     $feature = $entity->chado_record;
 
+    // Initialize the field items array.
+    $entity->{$field_name}['und'] = [];
+
     // Add the primary sequence from the Chado feature table, residues column.
     $feature = chado_expand_var($feature, 'field', 'feature.residues');
 
@@ -84,8 +87,8 @@ class data__sequence_record extends ChadoField {
     // If this is an mRNA feature then add the gene parent, full length
     // mRNA, CDS and protein.
     if ($feature->type_id->name == 'mRNA') {
-      $this->addGeneParent($entity, $feature, $field_name);
       $featurelocs = $this->addFLmRNA($entity, $feature, $field_name);
+      $this->addGeneParent($entity, $feature, $field_name);
       if (count($featurelocs) > 0) {
         $this->addCDS($entity, $feature, $field_name, $featurelocs);
         $this->addProtein($entity, $feature, $field_name);
@@ -111,6 +114,7 @@ class data__sequence_record extends ChadoField {
     $seq_coords_term = 'data:2012';
     $seq_length_term = chado_get_semweb_term('feature', 'seqlen');
     $seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
+    $fasta_defline = 'local:fasta_definition';
 
     $options = [
       'derive_from_parent' => 1,
@@ -123,12 +127,13 @@ class data__sequence_record extends ChadoField {
 
       $entity->{$field_name}['und'][]['value'] = [
         $sequence_term => $seq['residues'],
-        $label_term =>  ucfirst(preg_replace('/_/', ' ', $feature->type_id->name)) . ' Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
-        $description_term => 'This sequence was extracted from the reference sequence location at ' . $coords['schema:description'] . '.',
+        $label_term =>  'Derived ' . ucfirst(preg_replace('/_/', ' ', $feature->type_id->name)) . ' Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
+        $description_term => 'This sequence was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . '.',
         $seq_coords_term => $coords,
         $seq_length_term => strlen($seq['residues']),
         $seq_md5sum_term => md5($seq['residues']),
-        $type_term => $feature->type_id->name
+        $type_term => $feature->type_id->name,
+        $fasta_defline => $seq['defline'],
       ];
 
       $featurelocs[] = $featureloc;
@@ -137,10 +142,7 @@ class data__sequence_record extends ChadoField {
   }
 
   /**
-   *
-   * @param unknown $entity
-   * @param unknown $feature
-   * @param unknown $field_name
+   * Adds the primary sequence from the feature.residues column.
    */
   private function addPrimary(&$entity, $feature, $field_name) {
 
@@ -150,25 +152,24 @@ class data__sequence_record extends ChadoField {
     $sequence_term = chado_get_semweb_term('feature', 'residues');
     $seq_length_term = chado_get_semweb_term('feature', 'seqlen');
     $seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
+    $fasta_defline = 'local:fasta_definition';
 
     if ($feature->residues) {
       $entity->{$field_name}['und'][]['value'] = [
-        $label_term => 'Primary ' . preg_replace('/_/', ' ', $feature->type_id->name) . 'Sequence (' . number_format($feature->seqlen) . 'bp)',
+        $label_term => 'Primary ' . preg_replace('/_/', ' ', $feature->type_id->name) . ' Sequence (' . number_format($feature->seqlen) . 'bp)',
         $description_term => 'This is the primary representative sequence for this feature.',
         $sequence_term => $feature->residues,
         $seq_length_term => $feature->seqlen,
         $seq_md5sum_term => $feature->md5checksum,
-        $type_term => $feature->type_id->name
+        $type_term => $feature->type_id->name,
+        $fasta_defline => chado_get_fasta_defline($feature)
       ];
     }
   }
 
 
   /**
-   *
-   * @param unknown $entity
-   * @param unknown $feature
-   * @param unknown $field_name
+   * Adds the full length mRNA sequence (only for mRNA features).
    */
   private function addFLmRNA(&$entity, $feature, $field_name) {
     $label_term = 'rdfs:label';
@@ -178,6 +179,7 @@ class data__sequence_record extends ChadoField {
     $seq_coords_term = 'data:2012';
     $seq_length_term = chado_get_semweb_term('feature', 'seqlen');
     $seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
+    $fasta_defline = 'local:fasta_definition';
 
     // Sometimes an mRNA may have only exons, only CDS or both exons and
     // CDS.  We need to know which.
@@ -212,12 +214,13 @@ class data__sequence_record extends ChadoField {
 
       $entity->{$field_name}['und'][]['value'] = [
         $sequence_term => $seq['residues'],
-        $label_term => 'Full Length mRNA Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
-        $description_term => 'This full length mRNA sequence was extracted from the reference sequence location at ' . $coords['schema:description'] . ' and contains: ' . implode(', ', $types) ,
+        $label_term => 'Derived mRNA Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
+        $description_term => 'This full length mRNA sequence was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . ' and contains: ' . implode(', ', $types) ,
         $seq_coords_term => $coords,
         $seq_length_term => strlen($seq['residues']),
         $seq_md5sum_term => md5($seq['residues']),
-        $type_term => 'mRNA'
+        $type_term => 'mRNA',
+        $fasta_defline => $seq['defline']
       ];
 
       $featurelocs[] = $featureloc;
@@ -226,9 +229,7 @@ class data__sequence_record extends ChadoField {
   }
 
   /**
-   *
-   * @param unknown $featureloc_id
-   * @return unknown
+   * Retrieves the feature location information.
    */
   private function getFeatureLoc($featureloc_id) {
     $featurelocs_sql = "
@@ -241,9 +242,7 @@ class data__sequence_record extends ChadoField {
   }
 
   /**
-   *
-   * @param unknown $featureloc
-   * @return string[]|number[]|NULL[]|unknown[]
+   * Gets the sequence location string for a featureloc record.
    */
   private function getSequenceCoords($featureloc) {
     $description_term = 'schema:description';
@@ -277,10 +276,7 @@ class data__sequence_record extends ChadoField {
   }
 
   /**
-   *
-   * @param unknown $entity
-   * @param unknown $feature
-   * @param unknown $field_name
+   * Adds the CDS sequence (only for an mRNA feature)
    */
   private function addCDS(&$entity, $feature, $field_name, $featurelocs) {
     $label_term = 'rdfs:label';
@@ -289,6 +285,7 @@ class data__sequence_record extends ChadoField {
     $sequence_term = chado_get_semweb_term('feature', 'residues');
     $seq_length_term = chado_get_semweb_term('feature', 'seqlen');
     $seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
+    $fasta_defline = 'local:fasta_definition';
 
     foreach ($featurelocs as $featureloc) {
       $cds_feature = [
@@ -311,20 +308,18 @@ class data__sequence_record extends ChadoField {
         $entity->{$field_name}['und'][]['value'] = [
           $label_term => 'Coding Sequence (' . number_format($cds_sequence[0]['length']) . 'bp)',
           $sequence_term => $cds_sequence[0]['residues'],
-          $description_term => 'This CDS was extracted from the reference sequence location at ' . $coords['schema:description'] . '.' ,
+          $description_term => 'This CDS was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . '.' ,
           $seq_length_term => $cds_sequence[0]['length'],
           $seq_md5sum_term => md5($cds_sequence[0]['residues']),
-          $type_term => 'CDS'
+          $type_term => 'CDS',
+          $fasta_defline => $cds_sequence[0]['defline']
         ];
       }
     }
   }
 
   /**
-   *
-   * @param unknown $entity
-   * @param unknown $feature
-   * @param unknown $field_name
+   * Adds the sequence for a gene parent (only for gene children).
    */
   private function addGeneParent(&$entity, $feature, $field_name) {
     $label_term = 'rdfs:label';
@@ -334,6 +329,7 @@ class data__sequence_record extends ChadoField {
     $sequence_term = chado_get_semweb_term('feature', 'residues');
     $seq_length_term = chado_get_semweb_term('feature', 'seqlen');
     $seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
+    $fasta_defline = 'local:fasta_definition';
 
     $sql = "
       SELECT FO.*
@@ -355,7 +351,8 @@ class data__sequence_record extends ChadoField {
           $label_term => 'The gene sequence.',
           $seq_length_term => strlen($gene->residues),
           $seq_md5sum_term => md5($gene->residues),
-          $type_term => 'polypeptide'
+          $type_term => 'gene',
+          $fasta_defline => chado_get_fasta_defline($gene),
         ];
       }
       else {
@@ -364,13 +361,14 @@ class data__sequence_record extends ChadoField {
           $featureloc = $this->getFeatureLoc($seq['featureloc_id']);
           $coords = $this->getSequenceCoords($featureloc);
           $entity->{$field_name}['und'][]['value'] = [
-            $label_term => 'Gene Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
+            $label_term => 'Derived Gene Sequence (' . number_format(strlen($seq['residues'])) . 'bp)',
             $sequence_term => $seq['residues'],
-            $description_term => 'This gene sequence was extracted from the reference sequence location at ' . $coords['schema:description'] . '.' ,
+            $description_term => 'This gene sequence was derived by extracting bases from the reference sequence location at ' . $coords['schema:description'] . '.' ,
             $seq_coords_term => $coords,
             $seq_length_term => strlen($seq['residues']),
             $seq_md5sum_term => md5($seq['residues']),
-            $type_term => 'gene'
+            $type_term => 'gene',
+            $fasta_defline => $seq['defline'],
           ];
         }
       }
@@ -378,10 +376,7 @@ class data__sequence_record extends ChadoField {
   }
 
   /**
-   *
-   * @param unknown $entity
-   * @param unknown $feature
-   * @param unknown $field_name
+   * Adds the protein sequence (only for mRNA features).
    */
   private function addProtein(&$entity, $feature, $field_name) {
     $label_term = 'rdfs:label';
@@ -390,6 +385,7 @@ class data__sequence_record extends ChadoField {
     $sequence_term = chado_get_semweb_term('feature', 'residues');
     $seq_length_term = chado_get_semweb_term('feature', 'seqlen');
     $seq_md5sum_term = chado_get_semweb_term('feature', 'md5checksum');
+    $fasta_defline = 'local:fasta_definition';
 
     $sql = "
       SELECT F.*
@@ -412,7 +408,8 @@ class data__sequence_record extends ChadoField {
           $description_term => 'The protein sequence.',
           $seq_length_term => strlen($protein->residues),
           $seq_md5sum_term => md5($protein->residues),
-          $type_term => 'polypeptide'
+          $type_term => 'polypeptide',
+          $fasta_defline => chado_get_fasta_defline($protein),
         ];
       }
     }

+ 2 - 1
tripal_chado/includes/TripalFields/data__sequence_record/data__sequence_record_formatter.inc

@@ -26,6 +26,7 @@ class data__sequence_record_formatter extends ChadoFieldFormatter {
     $fmax_term = chado_get_semweb_term('featureloc', 'fmax');
     $strand_term = chado_get_semweb_term('featureloc', 'strand');
     $phase_term = chado_get_semweb_term('featureloc', 'phase');
+    $fasta_defline = 'local:fasta_definition';
 
     $content = [];
 
@@ -36,6 +37,7 @@ class data__sequence_record_formatter extends ChadoFieldFormatter {
 
       $num_bases = 50;
       $residues = '<pre class="residues-formatter">';
+      $residues .= ">" . $item['value'][$fasta_defline] . "<br>";
       $residues .= wordwrap($item['value'][$sequence_term], $num_bases, "<br>", TRUE);
       $residues .= '</pre>';
 
@@ -49,7 +51,6 @@ class data__sequence_record_formatter extends ChadoFieldFormatter {
 
     }
 
-
     if (empty($content)) {
       $element[0] = [
         '#type' => 'markup',

+ 11 - 0
tripal_chado/includes/tripal_chado.semweb.inc

@@ -1457,6 +1457,17 @@ function tripal_chado_populate_vocab_LOCAL() {
     'db_name' => 'local',
   ]);
 
+  //--------
+  // Feature
+  //--------
+  $term = chado_insert_cvterm([
+    'name' => 'fasta_definition',
+    'definition' => 'The definition line for a FASTA formatted sequence',
+    'cv_name' => 'local',
+    'is_relationship' => 0,
+    'db_name' => 'local',
+  ]);
+
   //--------------
   // Feature Map
   //--------------

+ 21 - 2
tripal_chado/tripal_chado.install

@@ -2081,7 +2081,7 @@ function tripal_chado_update_7338() {
  */
 function tripal_chado_update_7339() {
   try {
-    $term = chado_insert_cvterm([
+    chado_insert_cvterm([
       'id' => 'data:0849',
       'name' => 'Sequence record',
       'cv_name' => 'EDAM',
@@ -2092,4 +2092,23 @@ function tripal_chado_update_7339() {
     $error = $e->getMessage();
     throw new DrupalUpdateException('Could not perform update: '. $error);
   }
-}
+}
+
+
+/**
+ * Adds the "FASTA definition" cvterm for the data__sequence_record field.
+ *
+ * Inserts the 'fasta_definition' term into the 'local' vocabulary (both the
+ * cv and the db are named 'local') by calling chado_insert_cvterm().
+ *
+ * @throws DrupalUpdateException
+ *   Thrown with the underlying message when the insert raises a PDOException.
+ */
+function tripal_chado_update_7340() {
+  try {
+    chado_insert_cvterm([
+      'name' => 'fasta_definition',
+      'definition' => 'The definition line for a FASTA formatted sequence',
+      'cv_name' => 'local',
+      'is_relationship' => 0,
+      'db_name' => 'local',
+    ]);
+  } catch (\PDOException $e) {
+    $error = $e->getMessage();
+    throw new DrupalUpdateException('Could not perform update: '. $error);
+  }
+}