Browse Source

Fixes to GFF3 loader

Stephen Ficklin 4 years ago
parent
commit
34f161ac8e
1 changed files with 130 additions and 76 deletions
  1. 130 76
      tripal_chado/includes/TripalImporter/GFF3Importer.inc

+ 130 - 76
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -903,7 +903,7 @@ class GFF3Importer extends TripalImporter {
   *  An associative array containing the 9 elements of the GFF3 file. The
    *  9th element is an associative array of the attributes.
    */
-  private function parseFeature($line) {
+  private function parseGFF3Line($line) {
     $date = getdate();
 
     // get the columns
@@ -937,12 +937,12 @@ class GFF3Importer extends TripalImporter {
     $ret['start'] = $fmin;
     $ret['stop'] = $fmax;
 
-    // Landmark (seqid) validation checks based on GFF3 specifications 
+    // Landmark (seqid) validation checks based on GFF3 specifications
     preg_match('/[a-zA-Z0-9\.:\^\*\$@!\+_\?-\|]*/', $ret['landmark'], $matches);
     if ($matches[0] != $ret['landmark']) {
-      throw new Exception(t("Landmark/seqid !landmark contains invalid 
-        characters. Only characters included in this regular expression is 
-        allowed [a-zA-Z0-9.:^*$@!+_?-|]", 
+      throw new Exception(t("Landmark/seqid !landmark contains invalid
+        characters. Only characters included in this regular expression is
+        allowed [a-zA-Z0-9.:^*$@!+_?-|]",
         ['!landmark' => $ret['landmark']]));
     }
 
@@ -1007,7 +1007,7 @@ class GFF3Importer extends TripalImporter {
 
       // Break apart each attribute into key/value pairs.
       $tag = preg_split("/=/", $attr, 2);
-      
+
       // Multiple values of an attribute are separated by commas
       $tag_name = $tag[0];
       if (!array_key_exists($tag_name, $tags)) {
@@ -1103,11 +1103,43 @@ class GFF3Importer extends TripalImporter {
     // A feature may get ignored. But let's default this to FALSE.
     $ret['skipped'] = FALSE;
 
+    // A line may have more than one feature (e.g. match, EST_match, etc).
+    // This flag, when TRUE, tells the parseGFF3 function to repeat this line.
+    $ret['repeat'] = FALSE;
+
     // If neither name nor uniquename are provided then generate one.
     $names = $this->getFeatureNames($tags, $ret['type'], $ret['landmark'], $ret['start'], $ret['stop']);
     $attr_uniquename = $names['uniquename'];
     $attr_name = $names['name'];
 
+    // If this is a match feature (match, EST_match, cDNA_match, etc), then
+    // we need to handle this line specially.
+    if (preg_match('/match$/i', $ret['type'])) {
+
+      // If the feature already exists that means we need to add a match_part
+      // feature.  If not, then we will add a flag to the results to tell
+      // the parseGFF3 function to repeat this line, as it has two features:
+      // the match and the match_part.  All other match feature with the same
+      // ID in the GFF3 will just be match_part features.
+      $parent_check = preg_replace('/_part_\d+/', '', $attr_uniquename);
+      if (array_key_exists($parent_check, $this->features)) {
+         // Set the match_part parent
+         // remove the "_part_X" suffix added by the getFeatureNames to find
+         // the parent.
+         $attr_parent = $parent_check;
+         $ret['type'] = 'match_part';
+      }
+      else {
+        // Unset all attributes as these belong on the match_part
+        $attr_dbxref = [];
+        $attr_aliases = [];
+        $attr_terms = [];
+        $attr_derives = [];
+        $attr_others = [];
+        $ret['repeat'] = TRUE;
+      }
+    }
+
     $ret['name'] = $attr_name;
     $ret['uniquename'] = $attr_uniquename;
     $ret['synonyms'] = $attr_aliases;
@@ -1156,7 +1188,9 @@ class GFF3Importer extends TripalImporter {
       $ret['attrs'][$key] = $value;
     }
 
-    // Add the organism  entry.
+    // Add the organism entry, but if we don't have one for this feature
+    // (in the case where the target_organism attribute doesn't match
+    // an organism in the database) then skip this feature.
     $ret['organism'] = $attr_organism;
     if (!$ret['organism']) {
       $ret['skipped'] = TRUE;
@@ -1177,7 +1211,6 @@ class GFF3Importer extends TripalImporter {
           ]));
     }
 
-
     // Add the properties and parent.
     $ret['properties'] = $attr_others;
     $ret['parent'] = $attr_parent;
@@ -1395,69 +1428,14 @@ class GFF3Importer extends TripalImporter {
       }
 
       // Parse this feature from this line of the GFF3 file.
-      $gff_feature = $this->parseFeature($line);
-
-      // Add the landmark if it doesn't exist in the landmark list.
-      if (!array_key_exists($gff_feature['landmark'], $this->landmarks)) {
-        $this->landmarks[$gff_feature['landmark']] = FALSE;
-      }
-
-      // Organize DBs and DBXrefs for faster access later on.
-      foreach ($gff_feature['dbxrefs'] as $index => $info) {
-        if (!array_key_exists($info['db'], $this->db_lookup)) {
-          $this->db_lookup[$info['db']] = FALSE;
-        }
-        if (!array_key_exists($index, $this->dbxref_lookup)) {
-          $this->dbxref_lookup[$index] = $info;
-        }
-      }
-
-      // We want to make sure the Ontology_term attribute dbxrefs are
-      // also easily looked up... but we do not want to create them
-      // if they do not exist the precense of the 'cvterm' key will
-      // tell the loadDbxrefs() function to not create the term.
-      foreach ($gff_feature['terms'] as $index => $info) {
-        if (!array_key_exists($info['db'], $this->db_lookup)) {
-          $this->db_lookup[$info['db']] = FALSE;
-        }
-
-        if (!array_key_exists($index, $this->dbxref_lookup)) {
-          $this->dbxref_lookup[$index] = $info;
-          $this->dbxref_lookup[$index]['cvterm_id'] = NULL;
-        }
-      }
-
-      // Organize the CVterms for faster access later on.
-      if (!array_key_exists($gff_feature['type'], $feature_cvterms)) {
-        $feature_cvterms[$gff_feature['type']] = 0;
-      }
-      $feature_cvterms[$gff_feature['type']]++;
-
-      // Add any target feature types to the list as well.
-      if (array_key_exists('name', $gff_feature['target'])) {
-        if (!array_key_exists($gff_feature['target']['type'], $feature_cvterms)) {
-          $feature_cvterms[$gff_feature['target']['type']] = 0;
-        }
-        $feature_cvterms[$gff_feature['target']['type']]++;
-      }
-
-      // Organize the feature property types for faster access later on.
-      foreach ($gff_feature['properties'] as $prop_name => $value) {
-        if (!array_key_exists($prop_name, $featureprop_cvterms)) {
-          $featureprop_cvterms[$prop_name] = NULL;
-        }
-        $featureprop_cvterms[$prop_name]++;
-      }
-
-      // Cache the GFF feature details for later lookup.
-      if (strcmp($gff_feature['uniquename'], $gff_feature['landmark']) != 0) {
-        $this->cacheFeature($gff_feature);
-      }
+      $gff_feature = $this->parseGFF3Line($line);
+      $this->prepareFeature($gff_feature, $feature_cvterms, $featureprop_cvterms);
 
-      // If this feature has a target then we need to add the target as
-      // new feature for insertion.
-      if (array_key_exists('name', $gff_feature['target'])) {
-        $this->addTargetFeature($gff_feature);
+      // If there is a second feature (in the case of a match) then
+      // repeat this line (to get the match_part).
+      if ($gff_feature['repeat'] === TRUE) {
+        $gff_feature = $this->parseGFF3Line($line);
+        $this->prepareFeature($gff_feature, $feature_cvterms, $featureprop_cvterms);
       }
     }
 
@@ -1477,7 +1455,75 @@ class GFF3Importer extends TripalImporter {
     foreach (array_keys($featureprop_cvterms) as $name) {
       $this->getTypeID($name, TRUE);
     }
+  }
+
+  /**
+   * Registers a parsed GFF3 feature in the importer's lookup structures.
+   *
+   * Records the feature's landmark, DBs/dbxrefs and ontology terms in the
+   * importer's lookup arrays, tallies its type and property names in the
+   * caller-supplied counters, caches the feature and, when a target is
+   * present, adds the target feature as well.
+   *
+   * @param array $gff_feature
+   *   A parsed feature array as returned by parseGFF3Line(). This function
+   *   reads the 'landmark', 'dbxrefs', 'terms', 'type', 'target',
+   *   'properties' and 'uniquename' keys.
+   * @param array $feature_cvterms
+   *   Passed by reference. Maps a feature type name to the number of times
+   *   it has been seen; updated in place.
+   * @param array $featureprop_cvterms
+   *   Passed by reference. Maps a property name to the number of times it
+   *   has been seen; updated in place.
+   */
+  private function prepareFeature($gff_feature, &$feature_cvterms, &$featureprop_cvterms) {
+
+    // Add the landmark if it doesn't exist in the landmark list.
+    // FALSE marks the landmark as seen but not yet resolved here.
+    if (!array_key_exists($gff_feature['landmark'], $this->landmarks)) {
+      $this->landmarks[$gff_feature['landmark']] = FALSE;
+    }
 
+    // Organize DBs and DBXrefs for faster access later on.
+    foreach ($gff_feature['dbxrefs'] as $index => $info) {
+      if (!array_key_exists($info['db'], $this->db_lookup)) {
+        $this->db_lookup[$info['db']] = FALSE;
+      }
+      if (!array_key_exists($index, $this->dbxref_lookup)) {
+        $this->dbxref_lookup[$index] = $info;
+      }
+    }
+
+    // We want to make sure the Ontology_term attribute dbxrefs are
+    // also easily looked up... but we do not want to create them
+    // if they do not exist. The presence of the 'cvterm' key will
+    // tell the loadDbxrefs() function to not create the term.
+    // NOTE(review): the key actually set below is 'cvterm_id', not 'cvterm';
+    // confirm which key loadDbxrefs() checks.
+    foreach ($gff_feature['terms'] as $index => $info) {
+      if (!array_key_exists($info['db'], $this->db_lookup)) {
+        $this->db_lookup[$info['db']] = FALSE;
+      }
+
+      if (!array_key_exists($index, $this->dbxref_lookup)) {
+        $this->dbxref_lookup[$index] = $info;
+        $this->dbxref_lookup[$index]['cvterm_id'] = NULL;
+      }
+    }
+
+    // Organize the CVterms for faster access later on.
+    // The value is a running count of features seen with this type.
+    if (!array_key_exists($gff_feature['type'], $feature_cvterms)) {
+      $feature_cvterms[$gff_feature['type']] = 0;
+    }
+    $feature_cvterms[$gff_feature['type']]++;
+
+    // Add any target feature types to the list as well.
+    // A target is considered present only when it has a 'name' key.
+    if (array_key_exists('name', $gff_feature['target'])) {
+      if (!array_key_exists($gff_feature['target']['type'], $feature_cvterms)) {
+        $feature_cvterms[$gff_feature['target']['type']] = 0;
+      }
+      $feature_cvterms[$gff_feature['target']['type']]++;
+    }
+
+    // Organize the feature property types for faster access later on.
+    // Unlike the type counter above, new entries start as NULL; in PHP
+    // incrementing NULL yields 1, so the first occurrence still counts as 1.
+    foreach ($gff_feature['properties'] as $prop_name => $value) {
+      if (!array_key_exists($prop_name, $featureprop_cvterms)) {
+        $featureprop_cvterms[$prop_name] = NULL;
+      }
+      $featureprop_cvterms[$prop_name]++;
+    }
+
+    // Cache the GFF feature details for later lookup.
+    // Features whose uniquename equals their landmark name are not cached.
+    if (strcmp($gff_feature['uniquename'], $gff_feature['landmark']) != 0) {
+      $this->cacheFeature($gff_feature);
+    }
+
+    // If this feature has a target then we need to add the target as
+    // new feature for insertion.
+    if (array_key_exists('name', $gff_feature['target'])) {
+      $this->addTargetFeature($gff_feature);
+    }
   }
 
   /**
@@ -2796,12 +2842,20 @@ class GFF3Importer extends TripalImporter {
         }
         $uniquename = $uniquename . "_" . $i;
       }
-      // A name can be duplicated if there is a target match alignment and
-      // the feature appears first in the GFF as a target before it appears
-      // on it's own independent line of the gff file.
-      elseif ($prev_feature['is_target'] == TRUE) {
-        // Do nothing, the previous feature is a target so we'll overwrite
-        // it with this record.
+      // If this is a match feature (e.g. match, EST_match, cDNA_match, etc).
+      // then we can accept a duplicated ID in the GFF3 file.  But we
+      // must rename it before going into Chado.  For this, we will allow
+      // the match feature to keep the original ID and we will create a new
+      // name for the match_part.
+      elseif (preg_match('/match$/', $type)) {
+        $i = 1;
+        $temp_uname = $uniquename;
+        do {
+          $temp_uname = $uniquename . "_part_" . $i;
+          $i++;
+        }
+        while (array_key_exists($temp_uname, $this->features));
+        $uniquename = $temp_uname;
       }
       else {
         throw new Exception(t("A feature with the same ID exists multiple times: !uname", ['!uname' => $uniquename]));