Browse Source

Now supporting Ontology_term

Stephen Ficklin 4 years ago
parent
commit
2c9f613b93
1 changed files with 116 additions and 19 deletions
  1. 116 19
      tripal_chado/includes/TripalImporter/GFF3Importer.inc

+ 116 - 19
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -197,6 +197,11 @@ class GFF3Importer extends TripalImporter {
    */
   private $dbxref_lookup = [];
 
+  /**
+   * Holds a mapping of Dbxref names to cvterm ids.
+   */
+  private $cvterm_lookup = [];
+
   /**
    * Holds a mapping of synonyms to ids.
    */
@@ -667,6 +672,10 @@ class GFF3Importer extends TripalImporter {
     $this->logMessage("Step 8c: Loading feature cross references...              ");
     $this->loadFeatureDbxrefs();
 
+
+    $this->logMessage("Step 9: Loading feature ontology terms...                 ");
+    $this->loadFeatureCVterms();
+
     /*
       strcmp($tag_name, 'Parent') != 0 and
       strcmp($tag_name, 'Target') != 0 and
@@ -959,6 +968,7 @@ class GFF3Importer extends TripalImporter {
     $attr_aliases = [];
     $attr_dbxref = [];
     $attr_derives = [];
+    $attr_terms = [];
     foreach ($attrs as $attr) {
       $attr = rtrim($attr);
       $attr = ltrim($attr);
@@ -1000,6 +1010,9 @@ class GFF3Importer extends TripalImporter {
       elseif (strcmp($tag_name, 'Derives_from') == 0) {
         $attr_derives = array_merge($attr_derives, $tags[$tag_name]);
       }
+      elseif (strcmp($tag_name, 'Ontology_term') == 0) {
+        $attr_terms = array_merge($attr_terms, $tags[$tag_name]);
+      }
       // Get the list of non-reserved attributes these will get added
       // as properties to the featureprop table.
       elseif (strcmp($tag_name, 'Name') !=0 and strcmp($tag_name, 'ID') !=0 and
@@ -1031,7 +1044,7 @@ class GFF3Importer extends TripalImporter {
     foreach ($attr_dbxref as $key => $dbx) {
       $parts = explode(':', $dbx);
       if (count($parts) != 2) {
-        throw new Exception(t('Dbxrefs must be of the format: "Dbxref=<db name>:<accession>". The Dbxref %dbx on line %line_num does not satisfy this format.',
+        throw new Exception(t('Dbxrefs must be of the format: "<db name>:<accession>". The Dbxref %dbx on line %line_num does not satisfy this format.',
             ['%line_num' => $this->current_line, '%dbx' => $dbx]));
       }
       $ret['dbxrefs']["{$parts[0]}:{$parts[1]}"] = array(
@@ -1039,11 +1052,27 @@ class GFF3Importer extends TripalImporter {
         'accession' => $parts[1],
       );
     }
+
+    // Add in the GFF source dbxref. This is needed for GBrowse.
     $ret['dbxrefs']["GFF_source:{$ret['source']}"] = array(
       'db' => 'GFF_source',
       'accession' => $ret['source'],
     );
 
+    // Add in the ontology terms
+    $ret['terms'] = [];
+    foreach ($attr_terms as $key => $dbx) {
+      $parts = explode(':', $dbx);
+      if (count($parts) != 2) {
+        throw new Exception(t('Ontology_terms must be of the format: "<db name>:<accession>". The term %dbx on line %line_num does not satisfy this format.',
+            ['%line_num' => $this->current_line, '%dbx' => $dbx]));
+      }
+      $ret['terms']["{$parts[0]}:{$parts[1]}"] = array(
+        'db' => $parts[0],
+        'accession' => $parts[1],
+      );
+    }
+
     $ret['derives_from'] = $attr_derives;
     if (count($ret['derives_from']) > 1) {
       throw new Exception(t('Each feature can only have one "Derives_from" attribute. The feature %uniquename has more than one: %derives',
@@ -1307,17 +1336,31 @@ class GFF3Importer extends TripalImporter {
         $this->relationships['Child'][$gff_feature['uniquename']] = $gff_feature['Parent'];
       }
 
-      // Organize DBs for faster acces later on.
+      // Organize DBs and DBXrefs for faster access later on.
       foreach ($gff_feature['dbxrefs'] as $index => $info) {
         if (!array_key_exists($info['db'], $this->db_lookup)) {
           $this->db_lookup[$info['db']] = FALSE;
         }
+        if (!array_key_exists($index, $this->dbxref_lookup)) {
+          $this->dbxref_lookup[$index] = $info;
+        }
+      }
+      // We want to make sure the Ontology_term attribute dbxrefs are
+      // also easily looked up, but we do not want to create them
+      // if they do not exist. The presence of the 'cvterm_id' key will
+      // tell the loadDbxrefs() function to not create the term.
+      foreach ($gff_feature['terms'] as $index => $info) {
+        if (!array_key_exists($info['db'], $this->db_lookup)) {
+          $this->db_lookup[$info['db']] = FALSE;
+        }
 
         if (!array_key_exists($index, $this->dbxref_lookup)) {
           $this->dbxref_lookup[$index] = $info;
+          $this->dbxref_lookup[$index]['cvterm_id'] = NULL;
         }
       }
 
+
       // Organize the CVterms for faster access later on.
       if (!array_key_exists($gff_feature['type'], $feature_cvterms)) {
         $feature_cvterms[$gff_feature['type']] = 0;
@@ -1610,8 +1653,8 @@ class GFF3Importer extends TripalImporter {
    */
   private function findDbxrefs() {
     $batch_size = 1000;
-    $num_features = count(array_keys($this->dbxref_lookup));
-    $num_batches = (int) ($num_features / $batch_size) + 1;
+    $num_dbxrefs = count(array_keys($this->dbxref_lookup));
+    $num_batches = (int) ($num_dbxrefs / $batch_size) + 1;
 
     $this->setItemsHandled(0);
     $this->setTotalItems($num_batches);
@@ -1619,29 +1662,36 @@ class GFF3Importer extends TripalImporter {
     // DBXrefs may be already present so we'll do an initial round of
     // looking for them and then insert those that don't exist.
     $init_sql = "
-      SELECT DB.name, DBX.db_id, DBX.accession, DBX.dbxref_id
+      SELECT DB.name, DBX.db_id, DBX.accession, DBX.dbxref_id, CVT.cvterm_id
       FROM {dbxref} DBX
         INNER JOIN {db} DB on DB.db_id = DBX.db_id
+        LEFT JOIN {cvterm} CVT on DBX.dbxref_id = CVT.dbxref_id
       WHERE
     ";
     $i = 0;
+    $total = 0;
     $batch_num = 1;
     $sql = '';
     $args = [];
     foreach ($this->dbxref_lookup as $xref => $info) {
       $i++;
+      $total++;
       $sql .= "(DBX.accession = :accession_$i and DBX.db_id = :db_id_$i) OR\n";
       $args[":accession_$i"] = $info['accession'];
       $args[":db_id_$i"] = $this->db_lookup[$info['db']];
 
       // If we've reached the size of the batch then let's do the select.
-      if ($i == $batch_size) {
+      if ($i == $batch_size or $total == $num_dbxrefs) {
         $sql = rtrim($sql, " OR\n");
         $sql = $init_sql . $sql;
         $results = chado_query($sql, $args);
         while ($dbxref = $results->fetchObject()) {
           $index = $dbxref->name . ':' . $dbxref->accession;
           $this->dbxref_lookup[$index]['dbxref_id'] = $dbxref->dbxref_id;
+          if ($dbxref->cvterm_id) {
+            $this->cvterm_lookup[$index] = $dbxref->cvterm_id;
+            $this->dbxref_lookup[$index]['cvterm_id'] = $dbxref->cvterm_id;
+          }
         }
         $this->setItemsHandled($batch_num);
         $batch_num++;
@@ -1653,17 +1703,6 @@ class GFF3Importer extends TripalImporter {
         $args = [];
       }
     }
-    // Select any remaining batch items
-    if ($i > 0) {
-      $sql = rtrim($sql, " OR\n");
-      $sql = $init_sql . $sql;
-      $results = chado_query($sql, $args);
-      while ($dbxref = $results->fetchObject()) {
-        $index = $dbxref->name . ':' . $dbxref->accession;
-        $this->dbxref_lookup[$index]['dbxref_id'] = $dbxref->dbxref_id;
-      }
-      $this->setItemsHandled($batch_num);
-    }
   }
 
   /**
@@ -1688,7 +1727,9 @@ class GFF3Importer extends TripalImporter {
       $total++;
 
       // Only do an insert if this dbxref doesn't already exist in the database
-      if (!array_key_exists('dbxref_id', $info)) {
+      // and this dbxref is from a Dbxref attribute not an Ontology_term attr.
+      if (!array_key_exists('dbxref_id', $info) and
+          !array_key_exists('cvterm_id', $info)) {
         $sql .= "(:db_id_$i, :accession_$i),\n";
         $args[":db_id_$i"] = $this->db_lookup[$info['db']];
         $args[":accession_$i"] = $info['accession'];
@@ -1771,6 +1812,63 @@ class GFF3Importer extends TripalImporter {
     }
   }
 
+  /**
+   * Links features to the cvterms named by their Ontology_term attributes.
+   *
+   * Walks $this->features and, for every feature that does not already
+   * exist in the database, queues one {feature_cvterm} row per entry in the
+   * feature's 'terms' array, using the null publication as the pub_id.
+   * Queued rows are written with a multi-row INSERT each time a batch of
+   * features has been examined and once more after the last feature.
+   *
+   * Terms that have no entry in $this->cvterm_lookup are skipped with a
+   * logged message instead of being inserted with an empty cvterm_id.
+   */
+  private function loadFeatureCVterms() {
+    $batch_size = 100;
+    $num_features = count($this->features);
+    $num_batches = (int) ($num_features / $batch_size) + 1;
+
+    $this->setItemsHandled(0);
+    $this->setTotalItems($num_batches);
+
+    $init_sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id) VALUES \n";
+    $i = 0;
+    $j = 0;
+    $total = 0;
+    $batch_num = 1;
+    $sql = '';
+    $args = [];
+    foreach ($this->features as $uniquename => $feature) {
+      // Count every feature, including the ones skipped below. The batch
+      // count and the end-of-input test are both based on the full feature
+      // list, so counting only the new features would leave the final
+      // partial batch unwritten whenever a feature is skipped.
+      $i++;
+      $total++;
+
+      // Only do an insert if this feature doesn't already exist in the
+      // database.
+      if (!$this->doesFeatureAlreadyExist($feature)) {
+        $this->ensureFeatureIsLoaded($feature);
+
+        // Iterate through all of the ontology terms of this feature.
+        foreach ($feature['terms'] as $index => $info) {
+          // Ontology_term dbxrefs are never created by the importer, so a
+          // term may have no cvterm to link to.
+          if (!array_key_exists($index, $this->cvterm_lookup)) {
+            $this->logMessage("Skipping Ontology_term '$index' for feature '$uniquename': no matching cvterm was found.");
+            continue;
+          }
+          $j++;
+          $sql .= "(:feature_id_$j, :cvterm_id_$j, :pub_id_$j),\n";
+          $args[":feature_id_$j"] = $feature['feature_id'];
+          $args[":cvterm_id_$j"] = $this->cvterm_lookup[$index];
+          $args[":pub_id_$j"] = $this->null_pub->pub_id;
+        }
+      }
+
+      // If we've reached the size of the batch, or the end of the feature
+      // list, then let's do the insert.
+      if ($i == $batch_size or $total == $num_features) {
+        if (count($args) > 0) {
+          $sql = rtrim($sql, ",\n");
+          $sql = $init_sql . $sql;
+          chado_query($sql, $args, ['return' => Database::RETURN_INSERT_ID]);
+        }
+        $this->setItemsHandled($batch_num);
+        $batch_num++;
+
+        // Now reset all of the variables for the next batch.
+        $sql = '';
+        $i = 0;
+        $j = 0;
+        $args = [];
+      }
+    }
+  }
+
   /**
    *
    */
@@ -2132,7 +2230,6 @@ class GFF3Importer extends TripalImporter {
     $filesize = filesize($this->gff_file);
     $this->setTotalItems($filesize);
 
-
     $in_fasta = 0;
     $line_num = 0;
     $num_read = 0;