Ver código fonte

new GFF3 loader now supports aliases

Stephen Ficklin 4 anos atrás
pai
commit
6c90a75a49
1 arquivos alterados com 273 adições e 7 exclusões
  1. 273 7
      tripal_chado/includes/TripalImporter/GFF3Importer.inc

+ 273 - 7
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -194,6 +194,16 @@ class GFF3Importer extends TripalImporter {
    */
   private $featureprop_cvterm_lookup = [];
 
+  /**
+   * Holds the CV term for the "exact" synonym.
+   *
+   * Assigned by prepSynonms(). loadAliases() reads its cvterm_id and uses it
+   * as the type_id when selecting and inserting synonym records. Remains
+   * NULL until prepSynonms() has assigned it.
+   */
+  private $exact_syn = NULL;
+
+  /**
+   * Holds the object for the null publication record.
+   *
+   * Assigned by prepNullPub(). loadAliases() reads its pub_id and uses it
+   * as the pub_id of each feature_synonym record it inserts. Remains NULL
+   * until prepNullPub() has assigned it.
+   */
+  private $null_pub = NULL;
+
   /**
    * An array that stores existing features in the database for the organism
    * and feature types in the database.  This is used for quick lookups
@@ -208,7 +218,6 @@ class GFF3Importer extends TripalImporter {
    */
   private $features = [];
 
-
   /**
    * A mapping of features to their parents.
    */
@@ -582,21 +591,25 @@ class GFF3Importer extends TripalImporter {
       }
     }
 
-    // Prepare the temporary tables.
-    $this->prepTempTables();
+    // Make sure we have the synonym records and null publication ready to go.
+    $this->prepSynonms();
+    $this->prepNullPub();
 
     // Load the GFF3.
     $this->logMessage("Step 1: Preloading GFF3 file...");
-    $this->preLoad();
+    $this->parseGFF3();
 
     $this->logMessage("Step 2: Loading features...");
     $this->loadFeatures();
 
     $this->logMessage("Step 3: Loading feature locations...");
-    $this->loadFeatureLocs();
+    //$this->loadFeatureLocs();
 
     $this->logMessage("Step 4: Loading features properties...");
-    $this->loadFeatureProps();
+    //$this->loadFeatureProps();
+
+    $this->logMessage("Step 5: Loading features synonyms (aliases)...");
+    $this->loadAliases();
 
   }
 
@@ -617,6 +630,109 @@ class GFF3Importer extends TripalImporter {
     chado_query($sql);
   }
 
+  /**
+   * Makes sure Chado is ready with the necessary synonym type records.
+   *
+   * Looks up the 'synonym_type' vocabulary and its 'exact' term, adding
+   * either one when it is missing, and stores the term in $this->exact_syn.
+   *
+   * @return
+   *   0 when a required record could not be added; in that case a warning
+   *   is logged and $this->exact_syn is left unchanged. Nothing is returned
+   *   on success.
+   */
+  private function prepSynonms() {
+    // Make sure we have a 'synonym_type' vocabulary.
+    $select = ['name' => 'synonym_type'];
+    $results = chado_select_record('cv', ['*'], $select);
+
+    // empty() rather than count() so a non-array result from a failed select
+    // is handled the same way as "no records".
+    if (empty($results)) {
+      // Insert the 'synonym_type' vocabulary.
+      $values = [
+        'name' => 'synonym_type',
+        'definition' => 'vocabulary for synonym types',
+      ];
+      $success = chado_insert_record('cv', $values, [
+        'skip_validation' => TRUE,
+      ]);
+      if (!$success) {
+        $this->logMessage("Failed to add the synonyms type vocabulary.", [], TRIPAL_WARNING);
+        return 0;
+      }
+      // Now that we've added the cv we need to get the record. If it still
+      // cannot be found there is nothing to attach the term to, so stop here
+      // instead of dereferencing an undefined vocabulary below.
+      $results = chado_select_record('cv', ['*'], $select);
+      if (empty($results)) {
+        $this->logMessage("Failed to add the synonyms type vocabulary.", [], TRIPAL_WARNING);
+        return 0;
+      }
+    }
+    $syncv = $results[0];
+
+    // Get the 'exact' cvterm, which is the type of synonym we're adding.
+    $select = [
+      'name' => 'exact',
+      'cv_id' => [
+        'name' => 'synonym_type',
+      ],
+    ];
+    $result = chado_select_record('cvterm', ['*'], $select);
+    if (empty($result)) {
+      $term = [
+        'name' => 'exact',
+        'id' => "synonym_type:exact",
+        'definition' => '',
+        'is_obsolete' => 0,
+        'cv_name' => $syncv->name,
+        'is_relationship' => FALSE,
+      ];
+      $syntype = chado_insert_cvterm($term, ['update_existing' => TRUE]);
+      if (!$syntype) {
+        // Name the term that was actually being added (the previous message
+        // interpolated a variable that was never defined).
+        $this->logMessage("Cannot add synonym type: synonym_type:exact.", [], TRIPAL_WARNING);
+        return 0;
+      }
+    }
+    else {
+      $syntype = $result[0];
+    }
+    $this->exact_syn = $syntype;
+  }
+
+  /**
+   * Makes sure there is a null publication in the database.
+   *
+   * Looks for a pub record whose uniquename is 'null', inserts one when it is
+   * missing, and stores the record in $this->null_pub.
+   *
+   * @return
+   *   0 when the null publication could not be added; in that case a warning
+   *   is logged and $this->null_pub is left unchanged. Nothing is returned on
+   *   success.
+   */
+  private function prepNullPub(){
+
+    // Check to see if we have a NULL publication in the pub table.  If not,
+    // then add one.
+    $select = ['uniquename' => 'null'];
+    $result = chado_select_record('pub', ['*'], $select);
+    if (empty($result)) {
+      $pub_sql = "
+        INSERT INTO {pub} (uniquename,type_id)
+        VALUES (:uname,
+          (SELECT cvterm_id
+           FROM {cvterm} CVT
+             INNER JOIN {dbxref} DBX ON DBX.dbxref_id = CVT.dbxref_id
+             INNER JOIN {db} DB      ON DB.db_id      = DBX.db_id
+           WHERE CVT.name = :type_id))
+      ";
+
+      // Insert the null pub. The statement is an INSERT, so there is no row
+      // to fetch from it; only the query status itself is checked here and
+      // the new record is read back with a separate select below.
+      $status = chado_query($pub_sql, [
+        ':uname' => 'null',
+        ':type_id' => 'null',
+      ]);
+      if (!$status) {
+        $this->logMessage("Cannot add null publication needed for setup of alias.", [], TRIPAL_WARNING);
+        return 0;
+      }
+
+      // Read the record back and make sure it really is there before using it.
+      $result = chado_select_record('pub', ['*'], $select);
+      if (empty($result)) {
+        $this->logMessage("Cannot add null publication needed for setup of alias.", [], TRIPAL_WARNING);
+        return 0;
+      }
+    }
+    $this->null_pub = $result[0];
+  }
+
   /**
    * Parses the current line of the GFF3 file for a feature.
    *
@@ -685,6 +801,7 @@ class GFF3Importer extends TripalImporter {
     $attr_organism = $this->organism;
     $attr_parent = '';
     $attr_others = [];
+    $attr_aliases = [];
     foreach ($attrs as $attr) {
       $attr = rtrim($attr);
       $attr = ltrim($attr);
@@ -714,6 +831,9 @@ class GFF3Importer extends TripalImporter {
       if (strcmp($tag_name, 'organism') == 0) {
         $attr_organism = $this->getOrganism(urldecode($tag[1]));
       }
+      elseif (strcmp($tag_name, 'Alias') == 0) {
+        $attr_aliases = array_merge($attr_aliases, $tags[$tag_name]);
+      }
       elseif (strcmp($tag_name, 'Parent') == 0) {
         $attr_parent = urldecode($tag[1]);
       }
@@ -743,6 +863,7 @@ class GFF3Importer extends TripalImporter {
     $attr_name = $names['name'];
     $ret['name'] = $attr_name;
     $ret['uniquename'] = $attr_uniquename;
+    $ret['synonyms'] = $attr_aliases;
 
     // Now add all of the attributes into the return array.
     foreach ($tags as $key => $value) {
@@ -807,7 +928,7 @@ class GFF3Importer extends TripalImporter {
   /**
    *
    */
-  private function preLoad() {
+  private function parseGFF3() {
 
     $filesize = filesize($this->gff_file);
     $this->setTotalItems($filesize);
@@ -1193,6 +1314,151 @@ class GFF3Importer extends TripalImporter {
     }
   }
 
+  /**
+   * Loads the synonyms (aliases) of the features into Chado.
+   *
+   * Walks $this->features in batches. Features already present in
+   * $this->feature_lookup are skipped. For every other feature its synonyms
+   * are collected and, once a batch is full (and once more for the remainder),
+   * written to the synonym and feature_synonym tables.
+   *
+   * @throws Exception
+   *   If a feature that should have been inserted has no feature_id, or if a
+   *   batch cannot be written (see insertAliasBatch()).
+   */
+  private function loadAliases(){
+    $batch_size = 1000;
+    $num_features = count($this->features);
+    $num_batches = (int) ($num_features / $batch_size) + 1;
+
+    $this->setItemsHandled(0);
+    $this->setTotalItems($num_batches);
+
+    $i = 0;
+    $batch_num = 1;
+    $batch_synonyms = [];
+    $batch_featuresyn = [];
+    foreach ($this->features as $uniquename => $feature) {
+
+      // Only do an insert if this feature doesn't already exist in the
+      // database.
+      if (array_key_exists($feature['type'], $this->feature_lookup) and
+          array_key_exists($feature['uniquename'], $this->feature_lookup[$feature['type']])) {
+        continue;
+      }
+
+      $i++;
+
+      // If this feature doesn't have a feature_id then something is wrong.
+      if (!array_key_exists('feature_id', $feature)) {
+        throw new Exception(t('The feature, !feature, is in the GFF but somehow was not added to the database.',
+            ['!feature' => $uniquename . " (" . $feature['name'] . ") at line " . $feature['line'] . '.']));
+      }
+
+      // Get all of the synonyms for this batch. Not every entry is guaranteed
+      // to carry a 'synonyms' key, so a missing one means "no aliases".
+      if (!empty($feature['synonyms'])) {
+        foreach ($feature['synonyms'] as $synonym) {
+          $batch_synonyms[] = $synonym;
+          $batch_featuresyn[] = [$synonym, $feature['feature_id']];
+        }
+      }
+
+      // If we've reached the size of the batch then let's do the insert.
+      if ($i == $batch_size) {
+        $this->insertAliasBatch($batch_synonyms, $batch_featuresyn);
+
+        // Report this batch and move on to the next one so that progress
+        // actually advances.
+        $this->setItemsHandled($batch_num);
+        $batch_num++;
+
+        // Now reset all of the variables for the next batch.
+        $i = 0;
+        $batch_synonyms = [];
+        $batch_featuresyn = [];
+      }
+    }
+
+    // Add any remaining batch items.
+    if ($i > 0) {
+      $this->insertAliasBatch($batch_synonyms, $batch_featuresyn);
+      $this->setItemsHandled($batch_num);
+    }
+  }
+
+  /**
+   * Writes one batch of aliases to the synonym and feature_synonym tables.
+   *
+   * @param array $batch_synonyms
+   *   The synonym names collected for the batch. May contain repeats.
+   * @param array $batch_featuresyn
+   *   A list of [synonym name, feature_id] pairs collected for the batch.
+   *
+   * @throws Exception
+   *   If the "exact" synonym type or the null publication has not been set
+   *   up, or if a synonym cannot be found after it should have been inserted.
+   */
+  private function insertAliasBatch(array $batch_synonyms, array $batch_featuresyn) {
+    // Nothing to do for a batch in which no feature had an alias.
+    if (count($batch_synonyms) == 0) {
+      return;
+    }
+
+    // Both records are needed for every row written below.
+    if (!$this->exact_syn or !$this->null_pub) {
+      throw new Exception(t('Cannot load aliases because the "exact" synonym type or the null publication is not available.'));
+    }
+    $type_id = $this->exact_syn->cvterm_id;
+    $pub_id = $this->null_pub->pub_id;
+
+    // The same alias may be used by several features of the batch. Work with
+    // each name once so that a single multi-row INSERT never carries the
+    // same (name, type_id) twice.
+    $names = array_values(array_unique($batch_synonyms));
+
+    // First get the synonym_ids for those already in the database.
+    $syns_avail_sql = "SELECT synonym_id, name FROM {synonym} WHERE type_id = :type_id and name IN (:names)";
+    $syns_avail_args = [
+      ':type_id' => $type_id,
+      ':names' => $names,
+    ];
+    $syns_avail = chado_query($syns_avail_sql, $syns_avail_args)->fetchAllAssoc('name', PDO::FETCH_OBJ);
+
+    // Next, add any missing synonyms.
+    $syn_sql = '';
+    $syn_args = [];
+    $j = 0;
+    foreach ($names as $synonym) {
+      if (!array_key_exists($synonym, $syns_avail)) {
+        $j++;
+        $syn_sql .= "(:name_$j, :type_id_$j, ''),\n";
+        $syn_args[":name_$j"] = $synonym;
+        $syn_args[":type_id_$j"] = $type_id;
+      }
+    }
+    if ($syn_sql) {
+      $syn_sql = "INSERT INTO {synonym} (name, type_id, synonym_sgml) VALUES \n" . rtrim($syn_sql, ",\n");
+      chado_query($syn_sql, $syn_args, ['return' => Database::RETURN_INSERT_ID]);
+      // Re-read so the newly inserted synonyms have their ids available.
+      $syns_avail = chado_query($syns_avail_sql, $syns_avail_args)->fetchAllAssoc('name', PDO::FETCH_OBJ);
+    }
+
+    // Add in the feature synonym records for this batch.
+    $fsyn_sql = '';
+    $fsyn_args = [];
+    $seen = [];
+    $j = 0;
+    foreach ($batch_featuresyn as $featuresyn) {
+      list($synonym, $feature_id) = $featuresyn;
+
+      // A feature listing the same alias twice gets a single link.
+      $pair = $feature_id . ':' . $synonym;
+      if (isset($seen[$pair])) {
+        continue;
+      }
+      $seen[$pair] = TRUE;
+
+      if (!array_key_exists($synonym, $syns_avail)) {
+        throw new Exception(t('The synonym, !synonym, could not be found after it was added to the database.',
+            ['!synonym' => $synonym]));
+      }
+
+      $j++;
+      $fsyn_sql .= "(:synonym_id_$j, :feature_id_$j, :pub_id_$j),\n";
+      $fsyn_args[":synonym_id_$j"] = $syns_avail[$synonym]->synonym_id;
+      $fsyn_args[":feature_id_$j"] = $feature_id;
+      $fsyn_args[":pub_id_$j"] = $pub_id;
+    }
+    $fsyn_sql = "INSERT INTO {feature_synonym} (synonym_id, feature_id, pub_id) VALUES \n" . rtrim($fsyn_sql, ",\n");
+    chado_query($fsyn_sql, $fsyn_args, ['return' => Database::RETURN_INSERT_ID]);
+  }
+
 
   /**
    * Load a GFF3 file. This is the function called by tripal jobs