Browse Source

More fixes for GO problems...

Stephen Ficklin 6 years ago
parent
commit
544a49d9f4
1 changed files with 181 additions and 118 deletions
  1. 181 118
      tripal_chado/includes/TripalImporter/OBOImporter.inc

+ 181 - 118
tripal_chado/includes/TripalImporter/OBOImporter.inc

@@ -195,6 +195,15 @@ class OBOImporter extends TripalImporter {
    * @var string
    */
   private $is_subset = FALSE;
+  
+  /**
+   * Term names seen so far while caching term stanzas.
+   *
+   * Sometimes an OBO can define two terms with the same name but different
+   * IDs (e.g. GO:0001404 and GO:0007125). We need to find these and
+   * deal with them. This array keeps track of term names as we see them for
+   * easy lookup later. The term name is the array key (the value is always 1)
+   * so a name can be checked with array_key_exists().
+   *
+   * @var array
+   */
+  private $term_names = [];
 
   /**
    * @see TripalImporter::form()
@@ -1128,15 +1137,15 @@ class OBOImporter extends TripalImporter {
     // Is this term borrowed from another ontology?  
     $is_borrowed = $this->isTermBorrowed($stanza);
     
-    // The cvterm ChadoRecord object.
+    // Will hold the cvterm ChadoRecord object.
     $cvterm = NULL;
-    
+        
     // Get the CV and DB objects.
     $cv = $this->all_cvs[$cvname];
     $db = $this->all_dbs[$dbname];
-    
+        
     // If this is set to TRUE then we should insert the term.
-    $do_cvterm_insert = FALSE;
+    $do_cvterm_insert = TRUE;
     
     // We need to locate terms using their dbxref. This is because term names
     // can sometimes change, so we don't want to look up the term by it's name.
@@ -1147,137 +1156,87 @@ class OBOImporter extends TripalImporter {
       'accession' => $accession
     ]);
     if ($dbxref->find()) {
-      // Now see if there is a cvterm for this dbxref. A dbxref can only be 
-      // used one time. If so, then update the term, but only if this term
-      // belongs to namespace created by this OBO. We don't want to update
-      // terms from other OBOs that may be borrowed by this one.
-      $cvterm = new ChadoRecord('cvterm');
-      $cvterm->setValues(['dbxref_id' => $dbxref->getID()]);
-      if ($cvterm->find()) {
+      
+      // Does this accession already have a cvterm it's associated with? Then
+      // we need to make sure we update the name. Names change but accessions
+      // always refer to the same term.
+      $dbx_cvterm = new ChadoRecord('cvterm');
+      $dbx_cvterm->setValues(['dbxref_id' => $dbxref->getID()]);
+      if ($dbx_cvterm->find()) {
+        $do_cvterm_insert = FALSE;
+        $cvterm = $dbx_cvterm;
+        
+        // We don't want to do any updates for borrowed terms. Just leave them
+        // as they are.
         if (!$is_borrowed) {
-
+          
+          // Let's make sure we don't have a conflict in term naming 
+          // if we change the name of this term.
+          $this->fixTermMismatch($stanza, $dbxref, $cv, $name);
+          
+          // Now update this cvterm record.
           $cvterm->setValue('name', $name);
           $cvterm->setValue('definition', $definition);
-          $cvterm->setValue('is_obsolete', $is_obsolete);          
+          $cvterm->setValue('is_obsolete', $is_obsolete);
           
-          // Case #1:  The name of the cvterm with this accession matches
-          // the name in the stanza. All is well and we can update.
-          if ($cvterm->getValue('name') == $stanza['name'][0]) {
-            // We'll do the upate at the end of this large if block.
+          try {
+            $cvterm->update();
           }
-          // Case #2:  The name of this cvterm is different from the stanza.
-          else {
-          
-            // So the names are different, Does a term exist with
-            // the name for this term already?  We can't update a term to
-            // have the same name as another. It will break the cvterm 
-            // unique constraints.  
-            $check_cvterm = new ChadoRecord('cvterm');
-            $check_cvterm->setValues(['cv_id' => $cv->cv_id, 'name' => $name]);
-            if ($check_cvterm->find()) {
-              
-              // Get the accession of this conflicting term and see if it 
-              // exists in the OBO that's being loaded.
-              $check_dbxref = new ChadoRecord('dbxref', $cvterm->getValue('dbxref_id'));
-              $check_db = new ChadoRecord('db', $check_dbxref->getValue('db_id'));
-              $check_accession = $check_db->getValue('name') . ':' . $check_dbxref->getValue('accession');
-              $check_stanza = $this->getCachedTermStanza($check_accession);
-                            
-              // Case 2a:  The other term that currently has the same name is 
-              // missing in the OBO file (i.e. no stanza).  So, that means
-              // that this term probably got relgated to an alt_id.  We do 
-              // not want to delete this term because it may be linked to other
-              // records. Instead, let's update it's name to let folks know
-              // what happend to it and so we can get around the unique
-              // constraint.  An example of this is the GO:0015881 and
-              // GO:1902598 terms where the latter became an alt_id of the 
-              // first and no longer has its own entry.
-              if (!$check_stanza) {
-                $check_cvterm->setValue('name', $check_cvterm->getValue('name') . ' (removed from . ' . $check_db->getValue('name') . ')');
-                $check_cvterm->update();
-              }
-              
-              // Case 2b:  The other term is in the OBO file (ie. has a stanza).
-              // That means that there has been some name swapping between 
-              // terms. We need to temporarily rename that term so that
-              // we don't have a unique constraint violation when we update
-              // this one.  An example of this is where GO:000425 and 
-              // GO:0030242 changed names and one was renamed to the previous 
-              // name of the other.
-              else {
-                $check_cvterm->setValue('name', $check_cvterm->getValue('name') . ' (term name needs update)');
-                $check_cvterm->update();
-              }             
-            }
-            
-            // Case 2c:  The name has changed but there is no ther term
-            // with the same new name. We are good!  
-            else {
-              // Do nothing, let the update for this term occur at the 
-              // end of the if block.
-            }
+          catch (Exception $e) {
+            $this->logMessage('Could not update the term, "!term", with name, "!name" for vocabulary, "!vocab". ERROR: !error.',
+              ['!term' => $id,
+                '!name' => $name,
+                '!vocab' => $cv->name,
+                '!error' => $e->getMessage(),
+              ],
+              TRIPAL_ERROR);
+            throw $e;
           }
-          
-          // Now update this cvterm record.
-          $cvterm->update();
-        }
-      }
-      else {
-        $do_cvterm_insert = TRUE;
-      }
+        }                                    
+      }      
     }
-    else {
-      // The dbxref doesn't exist, so let's add it.
+    // The dbxref doesn't exist, so let's add it.
+    else {      
       $dbxref->insert();
-      $do_cvterm_insert = TRUE;
     }
     
+    // Add the cvterm if we didn't do an update.
     if ($do_cvterm_insert) {
-      // Before inserting the term let's check to see if it already exists. The
-      // cvterm table has two unique constraints, one on the dbxref and another
-      // on the name, cv_id and is_obsolete columns.  If for some reason the
-      // term exists but is assocaited with a different dbxref we'll have 
-      // a problem.  Because the is_obsolete may be set this time, but the
-      // term may not have it set from a previous load we will check for the
-      // presence of the term without the is_obsolete.
+      
+      // Before inserting the term let's check to see if a term with this
+      // name already exists in the vocabulary and make corrections.
       $cvterm = new ChadoRecord('cvterm');
+      $cvterm->setValue('cv_id', $cv->cv_id);
+      $cvterm->setValue('name', $name);
+      if ($cvterm->find()) {
+        $fixed = $this->fixTermMismatch($stanza, $dbxref, $cv, $name);
+      }
+      
+      // Set all of the values for the new term so we can do our insert.
       $cvterm->setValues([
         'cv_id' => $cv->cv_id,
         'name' => $name,
+        'definition' => $definition,
+        'dbxref_id' => $dbxref->getID(),
+        'is_relationshiptype' => $is_relationship,
+        'is_obsolete' => $is_obsolete,
       ]);
-      if ($cvterm->find()) {
-        // We found the term so that means it's assocaited with a different
-        // dbxref.  We need to correct it.
-        if (!$is_borrowed) {
-          $old_dbxref = new ChadoRecord('dbxref');
-          $old_dbxref->setValues(['dbxref_id' => $cvterm->getValue('dbxref_id')]);
-          $old_dbxref->find();
-          $old_db = new ChadoRecord('db');
-          $old_db->setValues(['db_id' => $dbxref->getValue('db_id')]);
-          $old_db->find();
-          $this->logMessage('Correcting misassigned accession: !id => "!name". Previously was: !old_id.', 
-            ['!id' => $id, '!name' => $name, 
-             '!old_id' => $old_db->getValue('name') . ':' . $old_dbxref->getValue('accession')]);
-          $cvterm->setValue('dbxref_id', $dbxref->getID());
-          $cvterm->setValue('definition', $definition);
-          $cvterm->setValue('dbxref_id', $dbxref->getID());
-          $cvterm->setValue('is_obsolete', $is_obsolete);
-          $cvterm->update();
-        }
-      }
-      else {
-
-        // The term doesnt exist, so let's just do our insert.
-        $cvterm->setValues([
-          'cv_id' => $cv->cv_id,
-          'name' => $name,
-          'definition' => $definition,
-          'dbxref_id' => $dbxref->getID(),
-          'is_relationshiptype' => $is_relationship,
-          'is_obsolete' => $is_obsolete,
-        ]);
+      
+      // If the insert fails let's catch the error so we can
+      // give a more informative message.
+      try {
         $cvterm->insert();
       }
+      catch (Exception $e) {
+        $this->logMessage('Could not insert the term, "!term", with name, "!name" for vocabulary, "!vocab". ERROR: !error.',
+          ['!term' => $id,
+           '!name' => $name,
+           '!vocab' => $cv->name,
+           '!error' => $e->getMessage(),
+          ], 
+          TRIPAL_ERROR);
+        throw $e;
+      }
     }
     
     // Save the cvterm_id for this term so we don't look it up again.
@@ -1288,6 +1247,95 @@ class OBOImporter extends TripalImporter {
     return $cvterm_id;
   }
 
+  /**
+   * Resolves a name conflict between the term being saved and another term.
+   *
+   * Renaming or inserting a term can break the cvterm unique constraint when
+   * a different term in the same vocabulary already holds the name. This
+   * method looks for such a term and, if one is found, renames it by appending
+   * its own accession, e.g. "name (GO:0001404)". The conflicting term is
+   * never deleted because it may be linked to other records.
+   *
+   * @param $stanza
+   *   The term stanza array from the OBO file. When it provides a name, that
+   *   name takes precedence over the $name argument.
+   * @param $dbxref
+   *   The ChadoRecord object containing the dbxref record for the term
+   *   to be inserted/updated.
+   * @param $cv
+   *   The vocabulary (cv) object. Its cv_id limits the search for conflicts.
+   * @param $name
+   *   The name of the term that is a potential conflict. It is only used
+   *   when the stanza does not provide a name.
+   *
+   * @return
+   *   Returns TRUE if a conflict was found and corrected, FALSE otherwise.
+   */
+  public function fixTermMismatch($stanza, $dbxref, $cv, $name) {
+
+    // Prefer the name given in the stanza. Fall back to the caller-supplied
+    // name so the argument is not silently discarded when the stanza has none.
+    if (isset($stanza['name'][0])) {
+      $name = $stanza['name'][0];
+    }
+
+    // First get the record for any potential conflicting term.
+    $check_cvterm = new ChadoRecord('cvterm');
+    $check_cvterm->setValues(['cv_id' => $cv->cv_id, 'name' => $name]);
+    if (!$check_cvterm->find()) {
+      // We have no conflict so it's safe to update or insert.
+      return FALSE;
+    }
+
+    // If the dbxref of this matched term is the same as the current term
+    // then it is the same term and there is no conflict.
+    if ($dbxref->getID() == $check_cvterm->getValue('dbxref_id')) {
+      return FALSE;
+    }
+
+    // At this point, we have a cvterm with the same name and vocabulary
+    // but with a different dbxref. First let's get that other accession.
+    $check_dbxref = new ChadoRecord('dbxref', $check_cvterm->getValue('dbxref_id'));
+    $check_db = new ChadoRecord('db', $check_dbxref->getValue('db_id'));
+    $check_accession = $check_db->getValue('name') . ':' . $check_dbxref->getValue('accession');
+
+    // Both cases below are resolved the same way: the conflicting term gets
+    // its accession appended to its name. That frees the name for the current
+    // term and gets us around the unique constraint without deleting anything.
+    $new_name = $check_cvterm->getValue('name') . ' (' . $check_accession . ')';
+
+    // Case 1:  The other term that currently has the same name is missing in
+    // the OBO file (i.e. no stanza). So, that means that it probably got
+    // relegated to an alt_id on another term. An example of this is the
+    // GO:0015881 and GO:1902598 terms where the latter became an alt_id of
+    // the first and no longer has its own entry. This rename is not logged.
+    //
+    // Case 2:  The conflicting term is in the OBO file (i.e. has a stanza).
+    // That means that there has been some name swapping between terms. An
+    // example of this is where GO:000425 and GO:0030242 changed names and one
+    // was renamed to the previous name of the other. This rename is meant to
+    // be temporary and it is logged.
+    if ($this->getCachedTermStanza($check_accession)) {
+      $this->logMessage('Renaming !id1 to "!new_name".',
+        ['!id1' => $check_accession, '!new_name' => $new_name]);
+    }
+    $check_cvterm->setValue('name', $new_name);
+    $check_cvterm->update();
+
+    return TRUE;
+  }
   /**
    * Uses the provided term array to add/update information to Chado about the
    * term including the term, dbxref, synonyms, properties, and relationships.
@@ -1638,7 +1686,7 @@ class OBOImporter extends TripalImporter {
       throw new Exception('Cannot cache terms without a default DB.' . print_r($stanza, TRUE));
     }
     
-    $id = $stanza['id'][0];
+    $id = $stanza['id'][0]; 
     
     // First check if this term is already in the cache, if so then skip it.
     if ($this->getCachedTermStanza($id)) {
@@ -1740,9 +1788,15 @@ class OBOImporter extends TripalImporter {
     }
     
 
+    // Cache the term stanza
     $this->termStanzaCache['ids'][$id] = $stanza;
     $this->termStanzaCache['count'][$type]++;
     $this->termStanzaCache['types'][$type][] = $id;
+    
+    // Cache the term name so we don't have conflicts.
+    $name = $stanza['name'][0];
+    $this->term_names[$name] = 1;
+    
   }
   
   /**
@@ -1919,6 +1973,15 @@ class OBOImporter extends TripalImporter {
             $cv = $this->all_cvs[$namespace];
             $this->obo_namespaces[$namespace] = $cv->cv_id;
           }
+          
+          // Before caching this stanza, check the term's name to
+          // make sure it doesn't conflict with a name we have already
+          // seen. If it does we'll just add the ID to the name to ensure
+          // it doesn't.
+          if (array_key_exists($stanza['name'][0], $this->term_names)) {
+            $new_name = $stanza['name'][0] . '(' . $stanza['id'][0] .')';
+            // Store the de-duplicated name back on the stanza so that the
+            // cached term actually carries it.
+            $stanza['name'][0] = $new_name;
+          }
+          
           $this->cacheTermStanza($stanza, $type);
           
         }