Parcourir la source

Addressed issue #1850226. Added a check to make sure that if a feature is being updated through a match on the name and it has a new uniquename, the new uniquename does not conflict with an existing feature of the same uniquename. Also cleaned up some of the error messages and added some more problem checking.

spficklin il y a 12 ans
Parent
commit
f5204888f4
1 fichiers modifiés avec 85 ajouts et 30 suppressions
  1. 85 30
      tripal_feature/includes/fasta_loader.inc

+ 85 - 30
tripal_feature/includes/fasta_loader.inc

@@ -50,7 +50,7 @@ function tripal_feature_fasta_load_form( ) {
     '#type' => 'textfield',
     '#title' => t('Sequence Type'),
     '#required' => TRUE,
-    '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the FASTA file.'),
+    '#description' => t('Please enter the Sequence Ontology (SO) term name that describes the sequences in the FASTA file (e.g. gene, mRNA, protein, etc...)'),
   );
 
 
@@ -99,18 +99,17 @@ function tripal_feature_fasta_load_form( ) {
       t('Name'),
       t('Unique name'),
     ),
-    '#description' => t('Feature data is stored in Chado with both a human-readable
-      name and a unique name. If the features in your FASTA file are identified using
+    '#description' => t('Used for "updates only" or "insert and update" methods. Not required if method type is "insert".  
+      Feature data is stored in Chado with both a human-readable
+      name and a unique name. If the features in your FASTA file are uniquely identified using
       a human-readable name then select the "Name" button. If your features are
-      identified using the unique name then select the "Unique name" button.  If you
+      uniquely identified using the unique name then select the "Unique name" button.  If you
       loaded your features first using the GFF loader then the unique name of each
       features were indicated by the "ID=" attribute and the name by the "Name=" attribute.
       By default, the FASTA loader will use the first word (character string
       before the first space) as  the name for your feature. If
       this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
-      Additionally, you may import both a name and a unique name for each sequence using the advanced options.
-      When updating a sequence, the value selected here will be used to identify the sequence in the
-      database in combination with any regular expression provided below.'),
+      Additionally, you may import both a name and a unique name for each sequence using the advanced options.'),
     '#default_value' => 1,
   );
 
@@ -484,6 +483,11 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
     $interval = 1;
   }
   $inv_read = 0;
+  
+  // we need to get the table schema to make sure we don't overrun the 
+  // size of fields with what our regular expressions retrieve
+  $feature_tbl = tripal_core_get_chado_table_schema('feature');
+  $dbxref_tbl = tripal_core_get_chado_table_schema('dbxref');
 
   //foreach ($lines as $line_num => $line) {  
   while ($line = fgets($fh)) {
@@ -496,7 +500,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
     if (preg_match('/^>/', $line)) {
       // if we have a feature name then we are starting a new sequence
       // so lets handle the previous one before moving on
-      if ($name or $uname) {
+      if ($name or $uname) {       
         tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
           $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
           $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
@@ -505,26 +509,42 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
         $uname = '';
       }
 
-      $line = preg_replace("/^>/", '', $line);
+      $line = preg_replace("/^>/", '', $line); // remove the > symbol from the defline
+     
       // get the feature name
       if ($re_name) {
         if (!preg_match("/$re_name/", $line, $matches)) {
-          print "WARNING: Regular expression for the feature name finds nothing\n";
+          watchdog('trp-fasta', "ERROR: Regular expression for the feature name finds nothing. Line %line.", array('%line' => $i), 'error');
+        }
+        elseif (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
+          watchdog('trp-fasta', "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array('%line' => $i), 'error');  
         }
-        $name = trim($matches[1]);
+        else {
+          $name = trim($matches[1]);
+        }        
       }
       else {
         // if the match_type is name and no regular expression was provided
         // then use the first word as the name, otherwise we don't set the name
         if (strcmp($match_type, 'Name')==0) {
-          preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches);
-          $name = trim($matches[1]);
+          if(preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)){
+            if (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
+              watchdog('trp-fasta', "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array('%line' => $i), 'error');  
+            }
+            else {
+              $name = trim($matches[1]);
+            }
+          }
+          else {
+            watchdog('trp-fasta', "ERROR: Cannot find a feature name. Line %line.", array('%line' => $i), 'error');  
+          }
         }
       }
+      
       // get the feature unique name
       if ($re_uname) {
         if (!preg_match("/$re_uname/", $line, $matches)) {
-          print "WARNING: Regular expression for the feature unique name finds nothing\n";
+          watchdog('trp-fasta', "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array('%line' => $i), 'error');
         }
         $uname = trim($matches[1]);
       }
@@ -532,13 +552,22 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
         // if the match_type is name and no regular expression was provided
         // then use the first word as the name, otherwise, we don't set the unqiuename
         if (strcmp($match_type, 'Unique name')==0) {
-          preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches);
-          $uname = trim($matches[1]);
+          if(preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)){
+            $uname = trim($matches[1]);
+          }
+          else {
+            watchdog('trp-fasta', "ERROR: Cannot find a feature unique name. Line %line.", array('%line' => $i), 'error');  
+          }
         }
       }
       // get the accession
       preg_match("/$re_accession/", $line, $matches);
-      $accession = trim($matches[1]);
+      if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
+        watchdog('trp-fasta', "WARNING: Regular expression retrieves an accession too long for the feature name. Cannot add cross reference. Line %line.", array('%line' => $i), 'warning');  
+      }
+      else {
+        $accession = trim($matches[1]);
+      }
 
       // get the relationship subject
       preg_match("/$re_subject/", $line, $matches);
@@ -552,16 +581,17 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type,
         $intv_read = 0;
         $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
         if ($name) {
-          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Parsing: $name\r";
+          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $name\r";
         }
         else {
-          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Parsing: $uname\r";  
+          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $uname\r";  
         }
         tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
       }
     }
   }
-   // now load the last sequence in the file
+  
+  // now load the last sequence in the file
   tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
     $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
     $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
@@ -596,7 +626,7 @@ function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $acce
     } 
     if (count($results) == 1) {  
       $feature = $results[0];
-    } 
+    }     
   }
   // check to see if this feature already exists if the match_type is 'Unique Name'
   if (strcmp($match_type, 'Unique name')==0) {
@@ -615,14 +645,20 @@ function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $acce
     } 
     if (count($results) == 1) {  
       $feature = $results[0];
+    }     
+    
+    // if the feature exists but this is an "insert only" method then skip this feature 
+    if ($feature and (strcmp($method, 'Insert only')==0)) {
+      watchdog('T_fasta_loader', "Feature already exists '%name' ('%uname') while matching on %type. Skipping insert.", 
+        array('%name' => $name, '%uname' => $uname, '%type' => drupal_strtolower($match_type)), WATCHDOG_WARNING);
+      return 0;
     } 
   }
 
   // if we don't have a feature and we're doing an insert then do the insert
   $inserted = 0;
   if (!$feature and (strcmp($method, 'Insert only')==0 or strcmp($method, 'Insert and update')==0)) {
-    // if we have a unique name but not a name then set them to be the same
-    // and vice versa
+    // if we have a unique name but not a name then set them to be the same and vice versa
     if (!$uname) {
       $uname = $name;
     }
@@ -666,23 +702,40 @@ function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $acce
       watchdog('T_fasta_loader', "Failed to retreive newly inserted feature '%name (%uname)'", 
         array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
       return 0;  
-    }   
+    }     
   }
   
-  // if we don't have a feature and the uesr wants to do an update then fail
+  // if we don't have a feature and the user wants to do an update then fail
   if (!$feature and (strcmp($method, 'Update only')==0 or drupal_strcmp($method, 'Insert and update')==0)) {
-    watchdog('T_fasta_loader', "Failed to find feature '%name' ('%uiname') while matching on " . 
-      drupal_strtolower($match_type), array('%name' => $name, '%uiname' => $uname), WATCHDOG_ERROR);
+    watchdog('T_fasta_loader', "Failed to find feature '%name' ('%uname') while matching on " . 
+      drupal_strtolower($match_type), array('%name' => $name, '%uname' => $uname), WATCHDOG_ERROR);
     return 0;
   }
-
+  
   // if we do have a feature and this is an update then proceed with the update
   if ($feature and !$inserted and (strcmp($method, 'Update only')==0 or strcmp($method, 'Insert and update')==0)) {
     // if the user wants to match on the Name field
     if (strcmp($match_type, 'Name')==0) {
-      // if we're matching on the name but do not have a new unique name then we don't want to update the uniquename.  
+      // if we're matching on the name but do not have a unique name then we don't want to update the uniquename.  
       $values = array();
       if ($uname) {
+        // first check to make sure that by changing the unique name of this feature that we won't conflict with
+        // another existing feature of the same name
+        $values = array(
+          'organism_id' => $organism_id,
+          'uniquename' => $uname,
+          'type_id' => $cvterm->cvterm_id,    
+        );    
+        $options = array('statement_name' => 'sel_feature_oruqty');
+        $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
+        if (count($results) > 0) {
+          watchdog('T_fasta_loader', "Cannot update the feature '%name' with a uniquename of '%uname' and type of '%type' as it 
+            conflicts with an existing feature with the same uniquename and type.", 
+            array('%name' => $name, '%uname' => $uname, '%type' => $type));
+          return 0;
+        } 
+        
+        // the changes to the uniquename don't conflict so proceed with the update
         $values = array(
           'uniquename' => $uname,
           'residues' => $residues,
@@ -698,7 +751,7 @@ function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $acce
         );
         $options = array('statement_name' => 'upd_feature_resemdisis_naorty_un');        
       }
-      // if we have a unique name then update it after matching by the name
+      // if we do not have a new unique name then don't change the existing uniquename field
       else {
         $values = array(                 
           'residues' => $residues,
@@ -714,6 +767,8 @@ function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $acce
         );
         $options = array('statement_name' => 'upd_feature_unresemdisis_naorty'); 
       }
+      
+      // perform the update
       $success = tripal_core_chado_update('feature', $match, $values, $options);
       if (!$success) {
         watchdog('T_fasta_loader', "Failed to update feature '%name' ('%name')",