Browse Source

warnings, typos, small cleanup

Anthony Bretaudeau 8 years ago
parent
commit
56fa9a3446

+ 1 - 4
tripal_feature/includes/tripal_feature.chado_node.inc

@@ -771,10 +771,7 @@ function tripal_feature_node_presave($node) {
         $organism_id = $node->feature->organism_id;
         $name        = $node->feature->name;
         $uname       = $node->feature->uniquename;
-        $type_id     = $node->feature->type_id;
-        $values = array('cvterm_id' => $type_id);
-        $ftype = chado_select_record('cvterm', array('name'), $values);
-        $type = $ftype[0]->name;
+        $type        = $node->feature->cvtname;
       }
 
       $values = array('organism_id' => $organism_id);

+ 43 - 26
tripal_feature/includes/tripal_feature.fasta_loader.inc

@@ -389,7 +389,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
   $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
   $job = NULL) {
   $transaction = db_transaction();
-  print "\nNOTE: Loading of this GFF file is performed using a database transaction. \n" .
+  print "\nNOTE: Loading of this Fasta file is performed using a database transaction. \n" .
      "If the load fails or is terminated prematurely then the entire set of \n" .
      "insertions/updates is rolled back and will not be found in the database\n\n";
   try {
@@ -410,10 +410,11 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
     }
 
     // Second, if there is a parent type then get that.
+    $parentcvterm = NULL;
     if ($parent_type) {
       $parentcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type,':synonym' => $parent_type))->fetchObject();
       if (!$parentcvterm) {
-        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the paretne term type: '%type'", array(
+        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the parent term type: '%type'", array(
           '%type' => $parentcvterm
         ));
         return 0;
@@ -421,6 +422,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
     }
 
     // Third, if there is a relationship type then get that.
+    $relcvterm = NULL;
     if ($rel_type) {
       $relcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence',':name' => $rel_type,':synonym' => $rel_type))->fetchObject();
       if (!$relcvterm) {
@@ -460,7 +462,10 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
     $num_seqs = 0;
     $prev_pos = 0;
     $set_start = FALSE;
+    $intv_read = 0;
+    $line_num = 0;
     while ($line = fgets($fh)) {
+      $line_num++;
       $num_read += strlen($line);
       $intv_read += strlen($line);
 
@@ -472,6 +477,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
         $defline = preg_replace("/^>/", '', $line);
 
         // Get the feature name if a regular expression is provided.
+        $name = "";
         if ($re_name) {
           if (!preg_match("/$re_name/", $defline, $matches)) {
             tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature name finds nothing. Line %line.", array(
@@ -506,6 +512,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
         }
 
         // Get the feature uniquename if a regular expression is provided.
+        $uname = "";
         if ($re_uname) {
           if (!preg_match("/$re_uname/", $defline, $matches)) {
             tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array(
@@ -515,7 +522,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
         }
         // If the match_type is name and no regular expression was provided
         // then use the first word as the name, otherwise, we don't set the
-        // unqiuename.
+        // uniquename.
         elseif (strcmp($match_type, 'Unique name') == 0) {
           if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
             $uname = trim($matches[1]);
@@ -527,19 +534,25 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
         }
 
         // Get the accession if a regular expression is provided.
-        preg_match("/$re_accession/", $defline, $matches);
-        if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
-          tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves an accession too long for the feature name. " .
-             "Cannot add cross reference. Line %line.", array('%line' => $i
-            ));
-        }
-        else {
-          $accession = trim($matches[1]);
+        $accession = "";
+        if (!empty($re_accession)) {
+            preg_match("/$re_accession/", $defline, $matches);
+            if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
+              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves an accession too long for the feature name. " .
+                 "Cannot add cross reference. Line %line.", array('%line' => $i
+                ));
+            }
+            else {
+              $accession = trim($matches[1]);
+            }
         }
 
         // Get the relationship subject
-        preg_match("/$re_subject/", $line, $matches);
-        $subject = trim($matches[1]);
+        $subject = "";
+        if (!empty($re_subject)) {
+            preg_match("/$re_subject/", $line, $matches);
+            $subject = trim($matches[1]);
+        }
 
         // Add the details to the sequence.
         $seqs[$num_seqs] = array(
@@ -566,18 +579,18 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
         $intv_read = 0;
         $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
         if ($name) {
-          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
+          print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
              " bytes.\r";
         }
         else {
-          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
+          print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
              " bytes.\r";
         }
         tripal_set_job_progress($job, intval(($num_read / $filesize) * 100));
       }
     }
     $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
-    print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
+    print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
        " bytes.\r";
     tripal_set_job_progress($job, 50);
 
@@ -586,9 +599,9 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
 
     // Now that we know where the sequences are in the file we need to add them.
     print "\nStep 2: Importing sequences\n";
-    for ($i = 0; $i < $num_seqs; $i++) {
-      $seq = $seqs[$i];
-      print "Importing " . ($i + 1) . " of $num_seqs. ";
+    for ($j = 0; $j < $num_seqs; $j++) {
+      $seq = $seqs[$j];
+      print "Importing " . ($j + 1) . " of $num_seqs. ";
       if ($name) {
         print "Current feature: " . $seq['name'] . ".\n";
       }
@@ -596,6 +609,7 @@ function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_un
         print "Current feature: " . $seq['uname'] . ".\n";
       }
 
+      $source = NULL;
       tripal_feature_load_fasta_feature($fh, $seq['name'], $seq['uname'], $db_id, $seq['accession'], $seq['subject'], $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name, $match_type, $parentcvterm, $relcvterm, $seq['seq_start'], $seq['seq_end']);
     }
     tripal_set_job_progress($job, 100);
@@ -666,7 +680,7 @@ function tripal_feature_load_fasta_feature($fh, $name, $uname, $db_id, $accessio
 
   // If we don't have a feature and we're doing an insert then do the insert.
   $inserted = 0;
-  if (!$feature and (strcmp($method, 'Insert only') == 0 or strcmp($method, 'Insert and update') == 0)) {
+  if (!isset($feature) and (strcmp($method, 'Insert only') == 0 or strcmp($method, 'Insert and update') == 0)) {
     // If we have a unique name but not a name then set them to be the same
     if (!$uname) {
       $uname = $name;
@@ -711,7 +725,7 @@ function tripal_feature_load_fasta_feature($fh, $name, $uname, $db_id, $accessio
   }
 
   // if we don't have a feature and the user wants to do an update then fail
-  if (!$feature and (strcmp($method, 'Update only') == 0 or
+  if (!isset($feature) and (strcmp($method, 'Update only') == 0 or
      drupal_strcmp($method, 'Insert and update') == 0)) {
     tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "Failed to find feature '%name' ('%uname') while matching on " .
       drupal_strtolower($match_type), array('%name' => $name,'%uname' => $uname));
@@ -912,7 +926,7 @@ function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_e
   fseek($fh, $seq_start, SEEK_SET);
   $chunk_size = 100000000;
   $chunk = '';
-  $seqlen = ($seq_end - $seq_start) + 1;
+  $seqlen = ($seq_end - $seq_start);
 
   // Calculate the interval at which we update the percent complete.
   $interval = intval($seqlen * 0.01);
@@ -938,9 +952,11 @@ function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_e
   // get a specific bytes read then append the sequence to the one in the
   // database.
   print "Sequence complete: 0%. Memory: " . number_format(memory_get_usage()) . " bytes. \r";
+  $partial_seq_size = 0;
   while ($line = fgets($fh)) {
     $num_read += strlen($line) + 1;
     $chunk_intv_read += strlen($line) + 1;
+    $partial_seq_size += strlen($line);
     $intv_read += strlen($line) + 1;
     $chunk .= trim($line);
 
@@ -956,7 +972,8 @@ function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_e
       if (!$success) {
         return FALSE;
       }
-      $total_seq_size += strlen($chunk);
+      $total_seq_size += $partial_seq_size;
+      $partial_seq_size = 0;
       $chunk = '';
       $chunk_intv_read = 0;
     }
@@ -967,7 +984,7 @@ function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_e
       $intv_read = 0;
     }
 
-    // If we've reached the ned of the sequence then break out of the loop
+    // If we've reached the end of the sequence then break out of the loop
     if (ftell($fh) == $seq_end) {
       break;
     }
@@ -985,7 +1002,7 @@ function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_e
     if (!$success) {
       return FALSE;
     }
-    $total_seq_size += strlen($chunk);
+    $total_seq_size += $partial_seq_size;
     $chunk = '';
     $chunk_intv_read = 0;
   }
@@ -995,7 +1012,7 @@ function tripal_feature_load_fasta_residues($fh, $feature_id, $seq_start, $seq_e
   chado_query($sql, array(':feature_id' => $feature_id
   ));
 
-  $percent = sprintf("%.2f", ($num_read / $seqlen) * 100);
+  $percent = sprintf("%.2f", ($total_seq_size / $seqlen) * 100);
   print "Sequence complete: " . $percent . "%. Memory: " . number_format(memory_get_usage()) .
      " bytes. \r";
 }