Browse Source

Merge pull request #11 from srobb1/noLoops

Update GFF3 export to be more efficient.
Lacey-Anne Sanderson 8 years ago
parent
commit
8259aa73d5
1 changed files with 84 additions and 62 deletions
  1. 84 62
      api/blast_ui.api.inc

+ 84 - 62
api/blast_ui.api.inc

@@ -744,7 +744,11 @@ function convert_tsv2gff3($blast_tsv,$blast_gff){
   $tsv = fopen($blast_tsv, "r") or die("Unable to open tsv file!");
 
   // For each line in the TSV file...
-  $results = array();
+  // Need to go through each line of the TSV to find the first and last hsp of a hit.
+  $last_s = NULL;
+  $hsp = NULL;
+  $HitResult=array();
+  
   while(!feof($tsv)) {
     $line = fgets($tsv);
     $line = rtrim($line);
@@ -754,6 +758,8 @@ function convert_tsv2gff3($blast_tsv,$blast_gff){
       continue;
     }
 
+    ## for keeping track of new queries and hits
+
     // Each line has the following parts:
     //  0: query id,
     //  1: subject id,
@@ -769,73 +775,89 @@ function convert_tsv2gff3($blast_tsv,$blast_gff){
     // 11: bit score
     $parts = preg_split('/\t/', $line);
 
-    // Assign the important parts of the time to readable variables.
-    $hitname = $parts[1];
-    $queryId = $parts[0];
-    $subjectStart = $parts[8];
-    $subjectEnd = $parts[9];
-    $queryStart = $part[6];
-    $queryEnd = $parts[7];
-    $eval = $parts[10];
-    $hspInfo = "$queryId,$subjectStart,$subjectEnd,$queryStart,$queryEnd";
-    $results[$hitname][$hspInfo] = $eval;
+    // Assign the important parts of the line to readable variables.
+    $s = $parts[1];
+    $q = $parts[0];
+    $ss = $parts[8];
+    $se = $parts[9];
+    $qs = $part[6];
+    $qe = $parts[7];
+    $e = $parts[10];
+     
+
+    // If this line starts a new hit, print the previous hit,
+    // empty the $HitResult array, and
+    // reset the hsp counter.
+    if ($last_s != NULL and $s != $last_s ) {
+      printGFF_parent_children($gff,$HitResult);
+      $HitResult = array();
+      $hsp=0;
+    }
+  
+   // every line is a new hsp
+    $hsp++;
+  
+    // determine query strand to use in match_part line, no need to store, just print
+    $q_strand = '+';
+    if ($qs > $qe) {
+        list($qs,$qe) = array($qe,$qs);
+        $q_strand = '-';
+    }
+
+    // determine subject (hit) strand to use in match line, needs to be stored
+    $HitResult["$s,$q"]['strand']='+';
+    list($start,$end) = array($ss,$se);
+    if($ss > $se) {
+       list($start,$end) = array($se,$ss);
+       $HitResult["$s,$q"]['strand']='-';
+     }
+  
+    // store smallest start
+     if (!array_key_exists('SS',$HitResult["$s,$q"]) or $ss < $HitResult["$s,$q"]['SS']) {
+       $HitResult["$s,$q"]['SS'] = $ss;
+     }
+    
+    // store largest end
+     if (!array_key_exists('SE',$HitResult["$s,$q"]) or $se > $HitResult["$s,$q"]['SE']) {
+       $HitResult["$s,$q"]['SE'] = $se;
+     }
+
+     // store best evalue
+     if (!array_key_exists('E',$HitResult["$s,$q"]) or $e < $HitResult["$s,$q"]['E']) {
+       $HitResult["$s,$q"]['E'] = $e;
+     }   
+    
+     // generate the match_part line for each hsp
+     $HitResult["$s,$q"]['HSPs'][] = join("\t", array($s, "BLASTRESULT" , "match_part" , $start , $end , $e , $HitResult["$s,$q"]['strand'] , '.' , "ID=$s.$q.$hsp;Parent=$s.$q;Target=$q $qs $qe $q_strand"));
+     $last_s = $s;
   } // end tsv file while
 
-  $IDs = array();
-  $count = 0;
-  $last_s = NULL;
+  // print hit and hsp for the last hit
+  printGFF_parent_children($gff,$HitResult);
 
-    // Need to go thru each line of tsv to find the first and last hsp of a hit.
-    // Need to get the smallest and largest coordinate for the parent feature
-    foreach ($results as $s => $hspInfoArray) {
-      $count++;
-      $hsp = 0;
-      foreach ($hspInfoArray as $hspInfoStr => $e) {
-        list($q,$ss,$se,$qs,$qe) = preg_split('/,/',$hspInfoStr);
-        if ($s != NULL and  $s != $last_s ) {
-          $hsp=0;
-        }
-        $IDs["$s,$q"]['count']=$count;
-        $q_strand = '+';
-        if ($qs > $qe) {
-          list($qs,$qe) = array($qe,$qs);
-          $q_strand = '-';
-        }
-        $IDs["$s,$q"]['strand']='+';
-        list($start,$end) = array($ss,$se);
-        if($ss > $se) {
-          list($start,$end) = array($se,$ss);
-          $IDs["$s,$q"]['strand']='-';
-        }
-        if (!array_key_exists('SS',$IDs["$s,$q"]) or $ss < $IDs["$s,$q"]['SS']) {
-          $IDs["$s,$q"]['SS'] = $ss;
-        }
-        if (!array_key_exists('SE',$IDs["$s,$q"]) or $se > $IDs["$s,$q"]['SE']) {
-          $IDs["$s,$q"]['SE'] = $se;
-        }
-        if (!array_key_exists('E',$IDs["$s,$q"]) or $e < $IDs["$s,$q"]['E']) {
-          $IDs["$s,$q"]['E'] = $e;
-        }
-        $hsp++;
-        $IDs["$s,$q"]['HSPs'][] = join("\t", array($s, "BLASTRESULT" , "match_part" , $start , $end , $e , $IDs["$s,$q"]['strand'] , '.' , "ID=$s.$count.$hsp;Parent=$s.$count;Target=$q $qs $qe $q_strand"));
-        $last_s = $s;
-      }
-    }
+  // Close the files.
+  fclose($tsv);
+  fclose($gff);
+}
 
-  // Now can print a parent gff line and all the children.
-  // Note: the evalues seem to be sorted properly without actually sorting them.
-  // @todo: need to make sure this is always true.
-  foreach ($IDs as $sq => $value ) {
+/**
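+ *   printGFF_parent_children
+ *   Prints the GFF parent feature and all of its child features.
+ *
+ *  @param $gff
+ *  the handle that the parent and child lines are written to with fwrite()
+ *  @param $blast_feature_array
+ *  an array keyed by "subject,query"; each entry holds the 'SS' (smallest start), 'SE' (largest end), 'E' and 'strand' values for the parent line and the 'HSPs' child lines
+ *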
+ */
+
+function printGFF_parent_children ($gff,$blast_feature_array){
+  foreach ($blast_feature_array as $sq => $value ) {
     list ($s,$q) = preg_split('/,/' , $sq);
-    $evalue =  $IDs["$s,$q"]['E'];
-    $count = $IDs["$s,$q"]['count'];
-    $parent =  join ("\t", array($s, "BLASTRESULT" , "match" , $IDs["$s,$q"]['SS'] , $IDs["$s,$q"]['SE'] , $IDs["$s,$q"]['E'] , $IDs["$s,$q"]['strand'] , '.' , "ID=$s.$count;Name=$q($evalue)")) . "\n";
-    $child = join ("\n",$IDs[$sq]['HSPs']) . "\n";
+    $evalue =  $blast_feature_array["$s,$q"]['E'];
+    $parent =  join ("\t", array($s, "BLASTRESULT" , "match" , $blast_feature_array["$s,$q"]['SS'] , $blast_feature_array["$s,$q"]['SE'] , $blast_feature_array["$s,$q"]['E'] , $blast_feature_array["$s,$q"]['strand'] , '.' , "ID=$s.$q;Name=$q($evalue)")) . "\n";
+    $child = join ("\n",$blast_feature_array["$s,$q"]['HSPs']) . "\n";
     fwrite($gff,$parent);
     fwrite($gff,$child);
   }
-
-  // Close the files.
-  fclose($tsv);
-  fclose($gff);
 }