Browse Source

Fixed a few bugs in the GFF loader, sped up insertions, and added the 'score' column of the GFF file to the 'significance' column of the analysisfeature table

spficklin 13 years ago
parent
commit
72355c4080
1 changed files with 105 additions and 27 deletions
  1. 105 27
      tripal_feature/gff_loader.php

+ 105 - 27
tripal_feature/gff_loader.php

@@ -224,8 +224,13 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
    $previous_db = tripal_db_set_active('chado');
    print "Opening $gff_file\n";
     
-   $lines = file($dfile,FILE_SKIP_EMPTY_LINES);
-   $i = 0;
+   //$lines = file($dfile,FILE_SKIP_EMPTY_LINES);
+   $fh = fopen($dfile,'r');
+   if(!$fh){
+      print "ERROR: cannot open file: $dfile\n";
+      return 0;
+   }
+   $filesize = filesize($dfile);
 
    // get the controlled vocaubulary that we'll be using.  The
    // default is the 'sequence' ontology
@@ -240,19 +245,23 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
    $sql = "SELECT * FROM organism WHERE organism_id = %d";
    $organism = db_fetch_object(db_query($sql,$organism_id));
 
-
-   $num_lines = sizeof($lines);
-   $interval = intval($num_lines * 0.01);
+   $interval = intval($filesize * 0.01);
    if($interval == 0){
-      $interval = $num_lines;
+      $interval = 1;
    }
    $in_fasta = 0;
-   foreach ($lines as $line_num => $line) {
-   
-      $i++;  // update the line count
+//   foreach ($lines as $line_num => $line) {
+   $line_num = 0;
+   $num_read = 0;
+
+   while($line = fgets($fh)){
+
+      $line_num++;
+      $num_read += strlen($line);
+
       // update the job status every 1% features
-      if($job and $i % $interval == 0){
-         tripal_job_set_progress($job,intval(($i/$num_lines)*100));
+      if($job and $num_read % $interval == 0){
+         tripal_job_set_progress($job,intval(($num_read/$filesize)*100));
       }
       // check to see if we have FASTA section, if so then set the variable
       // to start parsing
@@ -277,7 +286,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
       // remove URL encoding and get the columns
       $cols = explode("\t",$line);
       if(sizeof($cols) != 9){
-         print "ERROR: improper number of columns on line $i\n";
+         print "ERROR: improper number of columns on line $line_num\n";
          print_r($cols);
          return '';
       }
@@ -323,10 +332,10 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
                     WHERE CV.cv_id = %d and (CVT.name = '%s' or CVTS.synonym = '%s')";
       $cvterm = db_fetch_object(db_query($cvtermsql,$cv->cv_id,$type,$type));
       if(!$cvterm){
-         print "ERROR: cannot find ontology term '$type' on line $i.\n";
+         print "ERROR: cannot find ontology term '$type' on line $line_num.\n";
          return '';
       }
-      
+
       // break apart each of the attributes
       $tags = array();
       $attr_name = '';
@@ -346,7 +355,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
             continue;
          }
          if(!preg_match('/^[^\=]+\=[^\=]+$/',$attr)){
-            print "ERROR: attribute is not correctly formatted on line $i: $attr\n";
+            print "ERROR: attribute is not correctly formatted on line $line_num: $attr\n";
             return '';
          }
 
@@ -375,7 +384,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
          if(array_key_exists('Parent',$tags)){
             $attr_uniquename = $tags['Parent'][0]."-$type-$landmark:$fmin..$fmax";
          } else { 
-           print "ERROR: cannot generate a uniquename for feature on line $i\n";
+           print "ERROR: cannot generate a uniquename for feature on line $line_num\n";
            exit;
          }
          $attr_name = $attr_uniquename;
@@ -415,7 +424,6 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
             return '';
          }
       }
-
       
       // if the option is to remove or refresh then we want to remove
       // the feature from the database.
@@ -435,10 +443,10 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
     
 
          // add/update the feature
-         print "$i ";
+         print "line $line_num, ". intval(($num_read/$filesize)*100). "%. ";
          $feature = tripal_feature_load_gff3_feature($organism,$analysis_id,$cvterm,
             $attr_uniquename,$attr_name,$residues,$attr_is_analysis,
-            $attr_is_obsolete, $add_only);
+            $attr_is_obsolete, $add_only,$score);
 
          // store all of the features for use later by parent and target
          // relationships
@@ -477,12 +485,11 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
             // add any additional attributes
             if($attr_others){
                foreach($attr_others as $property => $value){
-                  print "   Setting feature property: $property -> $value\n";
                   tripal_feature_load_gff3_property($feature,$property,$value);
                }
             }
          }
-      }      
+      } 
    }
    // now set the rank of any parent/child relationships.  The order is based
    // on the fmin.  The start rank is 1.  This allows features with other
@@ -858,7 +865,7 @@ function tripal_feature_load_gff3_alias($feature,$aliases){
  * @ingroup gff3_loader
  */
 function tripal_feature_load_gff3_feature($organism,$analysis_id,$cvterm,$uniquename,$name,
-   $residues,$is_analysis='f',$is_obsolete='f',$add_only)  {
+   $residues,$is_analysis='f',$is_obsolete='f',$add_only,$score)  {
 
    // check to see if the feature already exists
    $feature_sql = "SELECT * FROM {feature} 
@@ -905,17 +912,37 @@ function tripal_feature_load_gff3_feature($organism,$analysis_id,$cvterm,$unique
       print "Skipping existing feature: '$uniquename' ($cvterm->name).\n";
       return 0;
    }
+
    // get the newly added feature
    $feature = db_fetch_object(db_query($feature_sql,$organism->organism_id,$uniquename,$cvterm->cvterm_id));
 
    // add the analysisfeature entry to the analysisfeature table if it doesn't already exist
-   $af_values = array('analysis_id' => $analysis_id, 'feature_id' => $feature->feature_id);
-   if(tripal_core_chado_select('analysisfeature',array('analysisfeature_id'),$af_values,array('has_record'))){
+   $af_values = array(
+      'analysis_id' => $analysis_id, 
+      'feature_id' => $feature->feature_id
+   );
+   $afeature = tripal_core_chado_select('analysisfeature',array('analysisfeature_id'),$af_values,array('has_record'));
+   if(count($afeature)==0){
+      // if a score is available then set that to be the significance field
+      if(strcmp($score,'.')!=0){
+        $af_values['significance'] = $score;
+      }
       if(!tripal_core_chado_insert('analysisfeature',$af_values)){
          print "ERROR: could not add analysisfeature record: $analysis_id, $feature->feature_id\n";
       } else {
          print "   Added analysisfeature record\n";
       }
+   } else {
+      // if a score is available then set that to be the significance field
+      $new_vals = array();
+      if(strcmp($score,'.')!=0){
+        $new_vals['significance'] = $score;
+      }
+      if(!$add_only and !tripal_core_chado_update('analysisfeature',$af_values,$new_vals)){
+         print "ERROR: could not update analysisfeature record: $analysis_id, $feature->feature_id\n";
+      } else {
+         print "   Updated analysisfeature record\n";
+      } 
    }
 
    return $feature;
@@ -946,9 +973,10 @@ function tripal_feature_load_gff3_featureloc($feature,$organism,$landmark,$fmin,
 
    // check to see if this featureloc already exists, but also keep track of the
    // last rank value
-   $rank = -1;  
+   $rank = 0;  
    $exists = 0;  
-   $featureloc_sql = "SELECT FL.featureloc_id,FL.fmin,FL.fmax,F.uniquename as srcname
+   $featureloc_sql = "SELECT FL.featureloc_id,FL.fmin,FL.fmax,F.uniquename as srcname,
+                         rank
                       FROM {featureloc} FL
                         INNER JOIN {feature} F on F.feature_id = FL.srcfeature_id
                       WHERE FL.feature_id = %d
@@ -962,7 +990,7 @@ function tripal_feature_load_gff3_featureloc($feature,$organism,$landmark,$fmin,
          print "   No change to featureloc\n";
          $exists = 1;
       }
-      $rank = $featureloc->rank;
+      $rank = $featureloc->rank + 1;
    }
    if(!$exists){
       $rank++;
@@ -1002,6 +1030,55 @@ function tripal_feature_load_gff3_featureloc($feature,$organism,$landmark,$fmin,
  *
  * @ingroup gff3_loader
  */
+function tripal_feature_load_gff3_property($feature,$property,$value){
+   // Stores $value as a featureprop of $feature, typed by the cvterm named
+   // $property in the 'feature_property' vocabulary.  The cvterm is created
+   // when it does not exist yet.  A value that is already stored for this
+   // feature and term is not added a second time; a new value is inserted
+   // at the rank following the last existing one.  Nothing is returned.
+
+   // first make sure the cvterm exists.  If the term already exists then
+   // use it; if not, then add it
+   $cvt_sql = "SELECT * FROM {cvterm} CVT
+               INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
+               WHERE CV.name = '%s' and CVT.name = '%s'";
+   $cvterm = db_fetch_object(db_query($cvt_sql,'feature_property',$property));
+   if(!$cvterm){
+      $term = array(
+         'id' => "null:$property",
+         'name' => $property,
+         'namespace' => 'feature_property',
+         'is_obsolete' => 0,
+      );
+      print "   Adding cvterm, $property\n";
+      // test the result BEFORE casting: an (object) cast of a failed (falsy)
+      // result still produces a truthy object, which would let the error
+      // check below pass without a usable cvterm_id
+      $new_term = tripal_cv_add_cvterm($term,'feature_property',0,0);
+      $cvterm = $new_term ? (object) $new_term : NULL;
+   }
+
+   if(!$cvterm){
+      print "ERROR: cannot add cvterm, $property\n";
+      exit;
+   }
+
+   // check to see if the property already exists for this feature
+   // if it does but the value is unique then increment the rank and add it. 
+   // if the value is not unique then don't add it.
+   $add = 1;
+   $rank = 0;
+   $sql = "SELECT rank,value FROM {featureprop} 
+           WHERE feature_id = %d and type_id = %d
+           ORDER BY rank ASC";
+   $result = db_query($sql,$feature->feature_id,$cvterm->cvterm_id);
+   while($prop = db_fetch_object($result)){
+      if(strcmp($prop->value,$value)==0){
+        $add = 0; // don't add it, it already exists
+        print "   Property already exists, skipping\n";
+      }
+      // rows arrive in ascending rank order, so after the loop $rank is
+      // one past the highest rank in use
+      $rank = $prop->rank + 1;
+   }
+   
+   // add the property if we pass the check above
+   if($add){
+      print "   Setting feature property. $property: $value\n";
+      $isql = "INSERT INTO {featureprop} (feature_id,type_id,value,rank)
+               VALUES (%d,%d,'%s',%d)";
+      // report a failed insert instead of silently dropping the property
+      if(!db_query($isql,$feature->feature_id,$cvterm->cvterm_id,$value,$rank)){
+         print "ERROR: could not add property '$property' to feature $feature->feature_id\n";
+      }
+   }
+}
+/*
 function tripal_feature_load_gff3_property($feature,$property,$value){
    // first make sure the cvterm exists.  If the term already exists then
    // the function should return it
@@ -1031,4 +1108,5 @@ function tripal_feature_load_gff3_property($feature,$property,$value){
    // next give the feature the property
    tripal_core_insert_property('feature',$feature->feature_id,$property,'feature_property',$value,1);
 }
+*/