ソースを参照

Fixed a few bugs in the GFF loader, sped up insertions and added the 'score' column of the GFF file to the 'significance' column of the featureanalysis

spficklin 14 年 前
コミット
72355c4080
1 ファイル変更105 行追加27 行削除
  1. 105 27
      tripal_feature/gff_loader.php

+ 105 - 27
tripal_feature/gff_loader.php

@@ -224,8 +224,13 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
    $previous_db = tripal_db_set_active('chado');
    print "Opening $gff_file\n";
     
-   $lines = file($dfile,FILE_SKIP_EMPTY_LINES);
-   $i = 0;
+   //$lines = file($dfile,FILE_SKIP_EMPTY_LINES);
+   $fh = fopen($dfile,'r');
+   if(!$fh){
+      print "ERROR: cannot open file: $dfile\n";
+      return 0;
+   }
+   $filesize = filesize($dfile);
 
    // get the controlled vocabulary that we'll be using.  The
    // default is the 'sequence' ontology
@@ -240,19 +245,23 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
    $sql = "SELECT * FROM organism WHERE organism_id = %d";
    $organism = db_fetch_object(db_query($sql,$organism_id));
 
-
-   $num_lines = sizeof($lines);
-   $interval = intval($num_lines * 0.01);
+   $interval = intval($filesize * 0.01);
    if($interval == 0){
-      $interval = $num_lines;
+      $interval = 1;
    }
    $in_fasta = 0;
-   foreach ($lines as $line_num => $line) {
-   
-      $i++;  // update the line count
+//   foreach ($lines as $line_num => $line) {
+   $line_num = 0;
+   $num_read = 0;
+
+   while($line = fgets($fh)){
+
+      $line_num++;
+      $num_read += strlen($line);
+
       // update the job status every 1% features
-      if($job and $i % $interval == 0){
-         tripal_job_set_progress($job,intval(($i/$num_lines)*100));
+      if($job and $num_read % $interval == 0){
+         tripal_job_set_progress($job,intval(($num_read/$filesize)*100));
       }
       // check to see if we have FASTA section, if so then set the variable
       // to start parsing
@@ -277,7 +286,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
       // remove URL encoding and get the columns
       $cols = explode("\t",$line);
       if(sizeof($cols) != 9){
-         print "ERROR: improper number of columns on line $i\n";
+         print "ERROR: improper number of columns on line $line_num\n";
          print_r($cols);
          return '';
       }
@@ -323,10 +332,10 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
                     WHERE CV.cv_id = %d and (CVT.name = '%s' or CVTS.synonym = '%s')";
       $cvterm = db_fetch_object(db_query($cvtermsql,$cv->cv_id,$type,$type));
       if(!$cvterm){
-         print "ERROR: cannot find ontology term '$type' on line $i.\n";
+         print "ERROR: cannot find ontology term '$type' on line $line_num.\n";
          return '';
       }
-      
+
       // break apart each of the attributes
       $tags = array();
       $attr_name = '';
@@ -346,7 +355,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
             continue;
          }
          if(!preg_match('/^[^\=]+\=[^\=]+$/',$attr)){
-            print "ERROR: attribute is not correctly formatted on line $i: $attr\n";
+            print "ERROR: attribute is not correctly formatted on line $line_num: $attr\n";
             return '';
          }
 
@@ -375,7 +384,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
          if(array_key_exists('Parent',$tags)){
             $attr_uniquename = $tags['Parent'][0]."-$type-$landmark:$fmin..$fmax";
          } else { 
-           print "ERROR: cannot generate a uniquename for feature on line $i\n";
+           print "ERROR: cannot generate a uniquename for feature on line $line_num\n";
            exit;
          }
          $attr_name = $attr_uniquename;
@@ -415,7 +424,6 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
             return '';
          }
       }
-
       
       // if the option is to remove or refresh then we want to remove
       // the feature from the database.
@@ -435,10 +443,10 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
     
 
          // add/update the feature
-         print "$i ";
+         print "line $line_num, ". intval(($num_read/$filesize)*100). "%. ";
          $feature = tripal_feature_load_gff3_feature($organism,$analysis_id,$cvterm,
             $attr_uniquename,$attr_name,$residues,$attr_is_analysis,
-            $attr_is_obsolete, $add_only);
+            $attr_is_obsolete, $add_only,$score);
 
          // store all of the features for use later by parent and target
          // relationships
@@ -477,12 +485,11 @@ function tripal_feature_load_gff3($gff_file, $organism_id,$analysis_id,$add_only
             // add any additional attributes
             if($attr_others){
                foreach($attr_others as $property => $value){
-                  print "   Setting feature property: $property -> $value\n";
                   tripal_feature_load_gff3_property($feature,$property,$value);
                }
             }
          }
-      }      
+      } 
    }
    // now set the rank of any parent/child relationships.  The order is based
    // on the fmin.  The start rank is 1.  This allows features with other
@@ -858,7 +865,7 @@ function tripal_feature_load_gff3_alias($feature,$aliases){
  * @ingroup gff3_loader
  */
 function tripal_feature_load_gff3_feature($organism,$analysis_id,$cvterm,$uniquename,$name,
-   $residues,$is_analysis='f',$is_obsolete='f',$add_only)  {
+   $residues,$is_analysis='f',$is_obsolete='f',$add_only,$score)  {
 
    // check to see if the feature already exists
    $feature_sql = "SELECT * FROM {feature} 
@@ -905,17 +912,37 @@ function tripal_feature_load_gff3_feature($organism,$analysis_id,$cvterm,$unique
       print "Skipping existing feature: '$uniquename' ($cvterm->name).\n";
       return 0;
    }
+
    // get the newly added feature
    $feature = db_fetch_object(db_query($feature_sql,$organism->organism_id,$uniquename,$cvterm->cvterm_id));
 
    // add the analysisfeature entry to the analysisfeature table if it doesn't already exist
-   $af_values = array('analysis_id' => $analysis_id, 'feature_id' => $feature->feature_id);
-   if(tripal_core_chado_select('analysisfeature',array('analysisfeature_id'),$af_values,array('has_record'))){
+   $af_values = array(
+      'analysis_id' => $analysis_id, 
+      'feature_id' => $feature->feature_id
+   );
+   $afeature = tripal_core_chado_select('analysisfeature',array('analysisfeature_id'),$af_values,array('has_record'));
+   if(count($afeature)==0){
+      // if a score is available then set that to be the significance field
+      if(strcmp($score,'.')!=0){
+        $af_values['significance'] = $score;
+      }
       if(!tripal_core_chado_insert('analysisfeature',$af_values)){
          print "ERROR: could not add analysisfeature record: $analysis_id, $feature->feature_id\n";
       } else {
          print "   Added analysisfeature record\n";
       }
+   } else {
+      // if a score is available then set that to be the significance field
+      $new_vals = array();
+      if(strcmp($score,'.')!=0){
+        $new_vals['significance'] = $score;
+      }
+      if(!$add_only and !tripal_core_chado_update('analysisfeature',$af_values,$new_vals)){
+         print "ERROR: could not update analysisfeature record: $analysis_id, $feature->feature_id\n";
+      } else {
+         print "   Updated analysisfeature record\n";
+      } 
    }
 
    return $feature;
@@ -946,9 +973,10 @@ function tripal_feature_load_gff3_featureloc($feature,$organism,$landmark,$fmin,
 
    // check to see if this featureloc already exists, but also keep track of the
    // last rank value
-   $rank = -1;  
+   $rank = 0;  
    $exists = 0;  
-   $featureloc_sql = "SELECT FL.featureloc_id,FL.fmin,FL.fmax,F.uniquename as srcname
+   $featureloc_sql = "SELECT FL.featureloc_id,FL.fmin,FL.fmax,F.uniquename as srcname,
+                         rank
                       FROM {featureloc} FL
                         INNER JOIN {feature} F on F.feature_id = FL.srcfeature_id
                       WHERE FL.feature_id = %d
@@ -962,7 +990,7 @@ function tripal_feature_load_gff3_featureloc($feature,$organism,$landmark,$fmin,
          print "   No change to featureloc\n";
          $exists = 1;
       }
-      $rank = $featureloc->rank;
+      $rank = $featureloc->rank + 1;
    }
    if(!$exists){
       $rank++;
@@ -1002,6 +1030,55 @@ function tripal_feature_load_gff3_featureloc($feature,$organism,$landmark,$fmin,
  *
  * @ingroup gff3_loader
  */
+function tripal_feature_load_gff3_property($feature,$property,$value){
+   // Attaches the property $property with value $value to $feature by
+   // inserting a row into the featureprop table.
+   //   $feature  - object whose ->feature_id identifies the target feature
+   //   $property - name of the cvterm (in the 'feature_property' cv) that
+   //               types the property
+   //   $value    - the property value to store
+   // Nothing is returned.  The script exits if the cvterm can neither be
+   // found nor added.
+
+   // first make sure the cvterm exists.  If the term already exists then
+   // use it; if not, then add it
+   $cvt_sql = "SELECT * FROM {cvterm} CVT
+               INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
+               WHERE CV.name = '%s' and CVT.name = '%s'";
+   $cvterm = db_fetch_object(db_query($cvt_sql,'feature_property',$property));
+   if(!$cvterm){
+      $term = array(
+         'id' => "null:$property",
+         'name' => $property,
+         'namespace' => 'feature_property', 
+         'is_obsolete' => 0,
+      );
+      print "   Adding cvterm, $property\n";
+      // Only cast a successful result.  Casting a failure value such as
+      // 0, FALSE or NULL with (object) produces a stdClass instance, which
+      // is always truthy and would make the error check below unreachable.
+      $added = tripal_cv_add_cvterm($term,'feature_property',0,0);
+      $cvterm = $added ? (object) $added : NULL;
+   }
+
+   if(!$cvterm){
+      print "ERROR: cannot add cvterm, $property\n";
+      exit;
+   }
+
+   // check to see if the property already exists for this feature
+   // if it does but the value is unique then increment the rank and add it. 
+   // if the value is not unique then don't add it.
+   $add = 1;
+   $rank = 0;
+   $sql = "SELECT rank,value FROM {featureprop} 
+           WHERE feature_id = %d and type_id = %d
+           ORDER BY rank ASC";
+   $result = db_query($sql,$feature->feature_id,$cvterm->cvterm_id);
+   while($prop = db_fetch_object($result)){
+      if(strcmp($prop->value,$value)==0){
+        $add = NULL; // don't add it, it already exists
+        print "   Property already exists, skipping\n";
+      }
+      // rows arrive in ascending rank order, so after the loop $rank is one
+      // past the highest rank already stored for this feature/type pair
+      $rank = $prop->rank + 1;
+   }
+   
+   // add the property if we pass the check above
+   if($add){
+      print "   Setting feature property. $property: $value\n";
+      $isql = "INSERT INTO {featureprop} (feature_id,type_id,value,rank)
+               VALUES (%d,%d,'%s',%d)";
+      db_query($isql,$feature->feature_id,$cvterm->cvterm_id,$value,$rank);
+   }
+}
+/*
 function tripal_feature_load_gff3_property($feature,$property,$value){
    // first make sure the cvterm exists.  If the term already exists then
    // the function should return it
@@ -1031,4 +1108,5 @@ function tripal_feature_load_gff3_property($feature,$property,$value){
    // next give the feature the property
    tripal_core_insert_property('feature',$feature->feature_id,$property,'feature_property',$value,1);
 }
+*/