Преглед изворни кода

Added support for 'organism=genus:species' attribute for importing GFF files

spficklin пре 11 година
родитељ
комит
1156b83efb
2 измењених фајлова са 81 додато и 16 уклоњено
  1. 80 16
      tripal_feature/includes/gff_loader.inc
  2. 1 0
      tripal_feature/tripal_feature.module

+ 80 - 16
tripal_feature/includes/gff_loader.inc

@@ -137,6 +137,16 @@ function tripal_feature_gff3_load_form() {
     '#description' => t('Features present in the GFF file that exist in the database
                          will be removed rather than imported'),
   );
+  $form['import_options']['create_organism']= array(
+    '#type' => 'checkbox',
+    '#title' => t('Create organism'),
+    '#required' => FALSE,
+    '#description' => t('The Tripal GFF loader supports the "organism" attribute. This allows features of a 
+       different organism to be aligned to the landmark sequence of another species.  The format of the 
+       attribute is "organism=[genus]:[species]", where [genus] is the organism\'s genus and [species] is the
+       species name. Check this box to automatically add the organism to the database if it does not already exists.
+       Otherwise lines with an oraganism attribute where the organism is not present in the database will be skipped.'),
+  );
 
   $form['targets'] = array(
     '#type' => 'fieldset',
@@ -204,6 +214,7 @@ function tripal_feature_gff3_load_form_validate($form, &$form_state) {
   $target_organism_id = $form_state['values']['target_organism_id'];
   $target_type = trim($form_state['values']['target_type']);
   $create_target = $form_state['values']['create_target'];
+  $create_organism = $form_state['values']['create_organism'];
   $add_only = $form_state['values']['add_only'];
   $update   = $form_state['values']['update'];
   $refresh  = $form_state['values']['refresh'];
@@ -261,10 +272,13 @@ function tripal_feature_gff3_load_form_submit($form, &$form_state) {
   $line_number   = trim($form_state['values']['line_number']);
   $landmark_type   = trim($form_state['values']['landmark_type']);
   $alt_id_attr   = trim($form_state['values']['alt_id_attr']);
+  $create_organism = $form_state['values']['create_organism'];
+  
     
   $args = array($gff_file, $organism_id, $analysis_id, $add_only, 
     $update, $refresh, $remove, $use_transaction, $target_organism_id, 
-    $target_type, $create_target, $line_number, $landmark_type, $alt_id_attr);
+    $target_type, $create_target, $line_number, $landmark_type, $alt_id_attr, 
+    $create_organism);
     
   $type = '';
   if ($add_only) {
@@ -292,9 +306,10 @@ function tripal_feature_gff3_load_form_submit($form, &$form_state) {
  * @ingroup gff3_loader
  */
 function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id, 
-  $add_only =0, $update = 0, $refresh = 0, $remove = 0, $use_transaction = 1, 
+  $add_only = 0, $update = 0, $refresh = 0, $remove = 0, $use_transaction = 1, 
   $target_organism_id = NULL, $target_type = NULL,  $create_target = 0, 
-  $start_line = 1, $landmark_type = '', $alt_id_attr = '', $job = NULL) {     
+  $start_line = 1, $landmark_type = '', $alt_id_attr = '', $create_organism = FALSE, 
+  $job = NULL) {     
 
   // make sure our temporary table exists
   $ret = array(); 
@@ -517,8 +532,14 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
     $attr_fmax_partial = 'f';
     $attr_is_obsolete = 'f';
     $attr_is_analysis = 'f';
-    $attr_others = '';
+    $attr_others = '';       
     $residues = '';
+    
+    // the organism to which a feature belongs can be set in the GFF
+    // file using the 'organism' attribute.  By default we 
+    // set the $feature_organism variable to the default organism for the landmark
+    $attr_organism = ''; 
+    $feature_organism = $organism;
 
     foreach ($attrs as $attr) {
       $attr = rtrim($attr);
@@ -549,24 +570,67 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
       }
       
       // get the name and ID tags
+      $skip_feature = 0;  // if there is a problem with any of the attributes this variable gets set
       if (strcmp($tag_name, 'ID') == 0) {
         $attr_uniquename =  urldecode($tag[1]);
       }
       elseif (strcmp($tag_name, 'Name') == 0) {
         $attr_name =  urldecode($tag[1]);
       }
+      elseif (strcmp($tag_name, 'organism') == 0) {
+        $attr_organism = urldecode($tag[1]);
+        $org_matches = array();
+        if (preg_match('/^(.*?):(.*?)$/', $attr_organism, $org_matches)) {
+          $values = array(
+            'genus' => $org_matches[1],
+            'species' => $org_matches[2],
+          ); 
+          $options = array('statement_name' => 'sel_organism_gesp');
+          $org = tripal_core_chado_select('organism', array("*"), $values, $options);
+          if (count($org) == 0) {
+            if ($create_organism) {
+              $options = array('statement_name' => 'ins_organism_gesp');
+              $feature_organism = (object) tripal_core_chado_insert('organism', $values, $options);
+              if (!$feature_organism) {
+                watchdog('T_gff3_loader', "Could not add the organism, '%org', from line %line. Skipping this line. ",
+                  array('%org' => $attr_organism, '%line' => $line_num), WATCHDOG_ERROR); 
+                $skip_feature = 1; 
+              }                
+            } 
+            else {
+              watchdog('T_gff3_loader', "The organism attribute '%org' on line %line does not exist. Skipping this line. ",
+                array('%org' => $attr_organism, '%line' => $line_num), WATCHDOG_ERROR); 
+              $skip_feature = 1;
+            }             
+          }
+          else {
+            // we found the organism in the database so use it
+            $feature_organism = $org[0];
+          }
+        } 
+        else {
+          watchdog('T_gff3_loader', "The organism attribute '%org' on line %line is not properly formated. It ".
+            "should be of the form: organism=Genus:species.  Skipping this line.", 
+            array('%org' => $attr_organism, '%line' => $line_num), WATCHDOG_ERROR);
+          $skip_feature = 1;  
+        }        
+      }
       // get the list of non-reserved attributes
-      elseif (strcmp($tag_name, 'Alias') !=0        and strcmp($tag_name, 'Parent') !=0 and
-              strcmp($tag_name, 'Target') !=0       and strcmp($tag_name, 'Gap') !=0 and
-              strcmp($tag_name, 'Derives_from') !=0 and strcmp($tag_name, 'Note') !=0 and
-              strcmp($tag_name, 'Dbxref') !=0       and strcmp($tag_name, 'Ontology_term') !=0 and
-              strcmp($tag_name, 'Is_circular') !=0  and strcmp($tag_name, 'target_organism') !=0 and
-              strcmp($tag_name, 'target_type') != 0) {
+      elseif (strcmp($tag_name, 'Alias') != 0        and strcmp($tag_name, 'Parent') != 0 and
+              strcmp($tag_name, 'Target') != 0       and strcmp($tag_name, 'Gap') != 0 and
+              strcmp($tag_name, 'Derives_from') != 0 and strcmp($tag_name, 'Note') != 0 and
+              strcmp($tag_name, 'Dbxref') != 0       and strcmp($tag_name, 'Ontology_term') != 0 and
+              strcmp($tag_name, 'Is_circular') != 0  and strcmp($tag_name, 'target_organism') != 0 and
+              strcmp($tag_name, 'target_type') != 0  and strcmp($tag_name, 'organism' != 0)) {
         foreach ($tags[$tag_name] as $value) {
           $attr_others[$tag_name][] = $value;
         }
       }
     }
+    
+    if ($skip_line) {
+      continue;
+    }
 
     // if neither name nor uniquename are provided then generate one
     if (!$attr_uniquename and !$attr_name) {
@@ -611,7 +675,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
     // different.
     if (!$remove and !(strcmp($landmark, $attr_uniquename) == 0 or strcmp($landmark, $attr_name) == 0)) {
       $select = array(
-        'organism_id' => $organism_id,
+        'organism_id' => $organism->organism_id,
         'uniquename'  => $landmark,
       );      
       $columns = array('count(*) as num_landmarks');
@@ -626,7 +690,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
       if (!$count or count($count) == 0 or $count[0]->num_landmarks == 0) {        
         // now look for the landmark using the name rather than uniquename.
         $select = array(
-          'organism_id' => $organism_id,
+          'organism_id' => $organism->organism_id,
           'name'  => $landmark,
         );
         $columns = array('count(*) as num_landmarks');
@@ -664,7 +728,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
       $sql = "DELETE FROM {feature}
               WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
       $match = array(
-         'organism_id' => $organism->organism_id,
+         'organism_id' => $feature_organism->organism_id,
          'uniquename'  => $attr_uniquename,
          'type_id'     => $cvterm->cvterm_id
       );
@@ -681,7 +745,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
     if ($update or $refresh or $add_only) {
 
       // add/update the feature
-      $feature = tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm,
+      $feature = tripal_feature_load_gff3_feature($feature_organism, $analysis_id, $cvterm,
         $attr_uniquename, $attr_name, $residues, $attr_is_analysis,
         $attr_is_obsolete, $add_only, $score);  
    
@@ -729,7 +793,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
         }       
         // add parent relationships
         if (array_key_exists('Parent', $tags)) {
-          tripal_feature_load_gff3_parents($feature, $cvterm, $tags['Parent'], $organism_id, $fmin);
+          tripal_feature_load_gff3_parents($feature, $cvterm, $tags['Parent'], $feature_organism->organism_id, $fmin);
         }               
         // add target relationships
         if (array_key_exists('Target', $tags)) {
@@ -749,7 +813,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
         }
         // add the Derives_from relationship (e.g. polycistronic genes).
         if (array_key_exists('Derives_from', $tags)) {
-          tripal_feature_load_gff3_derives_from($feature, $tags['Derives_from'][0], $organism);
+          tripal_feature_load_gff3_derives_from($feature, $tags['Derives_from'][0], $feature_organism);
         }
         // add in the GFF3_source dbxref so that GBrowse can find the feature using the source column
         $source_ref = array('GFF_source:' . $source);

+ 1 - 0
tripal_feature/tripal_feature.module

@@ -2237,6 +2237,7 @@ function tripal_feature_job_describe_args($callback, $args) {
     $new_args['Starting line'] = $args[11];
     $new_args['Landmark Type'] = $args[12];
     $new_args['Alternate ID attribute'] = $args[13];
+    $new_args['Create Organism'] = ($args[14] == 1) ? "Yes" : "No";
   }
   if ($callback == 'tripal_feature_sync_features') {
     if ($args[0]) {