Browse Source

Updated GFF loader with fixes from Tv2.x

Stephen Ficklin 8 years ago
parent
commit
bf33279705
1 changed files with 117 additions and 26 deletions
  1. 117 26
      tripal_chado/includes/loaders/tripal_chado.gff_loader.inc

+ 117 - 26
tripal_chado/includes/loaders/tripal_chado.gff_loader.inc

@@ -92,13 +92,57 @@ function tripal_feature_gff3_load_form() {
       you may specify the name of the attribute to use for the name."),
       you may specify the name of the attribute to use for the name."),
   );
   );
 
 
-  $form['import_options'] = array(
+  // Advanced Options
+  $form['advanced'] = array(
+    '#type' => 'fieldset',
+    '#title' => t('Advanced Options'),
+    '#collapsible' => TRUE,
+    '#collapsed' => TRUE,
+  );
+
+  $form['advanced']['protein_names'] = array(
+    '#type' => 'fieldset',
+    '#title' => t('Protein Names'),
+    '#collapsible' => TRUE,
+    '#collapsed' => FALSE,
+    '#weight' => 5,
+  );
+
+  $form['advanced']['protein_names']['re_help'] = array(
+    '#type' => 'item',
+    '#markup' => t('A regular expression is an advanced method for extracting information from a string of text.
+                   If your GFF3 file does not contain polypeptide (or protein) features, but contains CDS features, proteins will be automatically created.
+                   By default the loader will give each protein a name based on the name of the corresponding mRNA followed by the "-protein" suffix.
+                   If you want to customize the name of the created protein, you can use the following regex.')
+  );
+  $form['advanced']['protein_names']['re_mrna'] = array(
+    '#type' => 'textfield',
+    '#title' => t('Regular expression for the mRNA name'),
+    '#required' => FALSE,
+    '#description' => t('Enter the regular expression that will extract portions of
+       the mRNA unique name. For example, for a
+       mRNA with a unique name finishing by -RX (e.g. SPECIES0000001-RA),
+       the regular expression would be, "^(.*?)-R([A-Z]+)$".')
+  );
+  $form['advanced']['protein_names']['re_protein'] = array(
+    '#type' => 'textfield',
+    '#title' => t('Replacement string for the protein name'),
+    '#required' => FALSE,
+    '#description' => t('Enter the replacement string that will be used to create
+       the protein name based on the mRNA regular expression. For example, for a
+       mRNA regular expression "^(.*?)-R([A-Z]+)$", the corresponding protein regular
+       expression would be "$1-P$2".')
+  );
+
+  $form['advanced']['import_options'] = array(
     '#type' => 'fieldset',
     '#type' => 'fieldset',
     '#title' => t('Import Options'),
     '#title' => t('Import Options'),
-    '#collapsed' => TRUE
+    '#collapsible' => TRUE,
+    '#collapsed' => FALSE,
+    '#weight' => 0,
   );
   );
 
 
-  $form['import_options']['use_transaction']= array(
+  $form['advanced']['import_options']['use_transaction']= array(
     '#type' => 'checkbox',
     '#type' => 'checkbox',
     '#title' => t('Use a transaction'),
     '#title' => t('Use a transaction'),
     '#required' => FALSE,
     '#required' => FALSE,
@@ -108,14 +152,14 @@ function tripal_feature_gff3_load_form() {
       of failure will be present in the database.'),
       of failure will be present in the database.'),
     '#default_value' => 1,
     '#default_value' => 1,
   );
   );
-  $form['import_options']['add_only']= array(
+  $form['advanced']['import_options']['add_only']= array(
     '#type' => 'checkbox',
     '#type' => 'checkbox',
     '#title' => t('Import only new features'),
     '#title' => t('Import only new features'),
     '#required' => FALSE,
     '#required' => FALSE,
     '#description' => t('The job will skip features in the GFF file that already
     '#description' => t('The job will skip features in the GFF file that already
                          exist in the database and import only new features.'),
                          exist in the database and import only new features.'),
   );
   );
-  $form['import_options']['update']= array(
+  $form['advanced']['import_options']['update']= array(
     '#type' => 'checkbox',
     '#type' => 'checkbox',
     '#title' => t('Import all and update'),
     '#title' => t('Import all and update'),
     '#required' => FALSE,
     '#required' => FALSE,
@@ -145,7 +189,7 @@ function tripal_feature_gff3_load_form() {
 //     '#description' => t('Features present in the GFF file that exist in the database
 //     '#description' => t('Features present in the GFF file that exist in the database
 //                          will be removed rather than imported'),
 //                          will be removed rather than imported'),
 //   );
 //   );
-  $form['import_options']['create_organism']= array(
+  $form['advanced']['import_options']['create_organism']= array(
     '#type' => 'checkbox',
     '#type' => 'checkbox',
     '#title' => t('Create organism'),
     '#title' => t('Create organism'),
     '#required' => FALSE,
     '#required' => FALSE,
@@ -156,12 +200,14 @@ function tripal_feature_gff3_load_form() {
        Otherwise lines with an oraganism attribute where the organism is not present in the database will be skipped.'),
        Otherwise lines with an oraganism attribute where the organism is not present in the database will be skipped.'),
   );
   );
 
 
-  $form['targets'] = array(
+  $form['advanced']['targets'] = array(
     '#type' => 'fieldset',
     '#type' => 'fieldset',
     '#title' => t('Targets'),
     '#title' => t('Targets'),
-    '#collapsed' => TRUE
+    '#collapsible' => TRUE,
+    '#collapsed' => FALSE,
+    '#weight' => 1,
   );
   );
-  $form['targets']['adesc'] = array(
+  $form['advanced']['targets']['adesc'] = array(
     '#markup' => t("When alignments are represented in the GFF file (e.g. such as
     '#markup' => t("When alignments are represented in the GFF file (e.g. such as
        alignments of cDNA sequences to a whole genome, or blast matches), they are
        alignments of cDNA sequences to a whole genome, or blast matches), they are
        represented using two feature types: 'match' (or cDNA_match, EST_match, etc.)
        represented using two feature types: 'match' (or cDNA_match, EST_match, etc.)
@@ -172,7 +218,7 @@ function tripal_feature_gff3_load_form() {
        The options here will apply to all targets unless the organism and type are explicity
        The options here will apply to all targets unless the organism and type are explicity
        set in the GFF file using the 'target_organism' and 'target_type' attributes."),
        set in the GFF file using the 'target_organism' and 'target_type' attributes."),
   );
   );
-  $form['targets']['target_organism_id'] = array(
+  $form['advanced']['targets']['target_organism_id'] = array(
     '#title'       => t('Target Organism'),
     '#title'       => t('Target Organism'),
     '#type'        => t('select'),
     '#type'        => t('select'),
     '#description' => t("Optional. Choose the organism to which target sequences belong.
     '#description' => t("Optional. Choose the organism to which target sequences belong.
@@ -183,7 +229,7 @@ function tripal_feature_gff3_load_form() {
       attribute in the GFF file."),
       attribute in the GFF file."),
     '#options'     => $organisms,
     '#options'     => $organisms,
   );
   );
-  $form['targets']['target_type'] = array(
+  $form['advanced']['targets']['target_type'] = array(
     '#title'       => t('Target Type'),
     '#title'       => t('Target Type'),
     '#type'        => t('textfield'),
     '#type'        => t('textfield'),
     '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
     '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
@@ -191,7 +237,7 @@ function tripal_feature_gff3_load_form() {
        the targets are of different types then the type must be specified using the 'target_type=type' attribute
        the targets are of different types then the type must be specified using the 'target_type=type' attribute
        in the GFF file. This must be a valid Sequence Ontology (SO) term."),
        in the GFF file. This must be a valid Sequence Ontology (SO) term."),
   );
   );
-  $form['targets']['create_target']= array(
+  $form['advanced']['targets']['create_target']= array(
     '#type' => 'checkbox',
     '#type' => 'checkbox',
     '#title' => t('Create Target'),
     '#title' => t('Create Target'),
     '#required' => FALSE,
     '#required' => FALSE,
@@ -230,6 +276,8 @@ function tripal_feature_gff3_load_form_validate($form, &$form_state) {
   $line_number   = trim($form_state['values']['line_number']);
   $line_number   = trim($form_state['values']['line_number']);
   $landmark_type   = trim($form_state['values']['landmark_type']);
   $landmark_type   = trim($form_state['values']['landmark_type']);
   $alt_id_attr   = trim($form_state['values']['alt_id_attr']);
   $alt_id_attr   = trim($form_state['values']['alt_id_attr']);
+  $re_mrna = trim($form_state['values']['re_mrna']);
+  $re_protein = trim($form_state['values']['re_protein']);
 
 
 
 
 
 
@@ -256,6 +304,21 @@ function tripal_feature_gff3_load_form_validate($form, &$form_state) {
   if ($line_number and !is_numeric($line_number) or $line_number < 0) {
   if ($line_number and !is_numeric($line_number) or $line_number < 0) {
     form_set_error('line_number', t("Please provide an integer line number greater than zero."));
     form_set_error('line_number', t("Please provide an integer line number greater than zero."));
   }
   }
+
+  if (!($re_mrna and $re_protein) and ($re_mrna or $re_protein)) {
+    form_set_error('re_uname', t("You must provide both a regular expression for mRNA and a replacement string for protein"));
+  }
+
+  // check the regular expression to make sure it is valid
+  set_error_handler(function() {}, E_WARNING);
+  $result_re = preg_match("/" . $re_mrna . "/", null);
+  $result = preg_replace("/" . $re_mrna . "/", $re_protein, null);
+  restore_error_handler();
+  if ($result_re === FALSE) {
+    form_set_error('re_mrna', 'Invalid regular expression.');
+  } else if ($result === FALSE) {
+    form_set_error('re_protein', 'Invalid replacement string.');
+  }
 }
 }
 
 
 /**
 /**
@@ -281,12 +344,14 @@ function tripal_feature_gff3_load_form_submit($form, &$form_state) {
   $landmark_type   = trim($form_state['values']['landmark_type']);
   $landmark_type   = trim($form_state['values']['landmark_type']);
   $alt_id_attr   = trim($form_state['values']['alt_id_attr']);
   $alt_id_attr   = trim($form_state['values']['alt_id_attr']);
   $create_organism = $form_state['values']['create_organism'];
   $create_organism = $form_state['values']['create_organism'];
+  $re_mrna = trim($form_state['values']['re_mrna']);
+  $re_protein = trim($form_state['values']['re_protein']);
 
 
 
 
   $args = array($gff_file, $organism_id, $analysis_id, $add_only,
   $args = array($gff_file, $organism_id, $analysis_id, $add_only,
     $update, $refresh, $remove, $use_transaction, $target_organism_id,
     $update, $refresh, $remove, $use_transaction, $target_organism_id,
     $target_type, $create_target, $line_number, $landmark_type, $alt_id_attr,
     $target_type, $create_target, $line_number, $landmark_type, $alt_id_attr,
-    $create_organism);
+    $create_organism, $re_mrna, $re_protein);
 
 
   $type = '';
   $type = '';
   if ($add_only) {
   if ($add_only) {
@@ -378,6 +443,10 @@ function tripal_feature_gff3_load_form_submit($form, &$form_state) {
  *   Check this box to automatically add the organism to the database if it does
  *   Check this box to automatically add the organism to the database if it does
  *   not already exists. Otherwise lines with an oraganism attribute where the
  *   not already exists. Otherwise lines with an oraganism attribute where the
  *   organism is not present in the database will be skipped.
  *   organism is not present in the database will be skipped.
+ * @param $re_mrna
+ *   A regular expression used to extract portions of the mRNA unique name.
+ * @param $re_protein
+ *   A replacement string used to generate the protein unique name from the
+ *   portions matched by $re_mrna.
  * @param $job
  * @param $job
  *  The tripal job_id.  Only used by the Tripal Jobs subsystem.
  *  The tripal job_id.  Only used by the Tripal Jobs subsystem.
  *
  *
@@ -386,8 +455,8 @@ function tripal_feature_gff3_load_form_submit($form, &$form_state) {
 function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
 function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
   $add_only = 0, $update = 1, $refresh = 0, $remove = 0, $use_transaction = 1,
   $add_only = 0, $update = 1, $refresh = 0, $remove = 0, $use_transaction = 1,
   $target_organism_id = NULL, $target_type = NULL,  $create_target = 0,
   $target_organism_id = NULL, $target_type = NULL,  $create_target = 0,
-  $start_line = 1, $landmark_type = '', $alt_id_attr = '', $create_organism = FALSE,
-  $job = NULL) {
+  $start_line = 1, $landmark_type = '', $alt_id_attr = '',  $create_organism = FALSE,
+  $re_mrna = '', $re_protein = '', $job = NULL) {
 
 
   $ret = array();
   $ret = array();
   $date = getdate();
   $date = getdate();
@@ -396,6 +465,10 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
   // to do the database query every time.
   // to do the database query every time.
   $cvterm_lookup = array();
   $cvterm_lookup = array();
 
 
+  // An array that stores Landmarks that have been looked up so we don't have
+  // to do the database query every time.
+  $landmark_lookup = array();
+
   // empty the temp tables
   // empty the temp tables
   $sql = "DELETE FROM {tripal_gff_temp}";
   $sql = "DELETE FROM {tripal_gff_temp}";
   chado_query($sql);
   chado_query($sql);
@@ -606,7 +679,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
       $attr_fmax_partial = 'f';
       $attr_fmax_partial = 'f';
       $attr_is_obsolete = 'f';
       $attr_is_obsolete = 'f';
       $attr_is_analysis = 'f';
       $attr_is_analysis = 'f';
-      $attr_others = '';
+      $attr_others = [];
       $residues = '';
       $residues = '';
 
 
       // the organism to which a feature belongs can be set in the GFF
       // the organism to which a feature belongs can be set in the GFF
@@ -737,9 +810,10 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
       // Make sure the landmark sequence exists in the database.  If the user
       // Make sure the landmark sequence exists in the database.  If the user
       // has not specified a landmark type (and it's not required in the GFF
       // has not specified a landmark type (and it's not required in the GFF
       // format) then we don't know the type of the landmark so we'll hope
       // format) then we don't know the type of the landmark so we'll hope
-      // that it's unique across all types for the orgnaism. Only do this
+      // that it's unique across all types for the organism. Only do this
       // test if the landmark and the feature are different.
       // test if the landmark and the feature are different.
-      if (!$remove and !(strcmp($landmark, $attr_uniquename) == 0 or strcmp($landmark, $attr_name) == 0)) {
+      if (!$remove and !(strcmp($landmark, $attr_uniquename) == 0 or strcmp($landmark, $attr_name) == 0) and !in_array($landmark, $landmark_lookup)) {
+
         $select = array(
         $select = array(
           'organism_id' => $organism->organism_id,
           'organism_id' => $organism->organism_id,
           'uniquename'  => $landmark,
           'uniquename'  => $landmark,
@@ -782,6 +856,9 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
                 "The features cannot be associated", array('%landmark' => $landmark));
                 "The features cannot be associated", array('%landmark' => $landmark));
           return '';
           return '';
         }
         }
+
+        // The landmark was found, remember it
+        $landmark_lookup[] = $landmark;
       }
       }
 /*
 /*
       // If the option is to remove or refresh then we want to remove
       // If the option is to remove or refresh then we want to remove
@@ -822,7 +899,7 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
             'type_name' => $type,
             'type_name' => $type,
             'uniquename' => $feature->uniquename
             'uniquename' => $feature->uniquename
           );
           );
-          // make sure this record doesn't already exist in oru temp table
+          // make sure this record doesn't already exist in our temp table
           $results = chado_select_record('tripal_gff_temp', array('*'), $values);
           $results = chado_select_record('tripal_gff_temp', array('*'), $values);
 
 
           if (count($results) == 0) {
           if (count($results) == 0) {
@@ -908,13 +985,15 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
             CVT.cvterm_id, CVT.name as feature_type,
             CVT.cvterm_id, CVT.name as feature_type,
             min(TGCT.fmin) as fmin, max(TGCT.fmax) as fmax,
             min(TGCT.fmin) as fmin, max(TGCT.fmax) as fmax,
             TGPT.feature_id as protein_id, TGPT.fmin as protein_fmin,
             TGPT.feature_id as protein_id, TGPT.fmin as protein_fmin,
-            TGPT.fmax as protein_fmax
+            TGPT.fmax as protein_fmax, FLM.uniquename as landmark
           FROM {tripal_gffcds_temp} TGCT
           FROM {tripal_gffcds_temp} TGCT
             INNER JOIN {feature} F on F.feature_id = TGCT.parent_id
             INNER JOIN {feature} F on F.feature_id = TGCT.parent_id
             INNER JOIN {cvterm} CVT on CVT.cvterm_id = F.type_id
             INNER JOIN {cvterm} CVT on CVT.cvterm_id = F.type_id
+            INNER JOIN {featureloc} L on F.feature_id = L.feature_id
+            INNER JOIN {feature} FLM on L.srcfeature_id = FLM.feature_id
             LEFT JOIN {tripal_gffprotein_temp} TGPT on TGPT.parent_id = F.feature_id
             LEFT JOIN {tripal_gffprotein_temp} TGPT on TGPT.parent_id = F.feature_id
           GROUP BY F.feature_id, F.name, F.uniquename, CVT.cvterm_id, CVT.name,
           GROUP BY F.feature_id, F.name, F.uniquename, CVT.cvterm_id, CVT.name,
-            TGPT.feature_id, TGPT.fmin, TGPT.fmax, TGCT.strand
+            TGPT.feature_id, TGPT.fmin, TGPT.fmax, TGCT.strand, FLM.uniquename
         ";
         ";
         $results = chado_query($sql);
         $results = chado_query($sql);
         $protein_cvterm = tripal_get_cvterm(array(
         $protein_cvterm = tripal_get_cvterm(array(
@@ -928,8 +1007,16 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
           // protein.
           // protein.
           if (!$result->protein_id) {
           if (!$result->protein_id) {
             // Get details about this protein
             // Get details about this protein
-            $uname = $result->uniquename . '-protein';
-            $name =  $result->name;
+            if ($re_mrna and $re_protein) {
+              // We use a regex to generate protein name from mRNA name
+              $uname = preg_replace("/$re_mrna/", $re_protein, $result->uniquename);
+              $name =  $result->name;
+            }
+            else {
+              // No regex, use the default '-protein' suffix
+              $uname = $result->uniquename . '-protein';
+              $name =  $result->name;
+            }
             $values = array(
             $values = array(
               'parent_id' => $result->feature_id,
               'parent_id' => $result->feature_id,
               'fmin' => $result->fmin
               'fmin' => $result->fmin
@@ -959,8 +1046,8 @@ function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
               $result->uniquename, $organism, $pfmin, $pfmax);
               $result->uniquename, $organism, $pfmin, $pfmax);
             // Add the featureloc record. Set the start of the protein to
             // Add the featureloc record. Set the start of the protein to
             // be the start of the coding sequence minus the phase.
             // be the start of the coding sequence minus the phase.
-            tripal_feature_load_gff3_featureloc($feature, $organism, $landmark,
-              $pfmin, $pfmax, $strand, '', 'f', 'f', '', 0);
+            tripal_feature_load_gff3_featureloc($feature, $organism, $result->landmark,
+              $pfmin, $pfmax, $result->strand, '', 'f', 'f', '', 0);
           }
           }
         }
         }
       }
       }
@@ -1216,6 +1303,10 @@ function tripal_feature_load_gff3_parents($feature, $cvterm, $parents,
     // try to find the parent
     // try to find the parent
     $parentcvterm = chado_query($cvterm_sql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
     $parentcvterm = chado_query($cvterm_sql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
     $relcvterm = chado_query($cvterm_sql, array(':cvname' => 'sequence', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
     $relcvterm = chado_query($cvterm_sql, array(':cvname' => 'sequence', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
+    if (!$relcvterm) {
+      tripal_report_error("tripal_feature", TRIPAL_WARNING, "Cannot find the term, 'part_of', from the sequence ontology. This term is used for associating parent and children features. Please check that the ontology is fully imported.");
+      exit;
+    }
     $values = array(
     $values = array(
         'organism_id' => $organism_id,
         'organism_id' => $organism_id,
         'uniquename' => $parent,
         'uniquename' => $parent,
@@ -1464,7 +1555,7 @@ function tripal_feature_load_gff3_alias($feature, $aliases) {
     // insert the 'synonym_type' vocabulary
     // insert the 'synonym_type' vocabulary
     $values = array(
     $values = array(
       'name' => 'synonym_type',
       'name' => 'synonym_type',
-      'definition' => 'A local vocabulary added for synonym types.',
+      'definition' => 'vocabulary for synonym types',
     );
     );
     $success = chado_insert_record('cv', $values);
     $success = chado_insert_record('cv', $values);
     if (!$success) {
     if (!$success) {