Procházet zdrojové kódy

Adjusted Interpro module to support Blast2GO XML directory. Fixed panel support for features

spficklin před 13 roky
rodič
revize
b3cea67c81

+ 78 - 28
tripal_analysis_interpro/parseInterpro.inc

@@ -244,40 +244,89 @@ function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $p
 function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile, 
    $parsego, $query_re, $query_type, $query_uniquename, $job_id) 
 {
-
+   // Loads InterProScan XML results for the analysis $analysis_id.
+   // $interproxmlfile is either a single XML file or a directory.  For a
+   // directory, every entry whose name ends in ".xml" (case-insensitive) is
+   // handed to tripal_analysis_interpro_parseSingleXMLFile() and the progress
+   // of job $job_id is reported per file.  $parsego, $query_re, $query_type
+   // and $query_uniquename are passed through to the single-file parser
+   // unchanged.
+
    // clear out the anslysisfeature table for this analysis before getting started
    tripal_core_chado_delete('analysisfeature',array('analysis_id' => $analysis_id));
 
-   // Prepare log
-   $filename = preg_replace("/.*\/(.*)/", "$1", $interproxmlfile);
-   $logfile = file_directory_path() . "/tripal/tripal_analysis_interpro/load_$filename.log";
-   $log = fopen($logfile, 'a'); // append parsing results to log file
+   // If user input a file (e.g. blast.xml)
+   if (is_file($interproxmlfile)) {
+      tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile, 
+         $parsego, $query_re, $query_type, $query_uniquename, $job_id);
+   } 
+   else {
+      $dir_handle = @opendir($interproxmlfile) or die("Unable to open $interproxmlfile");
+
+      // Collect the XML files up front so that the reported total, the
+      // progress interval and the files actually parsed all come from the
+      // same list.  This replaces the sql_regcase()/glob() count, which could
+      // disagree with the readdir() filter (sql_regcase() is deprecated as of
+      // PHP 5.3 and removed in PHP 7).
+      $xml_files = array();
+      // Compare against false explicitly: an entry named "0" is falsy and
+      // would otherwise end the loop early.
+      while (($file = readdir($dir_handle)) !== false) {
+         // Anchor at the end of the name so "foo.xml.bak" is not parsed.
+         if (preg_match('/\.xml$/i', $file)) {
+            $xml_files[] = $file;
+         }
+      }
+      closedir($dir_handle);
+
+      $total_files = count($xml_files);
+      print "$total_files file(s) to be parsed.\n";
+
+      // Report progress roughly every 1% of the files, at least every file.
+      $interval = intval($total_files * 0.01);
+      if ($interval == 0) {
+         $interval = 1;
+      }
+      $no_file = 0;
+
+      // Parsing all files in the directory
+      foreach ($xml_files as $file) {
+         // The trailing 0 asks the single-file parser not to report job
+         // progress itself; progress is reported per file below.
+         tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, "$interproxmlfile/$file", 
+            $parsego, $query_re, $query_type, $query_uniquename, $job_id,0);
+
+         // Set job status.  Only XML files are counted, and a file is counted
+         // after it has been parsed, so the percentage reflects completed
+         // files and cannot exceed 100.
+         $no_file ++;
+         if ($no_file % $interval == 0) {
+            $percentage = (int) (($no_file / $total_files) * 100);
+            tripal_job_set_progress($job_id, $percentage);
+            print $percentage."% ";
+         }
+      }
+   }
+
+   print "Done.";
+}
+/**
+ * Parses one InterProScan XML results file and stores the XML of each
+ * sequence it can match to a feature for the given analysis.
+ *
+ * @param $analysis_id
+ *   The analysis the results are recorded against (analysisfeature).
+ * @param $interproxmlfile
+ *   Path of the XML file; it is read with simplexml_load_file().
+ * @param $parsego
+ *   Whether GO terms should be parsed for each protein.
+ *   NOTE(review): the GO handling is outside the visible hunks - confirm.
+ * @param $query_re
+ *   Regular expression used to find the feature for a sequence name.
+ * @param $query_type
+ * @param $query_uniquename
+ *   Feature matching options taken from the analysis form.
+ *   NOTE(review): how they are applied is outside the visible hunks.
+ * @param $job_id
+ *   The job whose progress is set through tripal_job_set_progress().
+ * @param $update_status
+ *   When true (the default), progress of $job_id is updated while the
+ *   proteins are processed.  The directory loader passes 0 because it
+ *   reports progress per file instead.
+ */
+function tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile, 
+   $parsego, $query_re, $query_type, $query_uniquename, $job_id,$update_status = 1) 
+{
 
    // Parsing started
    print "Parsing File:".$interproxmlfile." ...\n";
-   fwrite($log, date("D M j G:i:s Y").". Loading $interproxmlfile\n");
+
 
    // Get cvterm_id for 'analysis_interpro_xmloutput_hits' which is required
    // for inserting into the analysisfeatureprop table
    $previous_db = db_set_active('chado'); // use chado database
    $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
-   "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
-   "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
-   "AND CV.name = 'tripal'";
+          "   INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
+          "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
+          "   AND CV.name = 'tripal'";
    $type_id = db_result(db_query($sql));
 
    // Load the XML file
    $interproput =  simplexml_load_file($interproxmlfile);
 
    // Get entries parsing
-   $proteins = $interproput->children();
+   $xml = $interproput->children();
+
+   // If there is an EBI header then we need to skip that
+   // and set our proteins array to be the second element of the array. This
+   // occurs if results were generated with the online InterProScan tool.
+   // if the XML starts in with the results then this happens when InterProScan
+   // is used command-line and we can just use the object as is
+   if(preg_match('/^Header$/',$xml[0]->getname())){
+      $proteins = $xml[1];  
+   } else {
+      $proteins = $xml[0];
+   }
 
    // Count the number of entires to be processed
    $no_iterations = 0;
-   foreach($proteins as $tmp) {
+   foreach($proteins as $protein) {
       $no_iterations ++;
    }
-   print "$no_iterations proteins to be processed.\n";
+   print "  Found results for $no_iterations sequences\n";
    $interval = intval($no_iterations * 0.01);
    $idx_iterations = 0;
 
@@ -290,9 +339,10 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 
    // Processed each protein
    foreach ($proteins as $protein) {
+
       // Set job status
       $idx_iterations ++;
-      if ($idx_iterations % $interval == 0) {
+      if ($idx_iterations % $interval == 0 and $update_status) {
          $percentage = (int) ($idx_iterations / $no_iterations * 100);
          db_set_active($previous_db);
          tripal_job_set_progress($job_id, $percentage);
@@ -305,6 +355,17 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
       $attr = $protein->attributes();
       $seqname =$attr ['id'];
 
+      // is the sequence name a generic name (i.e. 'Sequence_1') then the 
+      // blast results do not contain the original sequence names.  The only
+      // option we have is to use the filename.  This will work in the case of
+      // Blast2GO which stores the XML for each sequence in a file with the
+      // the filename the name of the sequence
+      if(preg_match('/Sequence_\d+/',$seqname)){
+         $filename = preg_replace('/^.*\/(.*).xml$/', '$1', $interproxmlfile);
+         print "  Sequence name is not specific, using filename: $filename\n";
+         $seqname = $filename;   
+      }
+
       // Remove _ORF from the sequence name
       $seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname);
      
@@ -324,7 +385,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
       }   
 
       if(!$feature and $query_re){
-         print fwrite($log, "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n");
+         print "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n";
          continue;
       }
 
@@ -346,11 +407,11 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 
       $feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select);
       if(count($feature_arr) > 1){
-		   fwrite($log, "Ambiguous: '$feature' matches more than one feature and is being skipped.\n");
+		   print "Ambiguous: '$feature' matches more than one feature and is being skipped.\n";
 			continue;
       }
       if(count($feature_arr) == 0){
-			fwrite($log, "Failed: '$feature' cannot find a matching feature in the database.\n");
+			print "Failed: '$feature' cannot find a matching feature in the database.\n";
          continue;
       }
       $feature_id = $feature_arr[0]->feature_id;
@@ -360,14 +421,9 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
       // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
       if ($feature_id) {
 
-         print "$idx_iterations Adding InterPro results for feature '$seqname' ($feature_id)\n";
-
-         // If a matched feature is found, write to log.
-         fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
+         print "  Adding InterPro results for feature '$seqname' ($feature_id)\n";
 
-         //------------------------------------
          // Insert into analysisfeature table
-         //------------------------------------
          $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
                 "VALUES (%d, %d)";
          db_query ($sql, $feature_id, $analysis_id);                     
@@ -376,9 +432,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
          $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
          $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
 
-         //------------------------------------------------------------
          // Insert interpro xml results into analysisfeatureprop table
-         //------------------------------------------------------------
          // Check to see if we have an existing entry
          $sql = "SELECT analysisfeatureprop_id,rank 
                  FROM {analysisfeatureprop} 
@@ -393,7 +447,6 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
          $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
                 "VALUES (%d, %d, '%s', %d)";
          db_query($sql, $analysisfeature_id, $type_id, $protein->asXML(), $rank);
-         fwrite($log, " (Insert)\n"); // write to log
 
          // parse the XML for each protein if GO terms are requested
 
@@ -431,10 +484,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
       } // end if($feature_id)            
    } // end foreach ($proteins as $protein)
    db_set_active ($previous_db); // Use drupal database
-   print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
 
-   fwrite($log, "\n");
-   fclose($log);
    return;
 }
 /********************************************************************************

+ 2 - 1
tripal_analysis_interpro/tripal_analysis_interpro.install

@@ -45,7 +45,8 @@ function tripal_analysis_interpro_uninstall() {
 */
 function tripal_analysis_interpro_add_cvterms(){
    // Add cvterm 'analysis_interpro_output_iteration_hits' for inserting into featureprop table
-   tripal_add_cvterms('analysis_interpro_output_hit', 'Hit in the interpro html output. Each hit belongs to a chado feature. This cvterm represents a hit in the output');
+   tripal_add_cvterms('analysis_interpro_xmloutput_hit', 'Hit in the interpro XML output. Each hit belongs to a chado feature. This cvterm represents a hit in the output');
+   tripal_add_cvterms('analysis_interpro_output_hit', 'Hit in the interpro HTML output. Each hit belongs to a chado feature. This cvterm represents a hit in the output');
    tripal_add_cvterms('analysis_interpro_settings', 'Settings of an interpro analysis, including output file and run parameters separated by a bar |');
    tripal_add_cvterms('analysis_interpro_interproparameters','The parameters used when executing an InterProScan job');
    tripal_add_cvterms('analysis_interpro_interprofile','Used to hold the name of the XML file containing the InterProScan results');

+ 25 - 25
tripal_analysis_interpro/tripal_analysis_interpro.module

@@ -85,18 +85,18 @@ function chado_analysis_interpro_form ($node){
       '#weight' => 11
 	);
 	$form['interpro']['interprofile'] = array(
-      '#title' => t('Interproscan Output File (in XML format)'),
+      '#title' => t('InterProScan XML File/Directory: (if you input a directory without the tailing slash, all xml files in the directory will be loaded)'),
       '#type' => 'textfield',
-      '#description' => t('Please provide the full path to the XML output file generated by InterProScan.'),
+      '#description' => t('Please provide the full path to the XML output file generated by InterProScan or a directory containing multiple XML files.'),
       '#default_value' => $interprofile,
 	);
 	$form['interpro']['interprojob'] = array(
       '#type' => 'checkbox',
-      '#title' => t('Submit a job to parse the InterProScan XML file'),
+      '#title' => t('Submit a job to parse the InterProScan XML file(s)'),
       '#description' => t('Note: features associated with the interpro results must '.
                              'exist in chado before parsing the file. Otherwise, interpro '.
-                             'results that cannot be linked to a feature will be '.
-                             'discarded.'),
+                             'results that cannot be linked to a feature will not '.
+                             'be imported.  The feature name must be unique'),
       '#default_value' => $interprojob,
 	   '#attributes' => array(
         'onclick' => 'return isSubmittingJob(this)'
@@ -105,7 +105,7 @@ function chado_analysis_interpro_form ($node){
 	$form['interpro']['parseHTML'] = array(
       '#type' => 'checkbox',
       '#title' => t('The input file is in HTML format (deprecated, only provided for backwards compatibility)'),
-      '#description' => t('Check the box to use the HTML parser. The feature name must be unique across all organisms and types.'),
+      '#description' => t('Check the box to use the HTML parser.'),
       '#default_value' => $parseHTML
    );
 	$form['interpro']['parsego'] = array(
@@ -114,11 +114,11 @@ function chado_analysis_interpro_form ($node){
       '#description' => t('Check the box to load GO terms to chado database'),
       '#default_value' => $parsego
    );
-   $form['interpro']['interprokeywordjob'] = array(
-      '#type' => 'checkbox',
-      '#title' => t('Submit a job to extract keywords from the Interpro html output'),
-      '#description' => t('Note: Interpro results are only searchable after keywords are extracted. Do not run this twice if you have already done so.'),
-	);
+//   $form['interpro']['interprokeywordjob'] = array(
+//      '#type' => 'checkbox',
+//      '#title' => t('Submit a job to extract keywords from the Interpro html output'),
+//      '#description' => t('Note: Interpro results are only searchable after keywords are extracted. Do not run this twice if you have already done so.'),
+//	);
 	$form['interpro']['interproparameters'] = array(
       '#title' => t('Parameters'),
       '#type' => 'textfield',
@@ -130,30 +130,30 @@ function chado_analysis_interpro_form ($node){
       '#title' => t('Query Name RE'),
       '#type' => 'textfield',
       '#description' => t('Enter the regular expression that will extract the '.
-         'feature name from the query line in the interpro results. This should be '.
-         'the same as the definition line in the query FASTA file.  This option is '.
+         'feature name from the query line in the interpro results. This option is '.
          'is only required when the query does not identically match a feature '.
-         'in the database.'),
+         'in the database. By default, the parser will try to match results to '.
+         'a feature in Chado using the feature name.  Select the check box below'.
+         'to match against the unique name if needed.'),
       '#default_value' => $query_re,
 	);
 
-	$form['interpro']['query_type'] = array(
-      '#title' => t('Query Type'),
-      '#type' => 'textfield',
-      '#description' => t('Please enter the Sequence Ontology term that describes '.
-         'the query sequences used for InterProScan.  This is only necessary if two '.
-         'or more sequences have the same name.'),
-      '#default_value' => $query_type,
-	);
-
 	$form['interpro']['query_uniquename'] = array(
       '#title' => t('Use Unique Name'),
       '#type' => 'checkbox',
       '#description' => t('Select this checboxk if the query name in the results file '.
-        'matches the uniquename of the feature.  By default, the blast results will '.
-        'mapped to the "name" of the feature.'),
+        'matches the unique name of the feature. '),
       '#default_value' => $query_uniquename,
 	);
+
+	$form['interpro']['query_type'] = array(
+      '#title' => t('Query Type'),
+      '#type' => 'textfield',
+      '#description' => t('Please enter the Sequence Ontology term (e.g. contig, polypeptide, mRNA) that describes '.
+         'the query sequences in the interproscan XML results file(s).  This is only necessary if two '.
+         'or more sequences have the same name.'),
+      '#default_value' => $query_type,
+	);
 	return $form;
 }
 /**

+ 15 - 37
tripal_feature/tripal_feature.module

@@ -308,20 +308,14 @@ function tripal_feature_block($op = 'list', $delta = 0, $edit=array()){
          $blocks['properties']['info'] = t('Tripal Feature Properties');
          $blocks['properties']['cache'] = BLOCK_NO_CACHE;;
 
-         $blocks['featureloc_sequences']['info'] = t('Tripal Formatted Location Sequence');
+         $blocks['featureloc_sequences']['info'] = t('Tripal Formatted Sequence');
          $blocks['featureloc_sequences']['cache'] = BLOCK_NO_CACHE;
 
-         $blocks['featurelocs_as_parent']['info'] = t('Tripal Feature Locations as Parent');
-         $blocks['featurelocs_as_parent']['cache'] = BLOCK_NO_CACHE;
+         $blocks['alignments']['info'] = t('Tripal Feature Alignments');
+         $blocks['alignments']['cache'] = BLOCK_NO_CACHE;
 
-         $blocks['featurelocs_as_child']['info'] = t('Tripal Feature Locations as Child');
-         $blocks['featurelocs_as_child']['cache'] = BLOCK_NO_CACHE;
-
-         $blocks['relationships_as_object']['info'] = t('Tripal Feature Relationships as Object');
-         $blocks['relationships_as_object']['cache'] = BLOCK_NO_CACHE;
-
-         $blocks['relationships_as_subject']['info'] = t('Tripal Feature Relationships as Subject');
-         $blocks['relationships_as_subject']['cache'] = BLOCK_NO_CACHE;
+         $blocks['relationships']['info'] = t('Tripal Feature Relationships');
+         $blocks['relationships']['cache'] = BLOCK_NO_CACHE;
 
          $blocks['org_feature_counts']['info'] = t('Tripal Organism Feature Counts');
          $blocks['org_feature_counts']['cache'] = BLOCK_NO_CACHE;
@@ -362,21 +356,13 @@ function tripal_feature_block($op = 'list', $delta = 0, $edit=array()){
                   $block['subject'] = t('Formatted Sequences');
                   $block['content'] = theme('tripal_feature_featureloc_sequences',$node);
                   break;
-               case 'featurelocs_as_parent':
-                  $block['subject'] = t('Parent Features');
-                  $block['content'] = theme('tripal_feature_featurelocs_as_parent',$node);
-                  break;
-               case 'featurelocs_as_child':
-                  $block['subject'] = t('Child Features');
-                  $block['content'] = theme('tripal_feature_featurelocs_as_child',$node);
-                  break;
-               case 'relationships_as_object':
-                  $block['subject'] = t('Object Relationships');
-                  $block['content'] = theme('tripal_feature_relationships_as_object',$node);
+               case 'alignments':
+                  $block['subject'] = t('Alignments');
+                  $block['content'] = theme('tripal_feature_featurelocs',$node);
                   break;
-               case 'relationships_as_subject':
-                  $block['subject'] = t('Subject Relationships');
-                  $block['content'] = theme('tripal_feature_relationships_as_child',$node);
+               case 'relationships':
+                  $block['subject'] = t('Relationships');
+                  $block['content'] = theme('tripal_feature_relationships',$node);
                   break;
                case 'org_feature_counts':
                   $block['subject'] = t('Feature Type Summary');
@@ -1590,21 +1576,13 @@ function tripal_feature_theme () {
          'arguments' => array('node'=> null),
          'template' => 'tripal_feature_properties',
       ),
-      'tripal_feature_featurelocs_as_parent' => array (
-         'arguments' => array('node'=> null),
-         'template' => 'tripal_feature_featurelocs_as_parent',
-      ),
-      'tripal_feature_featurelocs_as_child' => array (
-         'arguments' => array('node'=> null),
-         'template' => 'tripal_feature_featurelocs_as_child',
-      ),
-      'tripal_feature_relationships_as_object' => array (
+      'tripal_feature_featurelocs' => array (
          'arguments' => array('node'=> null),
-         'template' => 'tripal_feature_relationships_as_object',
+         'template' => 'tripal_feature_featurelocs',
       ),
-      'tripal_feature_relationships_as_subject' => array (
+      'tripal_feature_relationships' => array (
          'arguments' => array('node'=> null),
-         'template' => 'tripal_feature_relationships_as_subject',
+         'template' => 'tripal_feature_relationships',
       ),
    );
 }