|  | @@ -244,40 +244,89 @@ function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $p
 | 
	
		
			
				/**
				 * Load InterProScan XML results for an analysis from one file or a directory.
				 *
				 * If $interproxmlfile is a regular file it is handed to
				 * tripal_analysis_interpro_parseSingleXMLFile() as is.  Otherwise it is
				 * treated as a directory and every regular file in it whose name ends in
				 * ".xml" (case-insensitive) is parsed in turn, with the job progress being
				 * reported from here on a per-file basis.
				 *
				 * Existing analysisfeature rows for the analysis are removed before parsing
				 * starts, but only once the input path has been read successfully, so an
				 * unreadable path no longer wipes previously loaded results.
				 *
				 * @param $analysis_id
				 *   The analysis_id the results belong to.
				 * @param $interproxmlfile
				 *   Path to a single XML file or to a directory containing XML files.
				 * @param $parsego
				 *   Passed through unchanged to the single-file parser.
				 * @param $query_re
				 *   Passed through unchanged to the single-file parser.
				 * @param $query_type
				 *   Passed through unchanged to the single-file parser.
				 * @param $query_uniquename
				 *   Passed through unchanged to the single-file parser.
				 * @param $job_id
				 *   The Tripal job id used when reporting progress.
				 *
				 * @return
				 *   Nothing.  Progress and error messages are printed.
				 */
				function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
				   $parsego, $query_re, $query_type, $query_uniquename, $job_id)
				{
				   // Work out what has to be parsed BEFORE touching the database, so that a
				   // bad path cannot leave the analysis without any results.
				   $single_file = is_file($interproxmlfile);
				   $xml_files = array();

				   if (!$single_file) {
				      $dir_handle = FALSE;
				      if (is_dir($interproxmlfile) and is_readable($interproxmlfile)) {
				         $dir_handle = opendir($interproxmlfile);
				      }
				      if ($dir_handle === FALSE) {
				         // Return instead of die() so the process running the job survives.
				         print "Unable to open $interproxmlfile\n";
				         return;
				      }

				      // Collect the files in a single pass so that the total used for the
				      // percentage is exactly the set of files that gets parsed.  Comparing
				      // against FALSE keeps an entry literally named "0" from ending the loop.
				      while (($entry = readdir($dir_handle)) !== FALSE) {
				         // The pattern is anchored at the end so 'x.xml.bak' is not picked up.
				         if (preg_match('/\.xml$/i', $entry) and is_file("$interproxmlfile/$entry")) {
				            $xml_files[] = $entry;
				         }
				      }
				      closedir($dir_handle);

				      // readdir() order depends on the filesystem; sort for repeatable runs.
				      sort($xml_files);
				   }

				   // clear out the analysisfeature table for this analysis before getting started
				   tripal_core_chado_delete('analysisfeature', array('analysis_id' => $analysis_id));

				   if ($single_file) {
				      // A single file (e.g. interpro.xml): the parser reports its own progress.
				      tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile,
				         $parsego, $query_re, $query_type, $query_uniquename, $job_id);
				   }
				   else {
				      $total_files = count($xml_files);
				      print "$total_files file(s) to be parsed.\n";

				      // Report roughly every 1% of the files, but at least after every file.
				      $interval = max(1, intval($total_files * 0.01));
				      $no_file = 0;

				      // Parsing all files in the directory
				      foreach ($xml_files as $file) {
				         // The trailing 0 is the callee's status-update flag: progress is
				         // reported per file from this loop instead.
				         // NOTE(review): the callee declares that parameter as $uptate_status
				         // but tests $update_status - confirm and fix the spelling there.
				         tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, "$interproxmlfile/$file",
				            $parsego, $query_re, $query_type, $query_uniquename, $job_id, 0);

				         // Count only files that were actually parsed, and count them after
				         // parsing, so the last file brings the job to 100%.
				         $no_file ++;

				         // Set job status
				         if ($no_file % $interval == 0 or $no_file == $total_files) {
				            $percentage = (int) (($no_file / $total_files) * 100);
				            tripal_job_set_progress($job_id, $percentage);
				            print $percentage."% ";
				         }
				      }
				   }

				   print "Done.";
				}
 | 
	
		
			
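				/**
				 * Parse a single InterProScan XML file and store its results for an analysis.
				 *
				 * Loads the file with simplexml_load_file(), skips a leading EBI "Header"
				 * element when one is present, and for each result that can be matched to a
				 * feature adds a row to analysisfeature and stores that result's XML in
				 * analysisfeatureprop.
				 *
				 * The last parameter is a flag that gates the per-sequence job progress
				 * updates; the directory loader passes 0 because it reports progress itself.
				 */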
 | 
	
		
			
				|  |  | +function tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile, 
 | 
	
		
			
				|  |  | +   $parsego, $query_re, $query_type, $query_uniquename, $job_id,$uptate_status = 1) 
 | 
	
		
			
				|  |  | +{
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |     // Parsing started
 | 
	
		
			
				|  |  |     print "Parsing File:".$interproxmlfile." ...\n";
 | 
	
		
			
				|  |  | -   fwrite($log, date("D M j G:i:s Y").". Loading $interproxmlfile\n");
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |     // Get cvterm_id for 'analysis_interpro_xmloutput_hits' which is required
 | 
	
		
			
				|  |  |     // for inserting into the analysisfeatureprop table
 | 
	
		
			
				|  |  |     $previous_db = db_set_active('chado'); // use chado database
 | 
	
		
			
				|  |  |     $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
 | 
	
		
			
				|  |  | -   "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
 | 
	
		
			
				|  |  | -   "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
 | 
	
		
			
				|  |  | -   "AND CV.name = 'tripal'";
 | 
	
		
			
				|  |  | +          "   INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
 | 
	
		
			
				|  |  | +          "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
 | 
	
		
			
				|  |  | +          "   AND CV.name = 'tripal'";
 | 
	
		
			
				|  |  |     $type_id = db_result(db_query($sql));
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |     // Load the XML file
 | 
	
		
			
				|  |  |     $interproput =  simplexml_load_file($interproxmlfile);
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |     // Get entries parsing
 | 
	
		
			
				|  |  | -   $proteins = $interproput->children();
 | 
	
		
			
				|  |  | +   $xml = $interproput->children();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +   // If there is an EBI header then we need to skip that
 | 
	
		
			
				|  |  | +   // and set our proteins array to be the second element of the array. This
 | 
	
		
			
				|  |  | +   // occurs if results were generated with the online InterProScan tool.
 | 
	
		
			
				|  |  | +   // if the XML starts in with the results then this happens when InterProScan
 | 
	
		
			
				|  |  | +   // is used command-line and we can just use the object as is
 | 
	
		
			
				|  |  | +   if(preg_match('/^Header$/',$xml[0]->getname())){
 | 
	
		
			
				|  |  | +      $proteins = $xml[1];  
 | 
	
		
			
				|  |  | +   } else {
 | 
	
		
			
				|  |  | +      $proteins = $xml[0];
 | 
	
		
			
				|  |  | +   }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |     // Count the number of entires to be processed
 | 
	
		
			
				|  |  |     $no_iterations = 0;
 | 
	
		
			
				|  |  | -   foreach($proteins as $tmp) {
 | 
	
		
			
				|  |  | +   foreach($proteins as $protein) {
 | 
	
		
			
				|  |  |        $no_iterations ++;
 | 
	
		
			
				|  |  |     }
 | 
	
		
			
				|  |  | -   print "$no_iterations proteins to be processed.\n";
 | 
	
		
			
				|  |  | +   print "  Found results for $no_iterations sequences\n";
 | 
	
		
			
				|  |  |     $interval = intval($no_iterations * 0.01);
 | 
	
		
			
				|  |  |     $idx_iterations = 0;
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -290,9 +339,10 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |     // Processed each protein
 | 
	
		
			
				|  |  |     foreach ($proteins as $protein) {
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |        // Set job status
 | 
	
		
			
				|  |  |        $idx_iterations ++;
 | 
	
		
			
				|  |  | -      if ($idx_iterations % $interval == 0) {
 | 
	
		
			
				|  |  | +      if ($idx_iterations % $interval == 0 and $update_status) {
 | 
	
		
			
				|  |  |           $percentage = (int) ($idx_iterations / $no_iterations * 100);
 | 
	
		
			
				|  |  |           db_set_active($previous_db);
 | 
	
		
			
				|  |  |           tripal_job_set_progress($job_id, $percentage);
 | 
	
	
		
			
				|  | @@ -305,6 +355,17 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |        $attr = $protein->attributes();
 | 
	
		
			
				|  |  |        $seqname =$attr ['id'];
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +      // is the sequence name a generic name (i.e. 'Sequence_1') then the 
 | 
	
		
			
				|  |  | +      // blast results do not contain the original sequence names.  The only
 | 
	
		
			
				|  |  | +      // option we have is to use the filename.  This will work in the case of
 | 
	
		
			
				|  |  | +      // Blast2GO which stores the XML for each sequence in a file with the
 | 
	
		
			
				|  |  | +      // the filename the name of the sequence
 | 
	
		
			
				|  |  | +      if(preg_match('/Sequence_\d+/',$seqname)){
 | 
	
		
			
				|  |  | +         $filename = preg_replace('/^.*\/(.*).xml$/', '$1', $interproxmlfile);
 | 
	
		
			
				|  |  | +         print "  Sequence name is not specific, using filename: $filename\n";
 | 
	
		
			
				|  |  | +         $seqname = $filename;   
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |        // Remove _ORF from the sequence name
 | 
	
		
			
				|  |  |        $seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname);
 | 
	
		
			
				|  |  |       
 | 
	
	
		
			
				|  | @@ -324,7 +385,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |        }   
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        if(!$feature and $query_re){
 | 
	
		
			
				|  |  | -         print fwrite($log, "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n");
 | 
	
		
			
				|  |  | +         print "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n";
 | 
	
		
			
				|  |  |           continue;
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -346,11 +407,11 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        $feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select);
 | 
	
		
			
				|  |  |        if(count($feature_arr) > 1){
 | 
	
		
			
				|  |  | -		   fwrite($log, "Ambiguous: '$feature' matches more than one feature and is being skipped.\n");
 | 
	
		
			
				|  |  | +		   print "Ambiguous: '$feature' matches more than one feature and is being skipped.\n";
 | 
	
		
			
				|  |  |  			continue;
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |        if(count($feature_arr) == 0){
 | 
	
		
			
				|  |  | -			fwrite($log, "Failed: '$feature' cannot find a matching feature in the database.\n");
 | 
	
		
			
				|  |  | +			print "Failed: '$feature' cannot find a matching feature in the database.\n";
 | 
	
		
			
				|  |  |           continue;
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |        $feature_id = $feature_arr[0]->feature_id;
 | 
	
	
		
			
				|  | @@ -360,14 +421,9 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |        // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
 | 
	
		
			
				|  |  |        if ($feature_id) {
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -         print "$idx_iterations Adding InterPro results for feature '$seqname' ($feature_id)\n";
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -         // If a matched feature is found, write to log.
 | 
	
		
			
				|  |  | -         fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
 | 
	
		
			
				|  |  | +         print "  Adding InterPro results for feature '$seqname' ($feature_id)\n";
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -         //------------------------------------
 | 
	
		
			
				|  |  |           // Insert into analysisfeature table
 | 
	
		
			
				|  |  | -         //------------------------------------
 | 
	
		
			
				|  |  |           $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
 | 
	
		
			
				|  |  |                  "VALUES (%d, %d)";
 | 
	
		
			
				|  |  |           db_query ($sql, $feature_id, $analysis_id);                     
 | 
	
	
		
			
				|  | @@ -376,9 +432,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |           $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
 | 
	
		
			
				|  |  |           $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -         //------------------------------------------------------------
 | 
	
		
			
				|  |  |           // Insert interpro xml results into analysisfeatureprop table
 | 
	
		
			
				|  |  | -         //------------------------------------------------------------
 | 
	
		
			
				|  |  |           // Check to see if we have an existing entry
 | 
	
		
			
				|  |  |           $sql = "SELECT analysisfeatureprop_id,rank 
 | 
	
		
			
				|  |  |                   FROM {analysisfeatureprop} 
 | 
	
	
		
			
				|  | @@ -393,7 +447,6 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |           $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
 | 
	
		
			
				|  |  |                  "VALUES (%d, %d, '%s', %d)";
 | 
	
		
			
				|  |  |           db_query($sql, $analysisfeature_id, $type_id, $protein->asXML(), $rank);
 | 
	
		
			
				|  |  | -         fwrite($log, " (Insert)\n"); // write to log
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |           // parse the XML for each protein if GO terms are requested
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -431,10 +484,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
 | 
	
		
			
				|  |  |        } // end if($feature_id)            
 | 
	
		
			
				|  |  |     } // end foreach ($proteins as $protein)
 | 
	
		
			
				|  |  |     db_set_active ($previous_db); // Use drupal database
 | 
	
		
			
				|  |  | -   print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -   fwrite($log, "\n");
 | 
	
		
			
				|  |  | -   fclose($log);
 | 
	
		
			
				|  |  |     return;
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  /********************************************************************************
 |