@@ -244,40 +244,89 @@ function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $p
function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
$parsego, $query_re, $query_type, $query_uniquename, $job_id)
// clear out the anslysisfeature table for this analysis before getting started
tripal_core_chado_delete('analysisfeature',array('analysis_id' => $analysis_id));
- // Prepare log
- $filename = preg_replace("/.*\/(.*)/", "$1", $interproxmlfile);
- $logfile = file_directory_path() . "/tripal/tripal_analysis_interpro/load_$filename.log";
- $log = fopen($logfile, 'a'); // append parsing results to log file
+  // If the user supplied a single InterProScan XML file, parse it directly.
+  // The callee is left with its default $update_status so it reports job
+  // progress itself.
+  if (is_file($interproxmlfile)) {
+    tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile,
+      $parsego, $query_re, $query_type, $query_uniquename, $job_id);
+  }
+  else {
+    // Otherwise treat the path as a directory and parse every *.xml file in it.
+    $dir_handle = @opendir($interproxmlfile) or die("Unable to open $interproxmlfile");
+
+    // Collect the XML files first so that the total and the progress counter
+    // are computed from exactly the same list. Compare readdir() against FALSE
+    // explicitly: an entry named "0" is falsy and would otherwise end the scan.
+    // The extension test is anchored so that e.g. 'foo.xml.bak' is not parsed.
+    $xml_files = array();
+    while (($file = readdir($dir_handle)) !== FALSE) {
+      if (preg_match('/\.xml$/i', $file)) {
+        $xml_files[] = $file;
+      }
+    }
+    closedir($dir_handle);
+
+    $total_files = count($xml_files);
+    print "$total_files file(s) to be parsed.\n";
+
+    // Report progress roughly every 1% of the files, but at least every file.
+    $interval = intval($total_files * 0.01);
+    if ($interval == 0) {
+      $interval = 1;
+    }
+
+    // Parse all XML files in the directory. The trailing 0 turns off the
+    // per-protein progress updates of the callee; progress is tracked per
+    // file here instead.
+    $no_file = 0;
+    foreach ($xml_files as $file) {
+      tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, "$interproxmlfile/$file",
+        $parsego, $query_re, $query_type, $query_uniquename, $job_id,0);
+      $no_file ++;
+      // Set job status (counted after the file is done, so the last file
+      // reports 100%)
+      if ($no_file % $interval == 0) {
+        $percentage = (int) (($no_file / $total_files) * 100);
+        tripal_job_set_progress($job_id, $percentage);
+        print $percentage."% ";
+      }
+    }
+  }
+  print "Done.";
+/**
+ * Parse a single InterProScan XML results file and store the results for
+ * each matched feature in the analysisfeature and analysisfeatureprop tables.
+ *
+ * @param $analysis_id
+ *   The analysis_id used when inserting analysisfeature records.
+ * @param $interproxmlfile
+ *   Path of the XML file to load (read with simplexml_load_file()).
+ * @param $parsego
+ *   Whether GO terms should also be parsed from each protein's XML.
+ *   NOTE(review): the code that consumes this flag is not visible here.
+ * @param $query_re
+ *   Regular expression used when looking up the feature for a sequence name.
+ * @param $query_type
+ * @param $query_uniquename
+ *   NOTE(review): passed through by the caller; their use in the feature
+ *   lookup is not visible here - confirm against the full function body.
+ * @param $job_id
+ *   The job whose progress is updated through tripal_job_set_progress().
+ * @param $update_status
+ *   When true (the default) the job progress is updated while the proteins
+ *   are processed. The directory loader passes 0 because it reports
+ *   progress per file. The name must match the variable tested in the body
+ *   ($update_status); the previous spelling '$uptate_status' left that
+ *   variable undefined, so progress was never reported.
+ */
+function tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile,
+ $parsego, $query_re, $query_type, $query_uniquename, $job_id,$update_status = 1)
// Parsing started
print "Parsing File:".$interproxmlfile." ...\n";
- fwrite($log, date("D M j G:i:s Y").". Loading $interproxmlfile\n");
// Get cvterm_id for 'analysis_interpro_xmloutput_hits' which is required
// for inserting into the analysisfeatureprop table
$previous_db = db_set_active('chado'); // use chado database
$sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
- "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
- "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
- "AND CV.name = 'tripal'";
+ " INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
+ "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
+ " AND CV.name = 'tripal'";
$type_id = db_result(db_query($sql));
// Load the XML file
$interproput = simplexml_load_file($interproxmlfile);
// Get entries parsing
- $proteins = $interproput->children();
+  $xml = $interproput->children();
+
+  // Results generated with the online InterProScan tool start with an EBI
+  // 'Header' element; in that case the proteins are held by the second child
+  // and the header is skipped. Results generated with the command-line
+  // InterProScan start directly with the results, so the first child is used
+  // as is.
+  $starts_with_header = preg_match('/^Header$/',$xml[0]->getname());
+  $proteins = $starts_with_header ? $xml[1] : $xml[0];
// Count the number of entires to be processed
$no_iterations = 0;
- foreach($proteins as $tmp) {
+ foreach($proteins as $protein) {
$no_iterations ++;
- print "$no_iterations proteins to be processed.\n";
+ print " Found results for $no_iterations sequences\n";
$interval = intval($no_iterations * 0.01);
$idx_iterations = 0;
@@ -290,9 +339,10 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
// Processed each protein
foreach ($proteins as $protein) {
// Set job status
$idx_iterations ++;
- if ($idx_iterations % $interval == 0) {
+ if ($idx_iterations % $interval == 0 and $update_status) {
$percentage = (int) ($idx_iterations / $no_iterations * 100);
tripal_job_set_progress($job_id, $percentage);
@@ -305,6 +355,17 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
$attr = $protein->attributes();
$seqname =$attr ['id'];
+  // If the sequence name is a generic name (i.e. 'Sequence_1') then the
+  // results do not contain the original sequence names. The only option we
+  // have is to use the file name. This will work in the case of Blast2GO,
+  // which stores the XML for each sequence in a file named after the
+  // sequence.
+  if(preg_match('/Sequence_\d+/',$seqname)){
+    // Take the last path component and drop a trailing '.xml'. The dot is
+    // escaped and the match is case-insensitive so that it agrees with the
+    // directory scan (which accepts '.XML'); basename() also copes with a
+    // path that contains no '/'.
+    $filename = preg_replace('/\.xml$/i', '', basename($interproxmlfile));
+    print " Sequence name is not specific, using filename: $filename\n";
+    $seqname = $filename;
+  }
// Remove _ORF from the sequence name
$seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname);
@@ -324,7 +385,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
if(!$feature and $query_re){
- print fwrite($log, "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n");
+ print "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n";
@@ -346,11 +407,11 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
$feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select);
if(count($feature_arr) > 1){
- fwrite($log, "Ambiguous: '$feature' matches more than one feature and is being skipped.\n");
+ print "Ambiguous: '$feature' matches more than one feature and is being skipped.\n";
if(count($feature_arr) == 0){
- fwrite($log, "Failed: '$feature' cannot find a matching feature in the database.\n");
+ print "Failed: '$feature' cannot find a matching feature in the database.\n";
$feature_id = $feature_arr[0]->feature_id;
@@ -360,14 +421,9 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
// feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
if ($feature_id) {
- print "$idx_iterations Adding InterPro results for feature '$seqname' ($feature_id)\n";
- // If a matched feature is found, write to log.
- fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
+ print " Adding InterPro results for feature '$seqname' ($feature_id)\n";
- //------------------------------------
// Insert into analysisfeature table
- //------------------------------------
$sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
"VALUES (%d, %d)";
db_query ($sql, $feature_id, $analysis_id);
@@ -376,9 +432,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
$sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
$analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
- //------------------------------------------------------------
// Insert interpro xml results into analysisfeatureprop table
- //------------------------------------------------------------
// Check to see if we have an existing entry
$sql = "SELECT analysisfeatureprop_id,rank
FROM {analysisfeatureprop}
@@ -393,7 +447,6 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
$sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
"VALUES (%d, %d, '%s', %d)";
db_query($sql, $analysisfeature_id, $type_id, $protein->asXML(), $rank);
- fwrite($log, " (Insert)\n"); // write to log
// parse the XML for each protein if GO terms are requested
@@ -431,10 +484,7 @@ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
} // end if($feature_id)
} // end foreach ($proteins as $protein)
db_set_active ($previous_db); // Use drupal database
- print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
- fwrite($log, "\n");
- fclose($log);