loadHTMLFile($interprofile);
$xml = $dom->saveXML();
$interproput = simplexml_load_string($xml);
// Get html tables for parsing
$tables = $interproput->children()->children();
// Count the number of tables to be processed
$no_iterations = 0;
foreach($tables as $tmp) {
if ($tmp->getName() == 'table') {
$no_iterations ++;
}
}
print "$no_iterations html tables to be processed.\n";
$interval = intval($no_iterations * 0.01);
$idx_iterations = 0;
// Processed the tables
foreach ($tables as $table) {
//if (preg_match('/No hits reported/', $table->asXML()) ) {
//print "skipping this table b/c no hits are reported\n";
//}
// make sure we are looking at a table and its not an empty table
if ($table->getName() == 'table' && !preg_match('/No hits reported/', $table->asXML()) ) {
$idx_iterations ++;
if ($idx_iterations % $interval == 0) {
$percentage = (int) ($idx_iterations / $no_iterations * 100);
tripal_db_set_active($previous_db);
tripal_job_set_progress($job_id, $percentage);
$previous_db = tripal_db_set_active('chado');
print $percentage."% ";
}
// Set job status
// Get the first row and match its name with the feature name
$firsttd = $table->children()->children()->children();
$feature_id = 0;
foreach($firsttd as $b) {
foreach($b->children() as $a) {
if ($a->getName() == 'a') {
// Remove _ORF from the sequence name
$seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a);
print "seqname is $seqname\n";
// Find out how many features match this uniquename
$sql = "SELECT count(feature_id) FROM {feature} ".
"WHERE uniquename = '%s' ";
$no_features = db_result(db_query($sql, $seqname));
// If there is only one match, get the feature_id
if ($no_features == 1) {
$sql = "SELECT feature_id FROM {feature} ".
"WHERE uniquename = '%s' ";
$feature_id = db_result(db_query($sql, $seqname));
print "\tfeature id is $feature_id\n";
// If the uniquename matches more than one features then skip and print 'Ambiguous'
} else if ($no_features > 1) {
fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
continue;
// If the uniquename did not match, skip and print 'Failed'
} else {
fwrite($log, "Failed: ".$seqname."\n");
}
}
}
}
// Successfully matched. print 'Succeeded'. Add analysis_id and
// feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
if ($feature_id) {
//------------------------------------
// Clease unwanted rows from the table
//------------------------------------
$parent_row = "/
Parent<\/b><\/td>\s* | \s*no.*?parent<\/td>\s*<\/tr>/";
$children_row = "/ |
Children<\/b><\/td>\s* | \s*no.*?children<\/td>\s*<\/tr>/";
$found_row = "/ |
Found.*?in<\/b><\/td>\s* | \s*no.*?entries<\/td>\s*<\/tr>/";
$contains_row = "/ |
Contains<\/b><\/td>\s* | \s*no.*?entries<\/td>\s*<\/tr>/";
$go_row = "/ |
GO.*?terms<\/b><\/td>\s* | \s*none<\/td>\s*<\/tr>/";
$table_txt = $table->asXML();
$table_txt = preg_replace($parent_row, "", $table_txt);
$table_txt = preg_replace($children_row, "", $table_txt);
$table_txt = preg_replace($found_row, "", $table_txt);
$table_txt = preg_replace($contains_row, "", $table_txt);
$table_txt = preg_replace($go_row, "", $table_txt);
//------------------------------------
// Clease unwanted ORF link from table
//------------------------------------
$orf_link = "/(.*?)<\/a><\/b>/";
$table_txt = preg_replace($orf_link, "$1", $table_txt);
//print "----------------------------\n";
//print "old: ".$table->asXML()."\n\n\n";
//print "----------------------------\n";
//print "Fixed: $table_txt\n";
//print "----------------------------\n";
//------------------------------------
// If this feature has already been associated with this analysis, do not reinsert
// Otherwise, Insert into analysisfeature table
//------------------------------------
$sql = "Select analysisfeature_id as id from {analysisfeature} where feature_id = %d and analysis_id = %d";
$analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id));
if($analysisfeature){ $analysisfeature_id = $analysisfeature->id; }
if(!$analysisfeature_id){
print "inserting analysisfeature\n";
$sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
"VALUES (%d, %d)";
db_query ($sql, $feature_id, $analysis_id);
$sql = "Select analysisfeature_id from {analysisfeature} where feature_id = %d and analysis_id = %d";
$analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id));
$analysisfeature_id = $analysisfeature->id;
}
print "analysisfeature_id is $analysisfeature_id (analysis_id = $analysis_id; feature_id = $feature_id)\n";
// Get the higest rank for this feature_id in analysisfeatureprop table.
// If the value of the inserting content is not duplicate, add it to
// analysisfeaturepro with 'higest_rank + 1'
$sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
"INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".
"WHERE feature_id=%d ".
"AND analysis_id=%d ".
"AND type_id=%d ";
$afp = db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
$hi_rank = 0;
if ($afp) {
$hi_rank = $afp->max + 1;
}
//------------------------------------------------------------
// Insert interpro html tags into analysisfeatureprop table
//------------------------------------------------------------
// Before inserting, make sure it's not a duplicate
$sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
$result = db_query($sql, $analysisfeature_id, $type_id);
$duplicate = 0;
while ($afp_value = db_fetch_object($result)) {
if ($table_txt == $afp_value->value) {
$duplicate = 1;
}
}
if (!$duplicate) {
$sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
"VALUES (%d, %d, '%s', %d)";
db_query($sql, $analysisfeature_id, $type_id, $table_txt, $hi_rank);
fwrite($log, " (Insert)\n"); // write to log
print "\twriting table\n";
} else {
fwrite($log, " (Skipped)\n");
print "\tskipping table - dup\n";
}
// Parse GO terms. Make sure GO database schema is installed in chado
$go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
if (!$go_db_id) {
print 'GO schema not installed in chado. GO terms are not processed.';
}
if ($go_db_id && $parsego) {
$trs = $table->children();
foreach ($trs as $tr) {
$tds = $tr->children();
foreach($tds as $td) {
$gotags = $td->children();
foreach ($gotags as $gotag) {
// Look for 'GO:accession#'
if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) {
// Find cvterm_id for the matched GO term
$sql = "SELECT cvterm_id FROM {cvterm} CVT
INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
WHERE DBX.accession = '%s' AND DBX.db_id = %d";
$goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
//-------------------------------------------
// Insert GO terms into feature_cvterm table
//-------------------------------------------
// Default pub_id = 1 (NULL) was used
$sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
VALUES (%d, %d, 1)";
db_query($sql, $feature_id, $goterm_id);
//------------------------------------------------
// Insert GO terms into analysisfeatureprop table
//------------------------------------------------
$sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
"VALUES (%d, %d, '%s', 0)";
db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
}
}
}
}
}
}
}
}
tripal_db_set_active ($previous_db); // Use drupal database
print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
fwrite($log, "\n");
fclose($log);
return;
}
/**
*
*/
function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
$parsego, $query_re, $query_type, $query_uniquename, $job_id)
{
// clear out the anslysisfeature table for this analysis before getting started
tripal_core_chado_delete('analysisfeature',array('analysis_id' => $analysis_id));
// If user input a file (e.g. blast.xml)
if (is_file($interproxmlfile)) {
tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile,
$parsego, $query_re, $query_type, $query_uniquename, $job_id);
}
else {
$dir_handle = @opendir($interproxmlfile) or die("Unable to open $interproxmlfile");
$pattern = sql_regcase($interproxmlfile . "/*.xml");
$total_files = count(glob($pattern));
print "$total_files file(s) to be parsed.\n";
$interval = intval($total_files * 0.01);
if($interval == 0){
$interval = 1;
}
$no_file = 0;
// Parsing all files in the directory
while ($file = readdir($dir_handle)) {
if(preg_match("/^.*\.xml/i",$file)){
tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, "$interproxmlfile/$file",
$parsego, $query_re, $query_type, $query_uniquename, $job_id,0);
// Set job status
if ($no_file % $interval == 0) {
$percentage = (int) (($no_file / $total_files) * 100);
tripal_job_set_progress($job_id, $percentage);
print $percentage."% ";
}
}
$no_file ++;
}
}
print "Done.";
}
/**
*
*/
function tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile,
$parsego, $query_re, $query_type, $query_uniquename, $job_id,$uptate_status = 1)
{
// Parsing started
print "Parsing File:".$interproxmlfile." ...\n";
// Get cvterm_id for 'analysis_interpro_xmloutput_hits' which is required
// for inserting into the analysisfeatureprop table
$previous_db = db_set_active('chado'); // use chado database
$sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
" INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
"WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
" AND CV.name = 'tripal'";
$type_id = db_result(db_query($sql));
// Load the XML file
$xml = simplexml_load_file($interproxmlfile);
// If there is an EBI header then we need to skip that
// and set our proteins array to be the second element of the array. This
// occurs if results were generated with the online InterProScan tool.
// if the XML starts in with the results then this happens when InterProScan
// is used command-line and we can just use the object as is
if(preg_match('/^EBIInterProScanResults/',$xml->getname())){
$children = $xml->children();
$header = $children[0];
$proteins = $children[1];
}
// if the XML starts with the tag
elseif(preg_match('/^interpro_matches/',$xml->getname())) {
$proteins = $xml;
}
else {
print "ERROR: cannot parse XML file format is not recognized\n";
return;
}
// Count the number of entires to be processed
$no_iterations = 0;
foreach($proteins as $protein) {
$no_iterations ++;
}
print " Found results for $no_iterations sequences\n";
$interval = intval($no_iterations * 0.01);
if($interval == 0){
$interval = 1;
}
$idx_iterations = 0;
// get the DB id for the GO database
$parsego = tripal_analysis_get_property($analysis_id,'analysis_interpro_parsego');
$go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
if ($parsego and !$go_db_id) {
print 'GO schema not installed in chado. GO terms are not processed.';;
}
// Processed each protein
foreach ($proteins as $protein) {
// Set job status
$idx_iterations ++;
if ($idx_iterations % $interval == 0 and $update_status) {
$percentage = (int) ($idx_iterations / $no_iterations * 100);
db_set_active($previous_db);
tripal_job_set_progress($job_id, $percentage);
$previous_db = db_set_active('chado');
print $percentage."% ";
}
// match the protein id with the feature name
$feature_id = 0;
$attr = $protein->attributes();
$seqname =$attr ['id'];
// is the sequence name a generic name (i.e. 'Sequence_1') then the
// blast results do not contain the original sequence names. The only
// option we have is to use the filename. This will work in the case of
// Blast2GO which stores the XML for each sequence in a file with the
// the filename the name of the sequence
if(preg_match('/Sequence_\d+/',$seqname)){
$filename = preg_replace('/^.*\/(.*).xml$/', '$1', $interproxmlfile);
print " Sequence name is not specific, using filename: $filename\n";
$seqname = $filename;
}
// Remove _ORF from the sequence name
$seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname);
// if a regular expression is provided then pick out the portion requested
if ($query_re and preg_match("/$query_re/", $seqname, $matches)) {
$feature = $matches[1];
}
// If no match by the regular expression then get everything up to the first space
else {
if (preg_match('/^(.*?)\s.*$/', $seqname, $matches)) {
$feature = $matches[1];
}
// if no match up to the first space then just use the entire string
else {
$feature = $seqname;
}
}
if(!$feature and $query_re){
print "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n";
continue;
}
// now find the feature in chado
$select = array();
if($query_uniquename){
$select['uniquename'] = $feature;
} else {
$select['name'] = $feature;
}
if($query_type){
$select['type_id'] = array(
'cv_id' => array(
'name' => 'sequence'
),
'name' => $query_type,
);
}
$feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select);
if(count($feature_arr) > 1){
print "Ambiguous: '$feature' matches more than one feature and is being skipped.\n";
continue;
}
if(count($feature_arr) == 0){
print "Failed: cannot find a matching feature for '$feature' in the database.\n";
continue;
}
$feature_id = $feature_arr[0]->feature_id;
// Successfully matched. print 'Succeeded'. Add analysis_id and
// feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
if ($feature_id) {
print " Adding InterPro results for feature '$seqname' ($feature_id)\n";
// Insert into analysisfeature table only if it doesn't already exist
$values = array('feature_id' => $feature_id, 'analysis_id' => $analysis_id);
$analysisfeature = tripal_core_chado_select('analysisfeature',array('*'),$values);
if(sizeof($analysisfeature) == 0){
$analysisfeature = tripal_core_chado_insert('analysisfeature',$values);
$analysisfeature_id = $analysisfeature['analysisfeature_id'];
} else {
$analysisfeature_id = $analysisfeature[0]->analysisfeature_id;
}
// Insert interpro xml results into analysisfeatureprop table
// Check to see if we have an existing entry
$sql = "SELECT analysisfeatureprop_id,rank
FROM {analysisfeatureprop}
WHERE analysisfeature_id = %d AND type_id = %d
ORDER BY rank DESC";
$result = db_fetch_object(db_query($sql, $analysisfeature_id, $type_id));
$rank = 0;
if($result){
$afp_id = $result->analysisfeatureprop_id;
$rank = $result->rank + 1;
}
$sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
"VALUES (%d, %d, '%s', %d)";
db_query($sql, $analysisfeature_id, $type_id, $protein->asXML(), $rank);
// parse the XML for each protein if GO terms are requested
if($parsego and $go_db_id){
$protein = tripal_analysis_interpro_get_result_object($protein->asXML(),$feature_id);
$goterms = $protein['goterms'];
// cycle through the GO terms and add them to the database
foreach($goterms as $goterm){
// seperate the 'GO:' from the term
if (preg_match("/^.*?GO:(\d+).*$/", $goterm, $matches)) {
// Find cvterm_id for the matched GO term
$sql = "SELECT cvterm_id FROM {cvterm} CVT
INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
WHERE DBX.accession = '%s' AND DBX.db_id = %d";
$goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
// Insert GO terms into feature_cvterm table
// Default pub_id = 1 (NULL) was used
$values = array('feature_id' => $feature_id, 'cvterm_id' => $goterm_id, 'pub_id' => 1);
$feature_cvterm = tripal_core_chado_select('feature_cvterm',array('*'),$values);
if(sizeof($feature_cvterm) == 0){
$feature_cvterm = tripal_core_chado_insert('feature_cvterm',$values);
}
// Insert GO terms into analysisfeatureprop table
$values = array('analysisfeature_id' => $analysisfeature_id,
'type_id' => $goterm_id,
'rank' => 0);
$analysisfeatureprop = tripal_core_chado_select('analysisfeatureprop',array('*'),$values);
if(sizeof($analysisfeatureprop) == 0){
$values['value'] = $matches[1];
$analysisfeatureprop = tripal_core_chado_insert('analysisfeatureprop',$values);
}
} // end if preg_match
} // end for each goterm
} // end if($parsego and $go_db_id)
} // end if($feature_id)
} // end foreach ($proteins as $protein)
db_set_active ($previous_db); // Use drupal database
return;
}
/********************************************************************************
*
*/
function tripal_analysis_interpro_get_result_object($interpro_xml,$feature_id){
// Load the XML into an object
$xmlObj = simplexml_load_string($interpro_xml);
// iterate through each interpro results for this protein
$results = array();
$terms = array();
$protein = array();
$iprterms = array();
$goterms = array();
$term_count = 0;
$match_count = 0;
// get the properties of this result
$attr = $xmlObj->attributes();
$protein['orf_id'] = (string) $attr["id"];
$protein['orf_length'] = (string) $attr["length"];
$protein['orf_crc64'] = (string) $attr["crc64"];
foreach($xmlObj->children() as $intepro){
// get the interpro term for this match
$attr = $intepro->attributes();
$terms[$term_count]['ipr_id'] = (string) $attr["id"];
$terms[$term_count]['ipr_name'] = (string) $attr["name"];
$terms[$term_count]['ipr_type'] = (string) $attr["type"];
$iprterms[] = array($terms[$term_count]['ipr_id'],$terms[$term_count]['ipr_name']);
// iterate through the elements of the interpro result
$matches[$term_count]['matches'] = array();
$match_count = 0;
foreach($intepro->children() as $level1){
$element_name = $level1->getName();
if($element_name == 'match'){
// get the match name for this match
$attr = $level1->attributes();
$terms[$term_count]['matches'][$match_count]['match_id'] = (string) $attr["id"];
$terms[$term_count]['matches'][$match_count]['match_name'] = (string) $attr["name"];
$terms[$term_count]['matches'][$match_count]['match_dbname'] = (string) $attr["dbname"];
// get the location information for this match
$loc_count = 0;
foreach($level1->children() as $level2){
$element_name = $level2->getName();
if($element_name == 'location'){
$attr = $level2->attributes();
$terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_start'] = (string) $attr["start"];
$terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_end'] = (string) $attr["end"];
$terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_score'] = (string) $attr["score"];
$terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_status'] = (string) $attr["status"];
$terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_evidence'] = (string) $attr["evidence"];
$loc_count++;
}
}
$match_count++;
}
if($element_name == 'classification'){
$attr = $level1->attributes();
if($attr['class_type'] == 'GO'){
$terms[$term_count]['matches'][$match_count]['go_terms'][] = (string) $attr['id'];
$goterms[] = (string) $attr['id'];
}
}
}
$term_count++;
}
$results['terms'] = $terms;
$results['orf'] = $protein;
$results['iprterms'] = $iprterms;
$results['goterms'] = $goterms;
return $results;
}
|