loadHTMLFile($interprofile); $xml = $dom->saveXML(); $interproput = simplexml_load_string($xml); // Get html tables for parsing $tables = $interproput->children()->children(); // Count the number of tables to be processed $no_iterations = 0; foreach($tables as $tmp) { if ($tmp->getName() == 'table') { $no_iterations ++; } } print "$no_iterations html tables to be processed.\n"; $interval = intval($no_iterations * 0.01); $idx_iterations = 0; // Processed the tables foreach ($tables as $table) { //if (preg_match('/No hits reported/', $table->asXML()) ) { //print "skipping this table b/c no hits are reported\n"; //} // make sure we are looking at a table and its not an empty table if ($table->getName() == 'table' && !preg_match('/No hits reported/', $table->asXML()) ) { $idx_iterations ++; if ($idx_iterations % $interval == 0) { $percentage = (int) ($idx_iterations / $no_iterations * 100); tripal_db_set_active($previous_db); tripal_job_set_progress($job_id, $percentage); $previous_db = tripal_db_set_active('chado'); print $percentage."% "; } // Set job status // Get the first row and match its name with the feature name $firsttd = $table->children()->children()->children(); $feature_id = 0; foreach($firsttd as $b) { foreach($b->children() as $a) { if ($a->getName() == 'a') { // Remove _ORF from the sequence name $seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a); print "seqname is $seqname\n"; // Find out how many features match this uniquename $sql = "SELECT count(feature_id) FROM {feature} ". "WHERE uniquename = '%s' "; $no_features = db_result(db_query($sql, $seqname)); // If there is only one match, get the feature_id if ($no_features == 1) { $sql = "SELECT feature_id FROM {feature} ". "WHERE uniquename = '%s' "; $feature_id = db_result(db_query($sql, $seqname)); print "\tfeature id is $feature_id\n"; // If the uniquename matches more than one features then skip and print 'Ambiguous' } else if ($no_features > 1) { fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n"); continue; // If the uniquename did not match, skip and print 'Failed' } else { fwrite($log, "Failed: ".$seqname."\n"); } } } } // Successfully matched. print 'Succeeded'. Add analysis_id and // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop if ($feature_id) { //------------------------------------ // Clease unwanted rows from the table //------------------------------------ $parent_row = "/Parent<\/b><\/td>\s*\s*no.*?parent<\/td>\s*<\/tr>/"; $children_row = "/Children<\/b><\/td>\s*\s*no.*?children<\/td>\s*<\/tr>/"; $found_row = "/Found.*?in<\/b><\/td>\s*\s*no.*?entries<\/td>\s*<\/tr>/"; $contains_row = "/Contains<\/b><\/td>\s*\s*no.*?entries<\/td>\s*<\/tr>/"; $go_row = "/GO.*?terms<\/b><\/td>\s*\s*none<\/td>\s*<\/tr>/"; $table_txt = $table->asXML(); $table_txt = preg_replace($parent_row, "", $table_txt); $table_txt = preg_replace($children_row, "", $table_txt); $table_txt = preg_replace($found_row, "", $table_txt); $table_txt = preg_replace($contains_row, "", $table_txt); $table_txt = preg_replace($go_row, "", $table_txt); //------------------------------------ // Clease unwanted ORF link from table //------------------------------------ $orf_link = "/(.*?)<\/a><\/b>/"; $table_txt = preg_replace($orf_link, "$1", $table_txt); //print "----------------------------\n"; //print "old: ".$table->asXML()."\n\n\n"; //print "----------------------------\n"; //print "Fixed: $table_txt\n"; //print "----------------------------\n"; //------------------------------------ // If this feature has already been associated with this analysis, do not reinsert // Otherwise, Insert into analysisfeature table //------------------------------------ $sql = "Select analysisfeature_id as id from {analysisfeature} where feature_id = %d and analysis_id = %d"; $analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id)); if($analysisfeature){ $analysisfeature_id = $analysisfeature->id; } if(!$analysisfeature_id){ print "inserting analysisfeature\n"; $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ". "VALUES (%d, %d)"; db_query ($sql, $feature_id, $analysis_id); $sql = "Select analysisfeature_id from {analysisfeature} where feature_id = %d and analysis_id = %d"; $analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id)); $analysisfeature_id = $analysisfeature->id; } print "analysisfeature_id is $analysisfeature_id (analysis_id = $analysis_id; feature_id = $feature_id)\n"; // Get the higest rank for this feature_id in analysisfeatureprop table. // If the value of the inserting content is not duplicate, add it to // analysisfeaturepro with 'higest_rank + 1' $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ". "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ". "WHERE feature_id=%d ". "AND analysis_id=%d ". "AND type_id=%d "; $afp = db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id)); $hi_rank = 0; if ($afp) { $hi_rank = $afp->max + 1; } //------------------------------------------------------------ // Insert interpro html tags into analysisfeatureprop table //------------------------------------------------------------ // Before inserting, make sure it's not a duplicate $sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d"; $result = db_query($sql, $analysisfeature_id, $type_id); $duplicate = 0; while ($afp_value = db_fetch_object($result)) { if ($table_txt == $afp_value->value) { $duplicate = 1; } } if (!$duplicate) { $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)". "VALUES (%d, %d, '%s', %d)"; db_query($sql, $analysisfeature_id, $type_id, $table_txt, $hi_rank); fwrite($log, " (Insert)\n"); // write to log print "\twriting table\n"; } else { fwrite($log, " (Skipped)\n"); print "\tskipping table - dup\n"; } // Parse GO terms. Make sure GO database schema is installed in chado $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'")); if (!$go_db_id) { print 'GO schema not installed in chado. GO terms are not processed.'; } if ($go_db_id && $parsego) { $trs = $table->children(); foreach ($trs as $tr) { $tds = $tr->children(); foreach($tds as $td) { $gotags = $td->children(); foreach ($gotags as $gotag) { // Look for 'GO:accession#' if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) { // Find cvterm_id for the matched GO term $sql = "SELECT cvterm_id FROM {cvterm} CVT INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id WHERE DBX.accession = '%s' AND DBX.db_id = %d"; $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id)); //------------------------------------------- // Insert GO terms into feature_cvterm table //------------------------------------------- // Default pub_id = 1 (NULL) was used $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id) VALUES (%d, %d, 1)"; db_query($sql, $feature_id, $goterm_id); //------------------------------------------------ // Insert GO terms into analysisfeatureprop table //------------------------------------------------ $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ". "VALUES (%d, %d, '%s', 0)"; db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]); } } } } } } } } tripal_db_set_active ($previous_db); // Use drupal database print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n"; fwrite($log, "\n"); fclose($log); return; } /** * */ function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile, $parsego, $query_re, $query_type, $query_uniquename, $job_id) { // clear out the anslysisfeature table for this analysis before getting started tripal_core_chado_delete('analysisfeature',array('analysis_id' => $analysis_id)); // If user input a file (e.g. blast.xml) if (is_file($interproxmlfile)) { tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile, $parsego, $query_re, $query_type, $query_uniquename, $job_id); } else { $dir_handle = @opendir($interproxmlfile) or die("Unable to open $interproxmlfile"); $pattern = sql_regcase($interproxmlfile . "/*.xml"); $total_files = count(glob($pattern)); print "$total_files file(s) to be parsed.\n"; $interval = intval($total_files * 0.01); if($interval == 0){ $interval = 1; } $no_file = 0; // Parsing all files in the directory while ($file = readdir($dir_handle)) { if(preg_match("/^.*\.xml/i",$file)){ tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, "$interproxmlfile/$file", $parsego, $query_re, $query_type, $query_uniquename, $job_id,0); // Set job status if ($no_file % $interval == 0) { $percentage = (int) (($no_file / $total_files) * 100); tripal_job_set_progress($job_id, $percentage); print $percentage."% "; } } $no_file ++; } } print "Done."; } /** * */ function tripal_analysis_interpro_parseSingleXMLFile ($analysis_id, $interproxmlfile, $parsego, $query_re, $query_type, $query_uniquename, $job_id,$uptate_status = 1) { // Parsing started print "Parsing File:".$interproxmlfile." ...\n"; // Get cvterm_id for 'analysis_interpro_xmloutput_hits' which is required // for inserting into the analysisfeatureprop table $previous_db = db_set_active('chado'); // use chado database $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ". " INNER JOIN cv ON cv.cv_id = CVT.cv_id ". "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ". " AND CV.name = 'tripal'"; $type_id = db_result(db_query($sql)); // Load the XML file $xml = simplexml_load_file($interproxmlfile); // If there is an EBI header then we need to skip that // and set our proteins array to be the second element of the array. This // occurs if results were generated with the online InterProScan tool. // if the XML starts in with the results then this happens when InterProScan // is used command-line and we can just use the object as is if(preg_match('/^EBIInterProScanResults/',$xml->getname())){ $children = $xml->children(); $header = $children[0]; $proteins = $children[1]; } // if the XML starts with the tag elseif(preg_match('/^interpro_matches/',$xml->getname())) { $proteins = $xml; } else { print "ERROR: cannot parse XML file format is not recognized\n"; return; } // Count the number of entires to be processed $no_iterations = 0; foreach($proteins as $protein) { $no_iterations ++; } print " Found results for $no_iterations sequences\n"; $interval = intval($no_iterations * 0.01); if($interval == 0){ $interval = 1; } $idx_iterations = 0; // get the DB id for the GO database $parsego = tripal_analysis_get_property($analysis_id,'analysis_interpro_parsego'); $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'")); if ($parsego and !$go_db_id) { print 'GO schema not installed in chado. GO terms are not processed.';; } // Processed each protein foreach ($proteins as $protein) { // Set job status $idx_iterations ++; if ($idx_iterations % $interval == 0 and $update_status) { $percentage = (int) ($idx_iterations / $no_iterations * 100); db_set_active($previous_db); tripal_job_set_progress($job_id, $percentage); $previous_db = db_set_active('chado'); print $percentage."% "; } // match the protein id with the feature name $feature_id = 0; $attr = $protein->attributes(); $seqname =$attr ['id']; // is the sequence name a generic name (i.e. 'Sequence_1') then the // blast results do not contain the original sequence names. The only // option we have is to use the filename. This will work in the case of // Blast2GO which stores the XML for each sequence in a file with the // the filename the name of the sequence if(preg_match('/Sequence_\d+/',$seqname)){ $filename = preg_replace('/^.*\/(.*).xml$/', '$1', $interproxmlfile); print " Sequence name is not specific, using filename: $filename\n"; $seqname = $filename; } // Remove _ORF from the sequence name $seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname); // if a regular expression is provided then pick out the portion requested if ($query_re and preg_match("/$query_re/", $seqname, $matches)) { $feature = $matches[1]; } // If no match by the regular expression then get everything up to the first space else { if (preg_match('/^(.*?)\s.*$/', $seqname, $matches)) { $feature = $matches[1]; } // if no match up to the first space then just use the entire string else { $feature = $seqname; } } if(!$feature and $query_re){ print "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n"; continue; } // now find the feature in chado $select = array(); if($query_uniquename){ $select['uniquename'] = $feature; } else { $select['name'] = $feature; } if($query_type){ $select['type_id'] = array( 'cv_id' => array( 'name' => 'sequence' ), 'name' => $query_type, ); } $feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select); if(count($feature_arr) > 1){ print "Ambiguous: '$feature' matches more than one feature and is being skipped.\n"; continue; } if(count($feature_arr) == 0){ print "Failed: cannot find a matching feature for '$feature' in the database.\n"; continue; } $feature_id = $feature_arr[0]->feature_id; // Successfully matched. print 'Succeeded'. Add analysis_id and // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop if ($feature_id) { print " Adding InterPro results for feature '$seqname' ($feature_id)\n"; // Insert into analysisfeature table only if it doesn't already exist $values = array('feature_id' => $feature_id, 'analysis_id' => $analysis_id); $analysisfeature = tripal_core_chado_select('analysisfeature',array('*'),$values); if(sizeof($analysisfeature) == 0){ $analysisfeature = tripal_core_chado_insert('analysisfeature',$values); $analysisfeature_id = $analysisfeature['analysisfeature_id']; } else { $analysisfeature_id = $analysisfeature[0]->analysisfeature_id; } // Insert interpro xml results into analysisfeatureprop table // Check to see if we have an existing entry $sql = "SELECT analysisfeatureprop_id,rank FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d ORDER BY rank DESC"; $result = db_fetch_object(db_query($sql, $analysisfeature_id, $type_id)); $rank = 0; if($result){ $afp_id = $result->analysisfeatureprop_id; $rank = $result->rank + 1; } $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)". "VALUES (%d, %d, '%s', %d)"; db_query($sql, $analysisfeature_id, $type_id, $protein->asXML(), $rank); // parse the XML for each protein if GO terms are requested if($parsego and $go_db_id){ $protein = tripal_analysis_interpro_get_result_object($protein->asXML(),$feature_id); $goterms = $protein['goterms']; // cycle through the GO terms and add them to the database foreach($goterms as $goterm){ // seperate the 'GO:' from the term if (preg_match("/^.*?GO:(\d+).*$/", $goterm, $matches)) { // Find cvterm_id for the matched GO term $sql = "SELECT cvterm_id FROM {cvterm} CVT INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id WHERE DBX.accession = '%s' AND DBX.db_id = %d"; $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id)); // Insert GO terms into feature_cvterm table // Default pub_id = 1 (NULL) was used $values = array('feature_id' => $feature_id, 'cvterm_id' => $goterm_id, 'pub_id' => 1); $feature_cvterm = tripal_core_chado_select('feature_cvterm',array('*'),$values); if(sizeof($feature_cvterm) == 0){ $feature_cvterm = tripal_core_chado_insert('feature_cvterm',$values); } // Insert GO terms into analysisfeatureprop table $values = array('analysisfeature_id' => $analysisfeature_id, 'type_id' => $goterm_id, 'rank' => 0); $analysisfeatureprop = tripal_core_chado_select('analysisfeatureprop',array('*'),$values); if(sizeof($analysisfeatureprop) == 0){ $values['value'] = $matches[1]; $analysisfeatureprop = tripal_core_chado_insert('analysisfeatureprop',$values); } } // end if preg_match } // end for each goterm } // end if($parsego and $go_db_id) } // end if($feature_id) } // end foreach ($proteins as $protein) db_set_active ($previous_db); // Use drupal database return; } /******************************************************************************** * */ function tripal_analysis_interpro_get_result_object($interpro_xml,$feature_id){ // Load the XML into an object $xmlObj = simplexml_load_string($interpro_xml); // iterate through each interpro results for this protein $results = array(); $terms = array(); $protein = array(); $iprterms = array(); $goterms = array(); $term_count = 0; $match_count = 0; // get the properties of this result $attr = $xmlObj->attributes(); $protein['orf_id'] = (string) $attr["id"]; $protein['orf_length'] = (string) $attr["length"]; $protein['orf_crc64'] = (string) $attr["crc64"]; foreach($xmlObj->children() as $intepro){ // get the interpro term for this match $attr = $intepro->attributes(); $terms[$term_count]['ipr_id'] = (string) $attr["id"]; $terms[$term_count]['ipr_name'] = (string) $attr["name"]; $terms[$term_count]['ipr_type'] = (string) $attr["type"]; $iprterms[] = array($terms[$term_count]['ipr_id'],$terms[$term_count]['ipr_name']); // iterate through the elements of the interpro result $matches[$term_count]['matches'] = array(); $match_count = 0; foreach($intepro->children() as $level1){ $element_name = $level1->getName(); if($element_name == 'match'){ // get the match name for this match $attr = $level1->attributes(); $terms[$term_count]['matches'][$match_count]['match_id'] = (string) $attr["id"]; $terms[$term_count]['matches'][$match_count]['match_name'] = (string) $attr["name"]; $terms[$term_count]['matches'][$match_count]['match_dbname'] = (string) $attr["dbname"]; // get the location information for this match $loc_count = 0; foreach($level1->children() as $level2){ $element_name = $level2->getName(); if($element_name == 'location'){ $attr = $level2->attributes(); $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_start'] = (string) $attr["start"]; $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_end'] = (string) $attr["end"]; $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_score'] = (string) $attr["score"]; $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_status'] = (string) $attr["status"]; $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_evidence'] = (string) $attr["evidence"]; $loc_count++; } } $match_count++; } if($element_name == 'classification'){ $attr = $level1->attributes(); if($attr['class_type'] == 'GO'){ $terms[$term_count]['matches'][$match_count]['go_terms'][] = (string) $attr['id']; $goterms[] = (string) $attr['id']; } } } $term_count++; } $results['terms'] = $terms; $results['orf'] = $protein; $results['iprterms'] = $iprterms; $results['goterms'] = $goterms; return $results; }