123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516 |
- <?php
/**
 * Parses an InterProScan HTML report and stores each result table in Chado.
 *
 * For every <table> in the report that does not contain "No hits reported",
 * the first anchor text is reduced to a sequence name and matched against
 * feature.uniquename. When exactly one feature matches, the cleaned-up table
 * markup is stored as an analysisfeatureprop (unless an identical value is
 * already stored) and, optionally, the GO accessions found in the table are
 * linked to the feature.
 *
 * Progress and failures are appended to a log file under
 * <files>/tripal/tripal_analysis_interpro/load_<basename>.log.
 *
 * @param $analysis_id
 *   The analysis_id the results belong to.
 * @param $interprofile
 *   Path to the InterProScan HTML output file.
 * @param $parsego
 *   When TRUE (and a 'GO' db record exists) GO terms found in the tables are
 *   added to feature_cvterm and analysisfeatureprop.
 * @param $job_id
 *   The Tripal job id used for progress reporting.
 */
function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $parsego, $job_id) {

  // The log file is named after the basename of the input file.
  $filename = preg_replace("/.*\/(.*)/", "$1", $interprofile);
  $logfile = file_directory_path() . "/tripal/tripal_analysis_interpro/load_$filename.log";
  $log = fopen($logfile, 'a');

  print "Parsing File:".$interprofile." ...\n";
  fwrite($log, date("D M j G:i:s Y").". Loading $interprofile\n");

  // Look up the cvterm used as the property type for stored HTML tables.
  $previous_db = tripal_db_set_active('chado');
  $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
         "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
         "WHERE CVT.name = 'analysis_interpro_output_hit' ".
         "AND CV.name = 'tripal'";
  $type_id = db_result(db_query($sql));
  print "cvterm_id for analysis_interpro_output_iteration_hits is $type_id\n";

  // The GO database id does not change while loading, so look it up once
  // instead of once per table.
  $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
  if (!$go_db_id) {
    print 'GO schema not installed in chado. GO terms are not processed.';
  }

  // Round-trip the HTML through DOM so that SimpleXML receives well-formed XML.
  $dom = new DOMDocument();
  $interproput = FALSE;
  if ($dom->loadHTMLFile($interprofile)) {
    $interproput = simplexml_load_string($dom->saveXML());
  }
  // Compare against FALSE explicitly: an empty SimpleXMLElement is also falsy.
  if ($interproput === FALSE) {
    fwrite($log, "Failed: unable to parse $interprofile as HTML\n\n");
    fclose($log);
    tripal_db_set_active($previous_db);
    print "Unable to parse $interprofile as HTML. Nothing was loaded.\n";
    return;
  }

  // Children of the first child of the root element (the tables of the report).
  $tables = $interproput->children()->children();

  // Count the tables so that progress can be reported.
  $no_iterations = 0;
  foreach ($tables as $tmp) {
    if ($tmp->getName() == 'table') {
      $no_iterations ++;
    }
  }
  print "$no_iterations html tables to be processed.\n";
  // Report roughly every 1%. Never let the interval drop to 0, otherwise the
  // modulo below divides by zero for reports with fewer than 100 tables.
  $interval = max(1, intval($no_iterations * 0.01));
  $idx_iterations = 0;

  // Rows that carry no information are stripped from the stored table.
  $empty_rows = array(
    "/<tr><td valign=\"top\"><b>Parent<\/b><\/td>\s*<td valign=\"top\">\s*no.*?parent<\/td>\s*<\/tr>/",
    "/<tr><td valign=\"top\"><b>Children<\/b><\/td>\s*<td valign=\"top\">\s*no.*?children<\/td>\s*<\/tr>/",
    "/<tr><td valign=\"top\"><b>Found.*?in<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/",
    "/<tr><td valign=\"top\"><b>Contains<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/",
    "/<tr><td valign=\"top\"><b>GO.*?terms<\/b><\/td>\s*<td valign=\"top\">\s*none<\/td>\s*<\/tr>/",
  );
  // The ORF link points back to the iprscan server; keep only its text.
  $orf_link = "/<b><a href=\"\/iprscan\/wget.*?\">(.*?)<\/a><\/b>/";

  foreach ($tables as $table) {

    if ($table->getName() != 'table' || preg_match('/No hits reported/', $table->asXML())) {
      continue;
    }

    // Update the job progress (the Drupal database must be active for that).
    $idx_iterations ++;
    if ($idx_iterations % $interval == 0) {
      $percentage = (int) ($idx_iterations / $no_iterations * 100);
      tripal_db_set_active($previous_db);
      tripal_job_set_progress($job_id, $percentage);
      $previous_db = tripal_db_set_active('chado');
      print $percentage."% ";
    }

    // Find the feature this table belongs to using the anchors found in the
    // first cell of the table.
    $firsttd = $table->children()->children()->children();
    $feature_id = 0;
    foreach ($firsttd as $b) {
      foreach ($b->children() as $a) {
        if ($a->getName() != 'a') {
          continue;
        }

        // Strip the "_<digit>_..." suffix from the anchor text.
        $seqname = preg_replace('/^(.+?)_\d_.+/', "$1", (string) $a);
        print "seqname is $seqname\n";

        $sql = "SELECT count(feature_id) FROM {feature} ".
               "WHERE uniquename = '%s' ";
        $no_features = db_result(db_query($sql, $seqname));

        if ($no_features == 1) {
          $sql = "SELECT feature_id FROM {feature} ".
                 "WHERE uniquename = '%s' ";
          $feature_id = db_result(db_query($sql, $seqname));
          print "\tfeature id is $feature_id\n";
        }
        elseif ($no_features > 1) {
          fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
        }
        else {
          fwrite($log, "Failed: ".$seqname."\n");
        }
      }
    }

    if (!$feature_id) {
      continue;
    }

    // Remove the uninformative rows and the ORF link from the table markup.
    $table_txt = $table->asXML();
    foreach ($empty_rows as $row_re) {
      $table_txt = preg_replace($row_re, "", $table_txt);
    }
    $table_txt = preg_replace($orf_link, "$1", $table_txt);

    // Get the analysisfeature record, creating it when it does not exist yet.
    // The id is derived from this table's lookup only so that a value left
    // over from a previous table can never be reused.
    $sql = "Select analysisfeature_id as id from {analysisfeature} where feature_id = %d and analysis_id = %d";
    $analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id));
    $analysisfeature_id = $analysisfeature ? $analysisfeature->id : 0;
    if (!$analysisfeature_id) {
      print "inserting analysisfeature\n";
      $insert = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
                "VALUES (%d, %d)";
      db_query($insert, $feature_id, $analysis_id);
      // Re-run the aliased select; the alias is required for ->id to exist.
      $analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id));
      $analysisfeature_id = $analysisfeature ? $analysisfeature->id : 0;
    }
    if (!$analysisfeature_id) {
      fwrite($log, "Failed: could not create analysisfeature for feature id $feature_id\n");
      continue;
    }
    print "analysisfeature_id is $analysisfeature_id (analysis_id = $analysis_id; feature_id = $feature_id)\n";

    // Next rank to use. The aggregate column is read as 'max'.
    $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
           "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".
           "WHERE feature_id=%d ".
           "AND analysis_id=%d ".
           "AND type_id=%d ";
    $afp = db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
    $hi_rank = 0;
    if ($afp) {
      $hi_rank = $afp->max + 1;
    }

    // Only store the table when an identical value is not stored already.
    $sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
    $result = db_query($sql, $analysisfeature_id, $type_id);
    $duplicate = 0;
    while ($afp_value = db_fetch_object($result)) {
      if ($table_txt == $afp_value->value) {
        $duplicate = 1;
        break;
      }
    }
    if (!$duplicate) {
      $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
             "VALUES (%d, %d, '%s', %d)";
      db_query($sql, $analysisfeature_id, $type_id, $table_txt, $hi_rank);
      fwrite($log, " (Insert)\n");
      print "\twriting table\n";
    }
    else {
      fwrite($log, " (Skipped)\n");
      print "\tskipping table - dup\n";
    }

    // Link the GO terms mentioned in the table to the feature.
    if ($go_db_id && $parsego) {
      foreach ($table->children() as $tr) {
        foreach ($tr->children() as $td) {
          foreach ($td->children() as $gotag) {
            if (!preg_match("/^.*?GO:(\d+).*$/", (string) $gotag, $matches)) {
              continue;
            }

            $sql = "SELECT cvterm_id FROM {cvterm} CVT
                    INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
                    WHERE DBX.accession = '%s' AND DBX.db_id = %d";
            $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
            // Without a matching cvterm the inserts below would reference
            // cvterm_id 0, so skip unknown accessions.
            if (!$goterm_id) {
              fwrite($log, "Failed: GO:".$matches[1]." was not found and is not linked to feature id $feature_id\n");
              continue;
            }

            $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
                    VALUES (%d, %d, 1)";
            db_query($sql, $feature_id, $goterm_id);

            $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
                   "VALUES (%d, %d, '%s', 0)";
            db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
          }
        }
      }
    }
  }

  tripal_db_set_active($previous_db);
  print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";

  fwrite($log, "\n");
  fclose($log);
  return;
}
/**
 * Parses an InterProScan XML report and stores each protein result in Chado.
 *
 * All analysisfeature records of the analysis are removed first, then every
 * child element of the XML root is matched to a feature (by name or
 * uniquename, optionally restricted to a sequence type). The element's XML is
 * stored as an analysisfeatureprop and, optionally, its GO terms are linked
 * to the feature.
 *
 * Progress and failures are appended to a log file under
 * <files>/tripal/tripal_analysis_interpro/load_<basename>.log.
 *
 * @param $analysis_id
 *   The analysis_id the results belong to.
 * @param $interproxmlfile
 *   Path to the InterProScan XML output file.
 * @param $parsego
 *   Requested GO parsing flag. NOTE(review): the value is replaced below by
 *   the 'analysis_interpro_parsego' analysis property - confirm that is the
 *   intended source of truth.
 * @param $query_re
 *   Optional regular expression (without delimiters) whose first capture
 *   group extracts the feature name from the protein id.
 * @param $query_type
 *   Optional term name from the 'sequence' cv used to restrict the feature
 *   lookup.
 * @param $query_uniquename
 *   When TRUE the feature is matched on uniquename, otherwise on name.
 * @param $job_id
 *   The Tripal job id used for progress reporting.
 */
function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
  $parsego, $query_re, $query_type, $query_uniquename, $job_id)
{

  // The log file is named after the basename of the input file.
  $filename = preg_replace("/.*\/(.*)/", "$1", $interproxmlfile);
  $logfile = file_directory_path() . "/tripal/tripal_analysis_interpro/load_$filename.log";
  $log = fopen($logfile, 'a');

  print "Parsing File:".$interproxmlfile." ...\n";
  fwrite($log, date("D M j G:i:s Y").". Loading $interproxmlfile\n");

  // Parse the file before touching the database so that an unreadable file
  // does not wipe the results that are already stored for this analysis.
  $interproput = simplexml_load_file($interproxmlfile);
  if ($interproput === FALSE) {
    fwrite($log, "Failed: unable to parse $interproxmlfile as XML\n\n");
    fclose($log);
    print "Unable to parse $interproxmlfile as XML. Nothing was loaded.\n";
    return;
  }

  // Remove the results of a previous load of this analysis.
  tripal_core_chado_delete('analysisfeature', array('analysis_id' => $analysis_id));

  // Look up the cvterm used as the property type for stored XML results.
  // NOTE(review): the HTML loader in this file switches databases with
  // tripal_db_set_active() - confirm whether this function should as well.
  $previous_db = db_set_active('chado');
  $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
         "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
         "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
         "AND CV.name = 'tripal'";
  $type_id = db_result(db_query($sql));

  $proteins = $interproput->children();

  // Count the proteins so that progress can be reported.
  $no_iterations = 0;
  foreach ($proteins as $tmp) {
    $no_iterations ++;
  }
  print "$no_iterations proteins to be processed.\n";
  // Report roughly every 1%. Never let the interval drop to 0, otherwise the
  // modulo below divides by zero for files with fewer than 100 proteins.
  $interval = max(1, intval($no_iterations * 0.01));
  $idx_iterations = 0;

  $parsego = tripal_analysis_get_property($analysis_id, 'analysis_interpro_parsego');
  $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
  if ($parsego and !$go_db_id) {
    print 'GO schema not installed in chado. GO terms are not processed.';
  }

  foreach ($proteins as $protein) {

    // Update the job progress (the Drupal database must be active for that).
    $idx_iterations ++;
    if ($idx_iterations % $interval == 0) {
      $percentage = (int) ($idx_iterations / $no_iterations * 100);
      db_set_active($previous_db);
      tripal_job_set_progress($job_id, $percentage);
      $previous_db = db_set_active('chado');
      print $percentage."% ";
    }

    // The protein id is the sequence name, possibly followed by an ORF suffix.
    $attr = $protein->attributes();
    $seqname = (string) $attr['id'];
    $seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname);

    // Extract the feature name: the user supplied expression wins, otherwise
    // everything up to the first whitespace, otherwise the whole id.
    if ($query_re and preg_match("/$query_re/", $seqname, $matches)) {
      $feature = isset($matches[1]) ? $matches[1] : '';
    }
    elseif (preg_match('/^(.*?)\s.*$/', $seqname, $matches)) {
      $feature = $matches[1];
    }
    else {
      $feature = $seqname;
    }
    if (!$feature and $query_re) {
      fwrite($log, "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n");
      continue;
    }

    // Build the lookup criteria for the feature.
    $select = array();
    if ($query_uniquename) {
      $select['uniquename'] = $feature;
    }
    else {
      $select['name'] = $feature;
    }
    if ($query_type) {
      $select['type_id'] = array(
        'cv_id' => array(
          'name' => 'sequence'
        ),
        'name' => $query_type,
      );
    }
    $feature_arr = tripal_core_chado_select('feature', array('feature_id'), $select);
    // empty() also covers a failed select that did not return a result list.
    if (empty($feature_arr)) {
      fwrite($log, "Failed: '$feature' cannot find a matching feature in the database.\n");
      continue;
    }
    if (count($feature_arr) > 1) {
      fwrite($log, "Ambiguous: '$feature' matches more than one feature and is being skipped.\n");
      continue;
    }
    $feature_id = $feature_arr[0]->feature_id;
    if (!$feature_id) {
      continue;
    }

    print "$idx_iterations Adding InterPro results for feature '$seqname' ($feature_id)\n";
    fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);

    // Create the analysisfeature record and fetch its id.
    $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
           "VALUES (%d, %d)";
    db_query($sql, $feature_id, $analysis_id);

    $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
    $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
    if (!$analysisfeature_id) {
      fwrite($log, " (Failed to create analysisfeature)\n");
      continue;
    }

    // Store the XML using the next free rank for this property type.
    $sql = "SELECT analysisfeatureprop_id,rank
            FROM {analysisfeatureprop}
            WHERE analysisfeature_id = %d AND type_id = %d
            ORDER BY rank DESC";
    $result = db_fetch_object(db_query($sql, $analysisfeature_id, $type_id));
    $rank = 0;
    if ($result) {
      $rank = $result->rank + 1;
    }
    $protein_xml = $protein->asXML();
    $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
           "VALUES (%d, %d, '%s', %d)";
    db_query($sql, $analysisfeature_id, $type_id, $protein_xml, $rank);
    fwrite($log, " (Insert)\n");

    if (!($parsego and $go_db_id)) {
      continue;
    }

    // Link the GO terms of the result to the feature. The parsed result is
    // kept in its own variable so the loop variable is not overwritten.
    $parsed = tripal_analysis_interpro_get_result_object($protein_xml, $feature_id);
    foreach ($parsed['goterms'] as $goterm) {
      if (!preg_match("/^.*?GO:(\d+).*$/", $goterm, $matches)) {
        continue;
      }

      $sql = "SELECT cvterm_id FROM {cvterm} CVT
              INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
              WHERE DBX.accession = '%s' AND DBX.db_id = %d";
      $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
      // Without a matching cvterm the inserts below would reference
      // cvterm_id 0, so skip unknown accessions.
      if (!$goterm_id) {
        fwrite($log, "Failed: GO:".$matches[1]." was not found and is not linked to feature id $feature_id\n");
        continue;
      }

      $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
              VALUES (%d, %d, 1)";
      db_query($sql, $feature_id, $goterm_id);

      $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
             "VALUES (%d, %d, '%s', 0)";
      db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
    }
  }

  db_set_active($previous_db);
  print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
  fwrite($log, "\n");
  fclose($log);
  return;
}
/**
 * Converts the XML of one InterProScan protein result into a nested array.
 *
 * @param $interpro_xml
 *   The XML of a single protein element as a string.
 * @param $feature_id
 *   The feature the result belongs to. Not used by this function; kept so
 *   existing callers keep working.
 *
 * @return
 *   An array with the keys:
 *   - 'orf': 'orf_id', 'orf_length' and 'orf_crc64' of the protein element.
 *   - 'terms': one entry per child element with 'ipr_id', 'ipr_name',
 *     'ipr_type' and 'matches'. Each match holds 'match_id', 'match_name',
 *     'match_dbname' and, when present, a 'locations' list. GO
 *     classifications are stored under 'go_terms' at the current match index.
 *   - 'iprterms': list of array(ipr_id, ipr_name) pairs.
 *   - 'goterms': flat list of the ids of all GO classifications.
 *   When the XML cannot be parsed the same structure is returned with empty
 *   values.
 */
function tripal_analysis_interpro_get_result_object($interpro_xml,$feature_id){

  $terms = array();
  $iprterms = array();
  $goterms = array();
  $protein = array(
    'orf_id' => '',
    'orf_length' => '',
    'orf_crc64' => '',
  );

  $xmlObj = simplexml_load_string($interpro_xml);
  // Compare against FALSE explicitly: an empty SimpleXMLElement is also falsy.
  if ($xmlObj === FALSE) {
    return array(
      'terms' => $terms,
      'orf' => $protein,
      'iprterms' => $iprterms,
      'goterms' => $goterms,
    );
  }

  // Attributes of the protein (ORF) itself.
  $attr = $xmlObj->attributes();
  $protein['orf_id'] = (string) $attr["id"];
  $protein['orf_length'] = (string) $attr["length"];
  $protein['orf_crc64'] = (string) $attr["crc64"];

  $term_count = 0;
  foreach ($xmlObj->children() as $intepro) {

    // Attributes of the InterPro term.
    $attr = $intepro->attributes();
    $terms[$term_count]['ipr_id'] = (string) $attr["id"];
    $terms[$term_count]['ipr_name'] = (string) $attr["name"];
    $terms[$term_count]['ipr_type'] = (string) $attr["type"];
    // Always provide the 'matches' key, even for terms without matches.
    $terms[$term_count]['matches'] = array();
    $iprterms[] = array($terms[$term_count]['ipr_id'], $terms[$term_count]['ipr_name']);

    $match_count = 0;
    foreach ($intepro->children() as $level1) {
      // Kept in its own variable so the location loop below cannot change
      // which branch is taken for this element.
      $level1_name = $level1->getName();

      if ($level1_name == 'match') {
        $attr = $level1->attributes();
        $terms[$term_count]['matches'][$match_count]['match_id'] = (string) $attr["id"];
        $terms[$term_count]['matches'][$match_count]['match_name'] = (string) $attr["name"];
        $terms[$term_count]['matches'][$match_count]['match_dbname'] = (string) $attr["dbname"];

        // Locations of the match on the protein.
        $loc_count = 0;
        foreach ($level1->children() as $level2) {
          if ($level2->getName() != 'location') {
            continue;
          }
          $attr = $level2->attributes();
          $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_start'] = (string) $attr["start"];
          $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_end'] = (string) $attr["end"];
          $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_score'] = (string) $attr["score"];
          $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_status'] = (string) $attr["status"];
          $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_evidence'] = (string) $attr["evidence"];
          $loc_count++;
        }
        $match_count++;
      }
      elseif ($level1_name == 'classification') {
        $attr = $level1->attributes();
        if ((string) $attr['class_type'] == 'GO') {
          $terms[$term_count]['matches'][$match_count]['go_terms'][] = (string) $attr['id'];
          $goterms[] = (string) $attr['id'];
        }
      }
    }
    $term_count++;
  }

  $results = array();
  $results['terms'] = $terms;
  $results['orf'] = $protein;
  $results['iprterms'] = $iprterms;
  $results['goterms'] = $goterms;
  return $results;
}
|