parseInterpro.inc 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. <?php
  2. /*******************************************************************************
  3. * Parse Interpro HTML Output file into analysisfeatureprop table
  4. */
  5. function tripal_analysis_interpro_parseHTMLFile ($analysis_id, $interprofile, $parsego, $job_id) {
  6. // Prepare log
  7. $filename = preg_replace("/.*\/(.*)/", "$1", $interprofile);
  8. $logfile = file_directory_path() . "/tripal/tripal_analysis_interpro/load_$filename.log";
  9. $log = fopen($logfile, 'a'); // append parsing results to log file
  10. // Parsing started
  11. print "Parsing File:".$interprofile." ...\n";
  12. fwrite($log, date("D M j G:i:s Y").". Loading $interprofile\n");
  13. // Get cvterm_id for 'analysis_interpro_output_iteration_hits' which is required
  14. // for inserting into the analysisfeatureprop table
  15. $previous_db = tripal_db_set_active('chado'); // use chado database
  16. $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
  17. "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
  18. "WHERE CVT.name = 'analysis_interpro_output_hit' ".
  19. "AND CV.name = 'tripal'";
  20. $type_id = db_result(db_query($sql));
  21. print "cvterm_id for analysis_interpro_output_iteration_hits is $type_id\n";
  22. // Load the HTML file and convert it into XML for loading
  23. $dom = new domDocument;
  24. $dom->loadHTMLFile($interprofile);
  25. $xml = $dom->saveXML();
  26. $interproput = simplexml_load_string($xml);
  27. // Get html tables for parsing
  28. $tables = $interproput->children()->children();
  29. // Count the number of tables to be processed
  30. $no_iterations = 0;
  31. foreach($tables as $tmp) {
  32. if ($tmp->getName() == 'table') {
  33. $no_iterations ++;
  34. }
  35. }
  36. print "$no_iterations html tables to be processed.\n";
  37. $interval = intval($no_iterations * 0.01);
  38. $idx_iterations = 0;
  39. // Processed the tables
  40. foreach ($tables as $table) {
  41. //if (preg_match('/No hits reported/', $table->asXML()) ) {
  42. //print "skipping this table b/c no hits are reported\n";
  43. //}
  44. // make sure we are looking at a table and its not an empty table
  45. if ($table->getName() == 'table' && !preg_match('/No hits reported/', $table->asXML()) ) {
  46. $idx_iterations ++;
  47. if ($idx_iterations % $interval == 0) {
  48. $percentage = (int) ($idx_iterations / $no_iterations * 100);
  49. tripal_db_set_active($previous_db);
  50. tripal_job_set_progress($job_id, $percentage);
  51. $previous_db = tripal_db_set_active('chado');
  52. print $percentage."% ";
  53. }
  54. // Set job status
  55. // Get the first row and match its name with the feature name
  56. $firsttd = $table->children()->children()->children();
  57. $feature_id = 0;
  58. foreach($firsttd as $b) {
  59. foreach($b->children() as $a) {
  60. if ($a->getName() == 'a') {
  61. // Remove _ORF from the sequence name
  62. $seqname = preg_replace('/^(.+?)_\d_.+/', "$1", $a);
  63. print "seqname is $seqname\n";
  64. // Find out how many features match this uniquename
  65. $sql = "SELECT count(feature_id) FROM {feature} ".
  66. "WHERE uniquename = '%s' ";
  67. $no_features = db_result(db_query($sql, $seqname));
  68. // If there is only one match, get the feature_id
  69. if ($no_features == 1) {
  70. $sql = "SELECT feature_id FROM {feature} ".
  71. "WHERE uniquename = '%s' ";
  72. $feature_id = db_result(db_query($sql, $seqname));
  73. print "\tfeature id is $feature_id\n";
  74. // If the uniquename matches more than one features then skip and print 'Ambiguous'
  75. } else if ($no_features > 1) {
  76. fwrite($log, "Ambiguous: ".$seqname." matches more than one feature and is not processed.\n");
  77. continue;
  78. // If the uniquename did not match, skip and print 'Failed'
  79. } else {
  80. fwrite($log, "Failed: ".$seqname."\n");
  81. }
  82. }
  83. }
  84. }
  85. // Successfully matched. print 'Succeeded'. Add analysis_id and
  86. // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
  87. if ($feature_id) {
  88. //------------------------------------
  89. // Clease unwanted rows from the table
  90. //------------------------------------
  91. $parent_row = "/<tr><td valign=\"top\"><b>Parent<\/b><\/td>\s*<td valign=\"top\">\s*no.*?parent<\/td>\s*<\/tr>/";
  92. $children_row = "/<tr><td valign=\"top\"><b>Children<\/b><\/td>\s*<td valign=\"top\">\s*no.*?children<\/td>\s*<\/tr>/";
  93. $found_row = "/<tr><td valign=\"top\"><b>Found.*?in<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/";
  94. $contains_row = "/<tr><td valign=\"top\"><b>Contains<\/b><\/td>\s*<td valign=\"top\">\s*no.*?entries<\/td>\s*<\/tr>/";
  95. $go_row = "/<tr><td valign=\"top\"><b>GO.*?terms<\/b><\/td>\s*<td valign=\"top\">\s*none<\/td>\s*<\/tr>/";
  96. $table_txt = $table->asXML();
  97. $table_txt = preg_replace($parent_row, "", $table_txt);
  98. $table_txt = preg_replace($children_row, "", $table_txt);
  99. $table_txt = preg_replace($found_row, "", $table_txt);
  100. $table_txt = preg_replace($contains_row, "", $table_txt);
  101. $table_txt = preg_replace($go_row, "", $table_txt);
  102. //------------------------------------
  103. // Clease unwanted ORF link from table
  104. //------------------------------------
  105. $orf_link = "/<b><a href=\"\/iprscan\/wget.*?\">(.*?)<\/a><\/b>/";
  106. $table_txt = preg_replace($orf_link, "$1", $table_txt);
  107. //print "----------------------------\n";
  108. //print "old: ".$table->asXML()."\n\n\n";
  109. //print "----------------------------\n";
  110. //print "Fixed: $table_txt\n";
  111. //print "----------------------------\n";
  112. //------------------------------------
  113. // If this feature has already been associated with this analysis, do not reinsert
  114. // Otherwise, Insert into analysisfeature table
  115. //------------------------------------
  116. $sql = "Select analysisfeature_id as id from {analysisfeature} where feature_id = %d and analysis_id = %d";
  117. $analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id));
  118. if($analysisfeature){ $analysisfeature_id = $analysisfeature->id; }
  119. if(!$analysisfeature_id){
  120. print "inserting analysisfeature\n";
  121. $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
  122. "VALUES (%d, %d)";
  123. db_query ($sql, $feature_id, $analysis_id);
  124. $sql = "Select analysisfeature_id from {analysisfeature} where feature_id = %d and analysis_id = %d";
  125. $analysisfeature = db_fetch_object(db_query($sql, $feature_id, $analysis_id));
  126. $analysisfeature_id = $analysisfeature->id;
  127. }
  128. print "analysisfeature_id is $analysisfeature_id (analysis_id = $analysis_id; feature_id = $feature_id)\n";
  129. // Get the higest rank for this feature_id in analysisfeatureprop table.
  130. // If the value of the inserting content is not duplicate, add it to
  131. // analysisfeaturepro with 'higest_rank + 1'
  132. $sql = "SELECT MAX(rank) FROM {analysisfeatureprop} AFP ".
  133. "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".
  134. "WHERE feature_id=%d ".
  135. "AND analysis_id=%d ".
  136. "AND type_id=%d ";
  137. $afp = db_fetch_object(db_query($sql, $feature_id, $analysis_id, $type_id));
  138. $hi_rank = 0;
  139. if ($afp) {
  140. $hi_rank = $afp->max + 1;
  141. }
  142. //------------------------------------------------------------
  143. // Insert interpro html tags into analysisfeatureprop table
  144. //------------------------------------------------------------
  145. // Before inserting, make sure it's not a duplicate
  146. $sql = "SELECT value FROM {analysisfeatureprop} WHERE analysisfeature_id = %d AND type_id = %d";
  147. $result = db_query($sql, $analysisfeature_id, $type_id);
  148. $duplicate = 0;
  149. while ($afp_value = db_fetch_object($result)) {
  150. if ($table_txt == $afp_value->value) {
  151. $duplicate = 1;
  152. }
  153. }
  154. if (!$duplicate) {
  155. $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
  156. "VALUES (%d, %d, '%s', %d)";
  157. db_query($sql, $analysisfeature_id, $type_id, $table_txt, $hi_rank);
  158. fwrite($log, " (Insert)\n"); // write to log
  159. print "\twriting table\n";
  160. } else {
  161. fwrite($log, " (Skipped)\n");
  162. print "\tskipping table - dup\n";
  163. }
  164. // Parse GO terms. Make sure GO database schema is installed in chado
  165. $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
  166. if (!$go_db_id) {
  167. print 'GO schema not installed in chado. GO terms are not processed.';
  168. }
  169. if ($go_db_id && $parsego) {
  170. $trs = $table->children();
  171. foreach ($trs as $tr) {
  172. $tds = $tr->children();
  173. foreach($tds as $td) {
  174. $gotags = $td->children();
  175. foreach ($gotags as $gotag) {
  176. // Look for 'GO:accession#'
  177. if (preg_match("/^.*?GO:(\d+).*$/", $gotag, $matches)) {
  178. // Find cvterm_id for the matched GO term
  179. $sql = "SELECT cvterm_id FROM {cvterm} CVT
  180. INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
  181. WHERE DBX.accession = '%s' AND DBX.db_id = %d";
  182. $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
  183. //-------------------------------------------
  184. // Insert GO terms into feature_cvterm table
  185. //-------------------------------------------
  186. // Default pub_id = 1 (NULL) was used
  187. $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
  188. VALUES (%d, %d, 1)";
  189. db_query($sql, $feature_id, $goterm_id);
  190. //------------------------------------------------
  191. // Insert GO terms into analysisfeatureprop table
  192. //------------------------------------------------
  193. $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
  194. "VALUES (%d, %d, '%s', 0)";
  195. db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
  196. }
  197. }
  198. }
  199. }
  200. }
  201. }
  202. }
  203. }
  204. tripal_db_set_active ($previous_db); // Use drupal database
  205. print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
  206. fwrite($log, "\n");
  207. fclose($log);
  208. return;
  209. }
  210. /**
  211. *
  212. */
  213. function tripal_analysis_interpro_parseXMLFile ($analysis_id, $interproxmlfile,
  214. $parsego, $query_re, $query_type, $query_uniquename, $job_id)
  215. {
  216. // clear out the anslysisfeature table for this analysis before getting started
  217. tripal_core_chado_delete('analysisfeature',array('analysis_id' => $analysis_id));
  218. // Prepare log
  219. $filename = preg_replace("/.*\/(.*)/", "$1", $interproxmlfile);
  220. $logfile = file_directory_path() . "/tripal/tripal_analysis_interpro/load_$filename.log";
  221. $log = fopen($logfile, 'a'); // append parsing results to log file
  222. // Parsing started
  223. print "Parsing File:".$interproxmlfile." ...\n";
  224. fwrite($log, date("D M j G:i:s Y").". Loading $interproxmlfile\n");
  225. // Get cvterm_id for 'analysis_interpro_xmloutput_hits' which is required
  226. // for inserting into the analysisfeatureprop table
  227. $previous_db = db_set_active('chado'); // use chado database
  228. $sql = "SELECT CVT.cvterm_id FROM {cvterm} CVT ".
  229. "INNER JOIN cv ON cv.cv_id = CVT.cv_id ".
  230. "WHERE CVT.name = 'analysis_interpro_xmloutput_hit' ".
  231. "AND CV.name = 'tripal'";
  232. $type_id = db_result(db_query($sql));
  233. // Load the XML file
  234. $interproput = simplexml_load_file($interproxmlfile);
  235. // Get entries parsing
  236. $proteins = $interproput->children();
  237. // Count the number of entires to be processed
  238. $no_iterations = 0;
  239. foreach($proteins as $tmp) {
  240. $no_iterations ++;
  241. }
  242. print "$no_iterations proteins to be processed.\n";
  243. $interval = intval($no_iterations * 0.01);
  244. $idx_iterations = 0;
  245. // get the DB id for the GO database
  246. $parsego = tripal_analysis_get_property($analysis_id,'analysis_interpro_parsego');
  247. $go_db_id = db_result(db_query("SELECT db_id FROM {db} WHERE name='GO'"));
  248. if ($parsego and !$go_db_id) {
  249. print 'GO schema not installed in chado. GO terms are not processed.';;
  250. }
  251. // Processed each protein
  252. foreach ($proteins as $protein) {
  253. // Set job status
  254. $idx_iterations ++;
  255. if ($idx_iterations % $interval == 0) {
  256. $percentage = (int) ($idx_iterations / $no_iterations * 100);
  257. db_set_active($previous_db);
  258. tripal_job_set_progress($job_id, $percentage);
  259. $previous_db = db_set_active('chado');
  260. print $percentage."% ";
  261. }
  262. // match the protein id with the feature name
  263. $feature_id = 0;
  264. $attr = $protein->attributes();
  265. $seqname =$attr ['id'];
  266. // Remove _ORF from the sequence name
  267. $seqname = preg_replace('/^(.+)_\d+_ORF\d+.*/', '$1', $seqname);
  268. // if a regular expression is provided then pick out the portion requested
  269. if ($query_re and preg_match("/$query_re/", $seqname, $matches)) {
  270. $feature = $matches[1];
  271. }
  272. // If no match by the regular expression then get everything up to the first space
  273. else {
  274. if (preg_match('/^(.*?)\s.*$/', $seqname, $matches)) {
  275. $feature = $matches[1];
  276. }
  277. // if no match up to the first space then just use the entire string
  278. else {
  279. $feature = $seqname;
  280. }
  281. }
  282. if(!$feature and $query_re){
  283. print fwrite($log, "Failed: Cannot find feature for '$seqname' using the regular expression: $query_re\n");
  284. continue;
  285. }
  286. // now find the feature in chado
  287. $select = array();
  288. if($query_uniquename){
  289. $select['uniquename'] = $feature;
  290. } else {
  291. $select['name'] = $feature;
  292. }
  293. if($query_type){
  294. $select['type_id'] = array(
  295. 'cv_id' => array(
  296. 'name' => 'sequence'
  297. ),
  298. 'name' => $query_type,
  299. );
  300. }
  301. $feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select);
  302. if(count($feature_arr) > 1){
  303. fwrite($log, "Ambiguous: '$feature' matches more than one feature and is being skipped.\n");
  304. continue;
  305. }
  306. if(count($feature_arr) == 0){
  307. fwrite($log, "Failed: '$feature' cannot find a matching feature in the database.\n");
  308. continue;
  309. }
  310. $feature_id = $feature_arr[0]->feature_id;
  311. // Successfully matched. print 'Succeeded'. Add analysis_id and
  312. // feature_id to analysisfeature. Add the table as XML to analysisfeatureprop
  313. if ($feature_id) {
  314. print "$idx_iterations Adding InterPro results for feature '$seqname' ($feature_id)\n";
  315. // If a matched feature is found, write to log.
  316. fwrite($log, "Succeeded: ".$seqname." => feature id:".$feature_id);
  317. //------------------------------------
  318. // Insert into analysisfeature table
  319. //------------------------------------
  320. $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
  321. "VALUES (%d, %d)";
  322. db_query ($sql, $feature_id, $analysis_id);
  323. // Get the analysisfeature_id
  324. $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
  325. $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
  326. //------------------------------------------------------------
  327. // Insert interpro xml results into analysisfeatureprop table
  328. //------------------------------------------------------------
  329. // Check to see if we have an existing entry
  330. $sql = "SELECT analysisfeatureprop_id,rank
  331. FROM {analysisfeatureprop}
  332. WHERE analysisfeature_id = %d AND type_id = %d
  333. ORDER BY rank DESC";
  334. $result = db_fetch_object(db_query($sql, $analysisfeature_id, $type_id));
  335. $rank = 0;
  336. if($result){
  337. $afp_id = $result->analysisfeatureprop_id;
  338. $rank = $result->rank + 1;
  339. }
  340. $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
  341. "VALUES (%d, %d, '%s', %d)";
  342. db_query($sql, $analysisfeature_id, $type_id, $protein->asXML(), $rank);
  343. fwrite($log, " (Insert)\n"); // write to log
  344. // parse the XML for each protein if GO terms are requested
  345. if($parsego and $go_db_id){
  346. $protein = tripal_analysis_interpro_get_result_object($protein->asXML(),$feature_id);
  347. $goterms = $protein['goterms'];
  348. // cycle through the GO terms and add them to the database
  349. foreach($goterms as $goterm){
  350. // seperate the 'GO:' from the term
  351. if (preg_match("/^.*?GO:(\d+).*$/", $goterm, $matches)) {
  352. // Find cvterm_id for the matched GO term
  353. $sql = "SELECT cvterm_id FROM {cvterm} CVT
  354. INNER JOIN dbxref DBX ON CVT.dbxref_id = DBX.dbxref_id
  355. WHERE DBX.accession = '%s' AND DBX.db_id = %d";
  356. $goterm_id = db_result(db_query($sql, $matches[1], $go_db_id));
  357. // Insert GO terms into feature_cvterm table
  358. // Default pub_id = 1 (NULL) was used
  359. $sql = "INSERT INTO {feature_cvterm} (feature_id, cvterm_id, pub_id)
  360. VALUES (%d, %d, 1)";
  361. db_query($sql, $feature_id, $goterm_id);
  362. // Insert GO terms into analysisfeatureprop table
  363. $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) ".
  364. "VALUES (%d, %d, '%s', 0)";
  365. db_query($sql, $analysisfeature_id, $goterm_id, $matches[1]);
  366. } // end if preg_match
  367. } // end for each goterm
  368. } // end if($parsego and $go_db_id)
  369. } // end if($feature_id)
  370. } // end foreach ($proteins as $protein)
  371. db_set_active ($previous_db); // Use drupal database
  372. print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
  373. fwrite($log, "\n");
  374. fclose($log);
  375. return;
  376. }
  377. /********************************************************************************
  378. *
  379. */
  380. function tripal_analysis_interpro_get_result_object($interpro_xml,$feature_id){
  381. // Load the XML into an object
  382. $xmlObj = simplexml_load_string($interpro_xml);
  383. // iterate through each interpro results for this protein
  384. $results = array();
  385. $terms = array();
  386. $protein = array();
  387. $iprterms = array();
  388. $goterms = array();
  389. $term_count = 0;
  390. $match_count = 0;
  391. // get the properties of this result
  392. $attr = $xmlObj->attributes();
  393. $protein['orf_id'] = (string) $attr["id"];
  394. $protein['orf_length'] = (string) $attr["length"];
  395. $protein['orf_crc64'] = (string) $attr["crc64"];
  396. foreach($xmlObj->children() as $intepro){
  397. // get the interpro term for this match
  398. $attr = $intepro->attributes();
  399. $terms[$term_count]['ipr_id'] = (string) $attr["id"];
  400. $terms[$term_count]['ipr_name'] = (string) $attr["name"];
  401. $terms[$term_count]['ipr_type'] = (string) $attr["type"];
  402. $iprterms[] = array($terms[$term_count]['ipr_id'],$terms[$term_count]['ipr_name']);
  403. // iterate through the elements of the interpro result
  404. $matches[$term_count]['matches'] = array();
  405. $match_count = 0;
  406. foreach($intepro->children() as $level1){
  407. $element_name = $level1->getName();
  408. if($element_name == 'match'){
  409. // get the match name for this match
  410. $attr = $level1->attributes();
  411. $terms[$term_count]['matches'][$match_count]['match_id'] = (string) $attr["id"];
  412. $terms[$term_count]['matches'][$match_count]['match_name'] = (string) $attr["name"];
  413. $terms[$term_count]['matches'][$match_count]['match_dbname'] = (string) $attr["dbname"];
  414. // get the location information for this match
  415. $loc_count = 0;
  416. foreach($level1->children() as $level2){
  417. $element_name = $level2->getName();
  418. if($element_name == 'location'){
  419. $attr = $level2->attributes();
  420. $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_start'] = (string) $attr["start"];
  421. $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_end'] = (string) $attr["end"];
  422. $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_score'] = (string) $attr["score"];
  423. $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_status'] = (string) $attr["status"];
  424. $terms[$term_count]['matches'][$match_count]['locations'][$loc_count]['match_evidence'] = (string) $attr["evidence"];
  425. $loc_count++;
  426. }
  427. }
  428. $match_count++;
  429. }
  430. if($element_name == 'classification'){
  431. $attr = $level1->attributes();
  432. if($attr['class_type'] == 'GO'){
  433. $terms[$term_count]['matches'][$match_count]['go_terms'][] = (string) $attr['id'];
  434. $goterms[] = (string) $attr['id'];
  435. }
  436. }
  437. }
  438. $term_count++;
  439. }
  440. $results['terms'] = $terms;
  441. $results['orf'] = $protein;
  442. $results['iprterms'] = $iprterms;
  443. $results['goterms'] = $goterms;
  444. return $results;
  445. }