parse_blast_XML.inc 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924
  1. <?php
  2. /*******************************************************************************
  3. * Parse NCBI Blast results for indexing so that user can use blast results to
  4. * find corresponding features
  5. */
  6. function parse_NCBI_Blast_XML_index_version($xml_string,$db,$feature_id) {
  7. // Get the parser using db_id
  8. $sql = "SELECT * FROM {tripal_analysis_blast} WHERE db_id = %d";
  9. $parser = db_fetch_object(db_query($sql, $db->db_id));
  10. $db_name = $parser->displayname;
  11. $is_genbank = $parser->genbank_style;
  12. $regex_hit_id = $parser->regex_hit_id;
  13. $regex_hit_def = $parser->regex_hit_def;
  14. $regex_hit_accession = $parser->regex_hit_accession;
  15. // set default if regular expressions have not been specified
  16. if(!$regex_hit_id){
  17. $regex_hit_id = '/^(.*?)\s.*$/';
  18. } else {
  19. $regex_hit_id = '/'.$regex_hit_id.'/';
  20. }
  21. if(!$regex_hit_def){
  22. $regex_hit_def = '/^.*?\s(.*)$/';
  23. } else {
  24. $regex_hit_def = '/'.$regex_hit_def.'/';
  25. }
  26. if(!$regex_hit_accession){
  27. $regex_hit_accession = '/^(.*?)\s.*$/';
  28. } else {
  29. $regex_hit_accession = '/'.$regex_hit_accession.'/';
  30. }
  31. $html_out .= "<h3>$db_name</h3>";
  32. // Load the file. This XML file should be an extract
  33. // of the original XML file with only a single iteration.
  34. // An iteration is essentially all the hits for a single
  35. // query sequence.
  36. $xml_output = simplexml_load_string($xml_string);
  37. $iteration = '';
  38. // new XML file parser has added the feature name within <Iteration_query-def> tags.
  39. if ($xml_output->getName() == 'Iteration') {
  40. foreach ($xml_output->children() as $xml_tag) {
  41. if ($xml_tag->getName() == 'Iteration_query-def') {
  42. // Here we show the feature name again to check if we pull the correct data
  43. $html_out .= "Query: $xml_tag<br>";
  44. } else if ($xml_tag->getName() == 'Iteration_hits') {
  45. $iteration = $xml_tag;
  46. }
  47. }
  48. // This is for the file parsed by the old parser
  49. } else {
  50. $iteration = $xml_output;
  51. }
  52. // now run through the blast hits/hsps of this iteration
  53. // and generate the rows of the table
  54. foreach($iteration->children() as $hits){
  55. $best_evalue = 0;
  56. foreach($hits->children() as $hit){
  57. $best_evalue = 0;
  58. $element_name = $hit->getName();
  59. if($element_name == 'Hit_id'){
  60. // if parsing "name, acc, desc" from three tags (1/3)
  61. if ($is_genbank) {
  62. $hit_name = $hit;
  63. }
  64. } else if($element_name == 'Hit_def'){
  65. if($is_genbank){
  66. $description = $hit;
  67. } else {
  68. $accession = preg_replace($regex_hit_accession,"$1",$hit);
  69. $hit_name = preg_replace($regex_hit_id,"$1",$hit);
  70. $description = preg_replace($regex_hit_def,"$1",$hit);
  71. }
  72. } else if($element_name == 'Hit_accession'){
  73. // if parsing "name, acc, desc" from three tags (3/3)
  74. if ($is_genbank){
  75. $accession = $hit;
  76. }
  77. // now run through each HSP for this hit
  78. }
  79. }
  80. $html_out .= "<p>$hit_name<br>";
  81. $html_out .= "$accession<br>";
  82. $html_out .= "<b>$description</b></br>";
  83. $hsp_html_out = '';
  84. }
  85. return $html_out;
  86. }
  87. /*******************************************************************************
  88. * Parse Blast XML Output file into analysisfeatureprop table
  89. */
  90. function tripal_analysis_blast_parseXMLFile ($analysis_id, $blastdb, $blastfile,
  91. $no_parsed, $blastfile_ext, $query_re, $query_type, $query_uniquename,
  92. $is_concat,$job_id,$is_concat) {
  93. // Prepare log
  94. $filename = preg_replace("/.*\/(.*)/", "$1", $blastfile);
  95. $logfile = file_directory_path() . "/tripal/tripal_analysis_blast/load_$filename.log";
  96. $logfile = tempnam(sys_get_temp_dir(),"tripal_analysis_blast_import");
  97. $log = fopen($logfile, 'a'); // append parsing results to log file
  98. if(!$log){
  99. print "ERROR: cannot open log file: $logfile\n";
  100. exit;
  101. }
  102. // If user input a file (e.g. blast.xml)
  103. if (is_file($blastfile)) {
  104. tripal_analysis_blast_parseXML($analysis_id, $blastdb, $blastfile,
  105. $no_parsed, $blastfile_ext, $query_re, $query_type, $query_uniquename,
  106. $job_id,1,$log,$is_concat);
  107. }
  108. // Otherwise, $blastfile is a directory. Iterate through all xml files in it
  109. else {
  110. if(!$blastfile_ext){
  111. $blastfile_ext = 'xml';
  112. }
  113. $dir_handle = @opendir($blastfile) or die("Unable to open $blastfile");
  114. $pattern = sql_regcase($blastfile . "/*.$blastfile_ext");
  115. $total_files = count(glob($pattern));
  116. print "$total_files file(s) to be parsed.\n";
  117. $interval = intval($total_files * 0.01);
  118. $no_file = 0;
  119. // Parsing all files in the directory
  120. while ($file = readdir($dir_handle)) {
  121. if(preg_match("/^.*\.$blastfile_ext/i",$file)){
  122. tripal_analysis_blast_parseXML($analysis_id, $blastdb, "$blastfile/$file",
  123. $no_parsed, $blastfile_ext, $query_re, $query_type, $query_uniquename,
  124. $job_id,0,$log);
  125. // Set job status
  126. if ($no_file % $interval == 0) {
  127. $percentage = (int) (($no_file / $total_files) * 100);
  128. tripal_job_set_progress($job_id, $percentage);
  129. print $percentage."%\r";
  130. }
  131. }
  132. $no_file ++;
  133. }
  134. }
  135. print "Done.\nSuccessful and failed entries have been saved in the log file:\n $logfile\n";
  136. fwrite($log, "\n");
  137. fclose($log);
  138. return;
  139. }
  140. /********************************************************************************
  141. *
  142. */
  143. function tripal_analysis_blast_parseXML($analysis_id, $blastdb, $blastfile,
  144. $no_parsed, $blastfile_ext, $query_re, $query_type, $query_uniquename,
  145. $job_id,$set_progress,$log,$is_concat){
  146. // Parsing started
  147. print "Parsing File:".$blastfile." ...\n";
  148. fwrite($log, date("D M j G:i:s Y").". Loading $blastfile\n");
  149. if ($no_parsed == 'all') {
  150. print "Parsing all hits...\n";
  151. } else {
  152. print "Parsing top $no_parsed hits...\n";
  153. }
  154. // Get cvterm_id for 'analysis_blast_output_iteration_hits' which is required
  155. // for inserting into the analysisfeatureprop table
  156. $previous_db = tripal_db_set_active('chado'); // use chado database
  157. $sql = "SELECT CVT.cvterm_id ".
  158. "FROM {cvterm} CVT ".
  159. " INNER JOIN {cv} ON cv.cv_id = CVT.cv_id ".
  160. "WHERE CVT.name = 'analysis_blast_output_iteration_hits' ".
  161. " AND CV.name = 'tripal'";
  162. $type_id = db_result(db_query($sql));
  163. // Load the XML file.
  164. if (!is_readable($blastfile)) {
  165. exit("Could not open the XML file '$blastfile'. Check that file exists and that permissions are correct.\n");
  166. }
  167. // if the file is a set of concatenated files then we want to split it up
  168. // and run each one individually
  169. if($is_concat){
  170. // generate a temporary file name
  171. $temp = tempnam(sys_get_temp_dir(),'blast_');
  172. print "Blast XML file is concatenated. Breaking apart and parsing each individually: $temp\n";
  173. $out_fh = fopen($temp,"w");
  174. // run through the lines of the XML file
  175. $in_fh = fopen($blastfile,"r");
  176. while(!feof($in_fh)){
  177. $line = fgets($in_fh);
  178. $line = trim($line);
  179. if(!$line){
  180. continue;
  181. }
  182. fwrite($out_fh,"$line\n");
  183. // if the line begins a set of blast output XML then parse the
  184. // preceeding set.
  185. if(preg_match("/<\/BlastOutput>/",$line)){
  186. // close the temp file
  187. fclose($out_fh);
  188. // now parse this new temp file
  189. tripal_analysis_blast_parseXML($analysis_id, $blastdb, $temp,
  190. $no_parsed, $blastfile_ext, $query_re, $query_type, $query_uniquename,
  191. $job_id,$set_progress,$log,0);
  192. // reopen the file for the next set of results
  193. $out_fh = fopen($temp,"w");
  194. }
  195. }
  196. fclose($in_fh);
  197. return;
  198. }
  199. $blastoutput = simplexml_load_file($blastfile);
  200. if(!$blastoutput){
  201. exit("Could not read the XML file '$blastfile'. Check that the XML file is not corrupted.\n");
  202. }
  203. $no_iterations = 0;
  204. foreach($blastoutput->children() as $tmp) {
  205. if ($tmp->getName() == 'BlastOutput_iterations') {
  206. foreach($tmp->children() as $itr) {
  207. if ($itr->getName() == 'Iteration') {
  208. $no_iterations ++;
  209. }
  210. }
  211. }
  212. }
  213. print "$no_iterations iterations to be processed.\n";
  214. $interval = intval($no_iterations * 0.01);
  215. $idx_iterations = 0;
  216. foreach ($blastoutput->children() as $blastoutput_tags) {
  217. if ($blastoutput_tags->getName() == 'BlastOutput_iterations') {
  218. foreach($blastoutput_tags->children() as $iterations) {
  219. if ($iterations->getName() == 'Iteration') {
  220. // Set job status
  221. $idx_iterations ++;
  222. if ($set_progress and $idx_iterations % $interval == 0) {
  223. $percentage = (int) (($idx_iterations / $no_iterations) * 100);
  224. tripal_job_set_progress($job_id, $percentage);
  225. print $percentage."%\r";
  226. }
  227. // now run through the blast hits/hsps of this iteration
  228. // and generate the rows of the table
  229. $feature_id = 0;
  230. foreach($iterations->children() as $iteration_tags) {
  231. // Match chado feature uniquename with <Iteration_query-def>
  232. // and get the feature_id
  233. $featurenaem_xml = '';
  234. if($iteration_tags->getName() == 'Iteration_query-def'){
  235. // If the Iteration_query-def in the format provided by the
  236. // user's regular expression
  237. if ($query_re and preg_match("/$query_re/", $iteration_tags, $matches)) {
  238. $feature = $matches[1];
  239. }
  240. // If not in above format then pull up to the first space
  241. else {
  242. if (preg_match('/^(.*?)\s.*$/', $iteration_tags, $matches)) {
  243. $feature = $matches[1];
  244. }
  245. // if no match up to the first space then just use the entire string
  246. else {
  247. $feature = $iteration_tags;
  248. }
  249. }
  250. if(!$feature and $query_re){
  251. print "ERROR: cannot find feature for $iteration_tags using the regular expression: $query_re\n";
  252. exit;
  253. }
  254. // now find the feature in chado
  255. $select = array();
  256. if($query_uniquename){
  257. $select['uniquename'] = $feature;
  258. } else {
  259. $select['name'] = $feature;
  260. }
  261. if($query_type){
  262. $select['type_id'] = array(
  263. 'cv_id' => array(
  264. 'name' => 'sequence'
  265. ),
  266. 'name' => $query_type,
  267. );
  268. }
  269. $feature_arr = tripal_core_chado_select('feature',array('feature_id'),$select);
  270. if(count($feature_arr) > 1){
  271. fwrite($log, "Ambiguous: '$feature' matches more than one feature and is being skipped.\n");
  272. continue;
  273. }
  274. if(count($feature_arr) == 0){
  275. fwrite($log, "Failed: '$feature' cannot find a matching feature in the database.\n");
  276. continue;
  277. }
  278. $feature_id = $feature_arr[0]->feature_id;
  279. fwrite($log, "Matched: '$feature' => feature id:".$feature_id);
  280. $featurename_xml = $iteration_tags->asXML();
  281. }
  282. // Insert Iteration_hits into analysisfeatureprop and analysisfeature tables
  283. else if($iteration_tags->getName() == 'Iteration_hits'){
  284. if ($feature_id) {
  285. // Make sure this iteration doesn't exist in analysisfeatureprop. If it does, update but not insert
  286. $sql = "SELECT analysisfeatureprop_id FROM {analysisfeatureprop} AFP ".
  287. "INNER JOIN analysisfeature AF ON AF.analysisfeature_id = AFP.analysisfeature_id ".
  288. "WHERE feature_id=%d ".
  289. "AND analysis_id=%d ".
  290. "AND type_id=%d ";
  291. $result = db_query($sql, $feature_id, $analysis_id, $type_id);
  292. $analysisfeatureprop = db_fetch_object($result);
  293. $xml_content = "<Iteration>\n".$featurename_xml."\n";
  294. // parse all hits
  295. if ($no_parsed == 'all') {
  296. $xml_content .= $iteration_tags->asXML();
  297. // parse only top hits
  298. } else {
  299. $counter = 0;
  300. $xml_content .= "<Iteration_hits>\n";
  301. foreach ($iteration_tags->children() As $hit) {
  302. if ($counter < $no_parsed) {
  303. $xml_content .= $hit->asXML();
  304. } else {
  305. break;
  306. }
  307. $counter ++;
  308. }
  309. $xml_content .= "</Iteration_hits>";
  310. }
  311. $xml_content .= "\n</Iteration>";
  312. // If this Iteration_hits already exists, update it
  313. if ($analysisfeatureprop) {
  314. $sql = "UPDATE {analysisfeatureprop} ".
  315. "SET value = '%s' ".
  316. "WHERE analysisfeatureprop_id = %d ";
  317. db_query($sql, $xml_content, $analysisfeatureprop->analysisfeatureprop_id);
  318. fwrite($log, " (Update)\n"); // write to log
  319. // Otherwise, insert the Iteration_hits into analysisfeature and analysisfeatureprop tables
  320. } else {
  321. //------------------------------------------------------
  322. // Insert into analysisfeature table
  323. //------------------------------------------------------
  324. $sql = "INSERT INTO {analysisfeature} (feature_id, analysis_id) ".
  325. "VALUES (%d, %d)";
  326. db_query ($sql, $feature_id, $analysis_id);
  327. // Get the newly inserted analysisfeature_id
  328. $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE feature_id = %d AND analysis_id = %d";
  329. $analysisfeature_id = db_result(db_query($sql, $feature_id, $analysis_id));
  330. //------------------------------------------------------
  331. // Insert into analysisfeatureprop table
  332. //------------------------------------------------------
  333. $sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank)".
  334. "VALUES (%d, %d, '%s', %d)";
  335. db_query($sql, $analysisfeature_id, $type_id, $xml_content, '0');
  336. fwrite($log, " (Insert)\n"); // write to log
  337. }
  338. }
  339. }
  340. }
  341. }
  342. }
  343. }
  344. }
  345. tripal_db_set_active ($previous_db); // Use drupal database
  346. }
  347. /********************************************************************************
  348. *
  349. */
  350. function tripal_analysis_blast_get_result_object($xml_string,$db,$max,$feature_id, $analysis) {
  351. $blast_object = new stdClass();
  352. // Get the parser using db_id
  353. $sql = "SELECT * FROM {tripal_analysis_blast} WHERE db_id = %d";
  354. $parser = db_fetch_object(db_query($sql, $db->db_id));
  355. $db_name = $parser->displayname;
  356. $is_genbank = $parser->genbank_style;
  357. $regex_hit_id = $parser->regex_hit_id;
  358. $regex_hit_def = $parser->regex_hit_def;
  359. $regex_hit_accession = $parser->regex_hit_accession;
  360. // set default if regular expressions have not been specified
  361. if(!$regex_hit_id){
  362. $regex_hit_id = '/^(.*?)\s.*$/';
  363. } else {
  364. $regex_hit_id = '/'.$regex_hit_id.'/';
  365. }
  366. if(!$regex_hit_def){
  367. $regex_hit_def = '/^.*?\s(.*)$/';
  368. } else {
  369. $regex_hit_def = '/'.$regex_hit_def.'/';
  370. }
  371. if(!$regex_hit_accession){
  372. $regex_hit_accession = '/^(.*?)\s.*$/';
  373. } else {
  374. $regex_hit_accession = '/'.$regex_hit_accession.'/';
  375. }
  376. // Get analysis information
  377. $blast_object->analysis = $analysis;
  378. $db->displayname = $db_name;
  379. $blast_object->db = $db;
  380. if (!$db_name) {
  381. $blast_object->title = $analysis->name;
  382. } else {
  383. $blast_object->title = $db_name;
  384. }
  385. // Find node id for the analysis
  386. $ana_nid = db_result(db_query("SELECT nid FROM {chado_analysis} WHERE analysis_id = %d", $analysis->analysis_id));
  387. $analysis->nid = $ana_nid;
  388. // Load the file. This XML file should be an extract
  389. // of the original XML file with only a single iteration.
  390. // An iteration is essentially all the hits for a single
  391. // query sequence.
  392. $xml_output = simplexml_load_string($xml_string);
  393. $iteration = '';
  394. // new XML file parser has added the feature name within <Iteration_query-def> tags.
  395. if ($xml_output->getName() == 'Iteration') {
  396. foreach ($xml_output->children() as $xml_tag) {
  397. if ($xml_tag->getName() == 'Iteration_query-def') {
  398. // Here we show the feature name again to check if we pull the correct data
  399. $blast_object->xml_tag = $xml_tag;
  400. } else if ($xml_tag->getName() == 'Iteration_hits') {
  401. $iteration = $xml_tag;
  402. }
  403. }
  404. // This is for the file parsed by the old parser
  405. } else {
  406. $iteration = $xml_output;
  407. }
  408. $number_hits = 0;
  409. foreach($iteration->children() as $hits){
  410. $number_hits ++;
  411. }
  412. // add the links for updating blast info using Ajax
  413. $blast_object->max = $max;
  414. $blast_object->number_hits = $number_hits;
  415. $blast_object->feature_id = $feature_id;
  416. $hits_array = array();
  417. $hit_count = 0;
  418. foreach($iteration->children() as $hits){
  419. $hsp_array = array();
  420. $counter = 0;
  421. foreach($hits->children() as $hit){
  422. $best_evalue = 0;
  423. $best_identity = 0;
  424. $best_len = 0;
  425. $element_name = $hit->getName();
  426. if($element_name == 'Hit_id'){
  427. // if parsing "name, acc, desc" from three tags (1/3)
  428. if ($is_genbank) {
  429. $hit_name = $hit;
  430. }
  431. } else if($element_name == 'Hit_def'){
  432. if($is_genbank){
  433. $description = $hit;
  434. } else {
  435. $accession = preg_replace($regex_hit_accession,"$1",$hit);
  436. $hit_name = preg_replace($regex_hit_id,"$1",$hit);
  437. $description = preg_replace($regex_hit_def,"$1",$hit);
  438. }
  439. } else if($element_name == 'Hit_accession'){
  440. // if parsing "name, acc, desc" from three tags (3/3)
  441. if ($is_genbank){
  442. $accession = $hit;
  443. }
  444. // now run through each HSP for this hit
  445. } else if($element_name == 'Hit_hsps'){
  446. foreach($hit->children() as $hsp){
  447. foreach($hsp->children() as $hsp_info){
  448. $element_name = $hsp_info->getName();
  449. if($element_name == 'Hsp_num'){
  450. $hsp_num = $hsp_info;
  451. }
  452. if($element_name == 'Hsp_bit-score'){
  453. $hsp_bit_score = $hsp_info;
  454. }
  455. if($element_name == 'Hsp_score'){
  456. $hsp_score = $hsp_info;
  457. }
  458. if($element_name == 'Hsp_evalue'){
  459. $hsp_evalue = $hsp_info;
  460. // use the first evalue for this set of HSPs
  461. // as the best evalue. This get's shown as
  462. // info for the overall match.
  463. if(!$best_evalue){
  464. $best_evalue = $hsp_evalue;
  465. }
  466. }
  467. if($element_name == 'Hsp_query-from'){
  468. $hsp_query_from = $hsp_info;
  469. }
  470. if($element_name == 'Hsp_query-to'){
  471. $hsp_query_to = $hsp_info;
  472. }
  473. if($element_name == 'Hsp_hit-from'){
  474. $hsp_hit_from = $hsp_info;
  475. }
  476. if($element_name == 'Hsp_hit-to'){
  477. $hsp_hit_to = $hsp_info;
  478. }
  479. if($element_name == 'Hsp_query-frame'){
  480. $hsp_query_frame = $hsp_info;
  481. }
  482. if($element_name == 'Hsp_identity'){
  483. $hsp_identity = $hsp_info;
  484. // use the first evalue for this set of HSPs
  485. // as the best evalue. This get's shown as
  486. // info for the overall match.
  487. if(!$best_identity){
  488. $best_identity = $hsp_identity;
  489. }
  490. }
  491. if($element_name == 'Hsp_positive'){
  492. $hsp_positive = $hsp_info;
  493. }
  494. if($element_name == 'Hsp_align-len'){
  495. $hsp_align_len = $hsp_info;
  496. // use the first evalue for this set of HSPs
  497. // as the best evalue. This get's shown as
  498. // info for the overall match.
  499. if(!$best_len){
  500. $best_len = $hsp_align_len;
  501. }
  502. }
  503. if($element_name == 'Hsp_qseq'){
  504. $hsp_qseq = $hsp_info;
  505. }
  506. if($element_name == 'Hsp_hseq'){
  507. $hsp_hseq = $hsp_info;
  508. }
  509. if($element_name == 'Hsp_midline'){
  510. $hsp_midline = $hsp_info;
  511. }
  512. }
  513. $hsp_content = array();
  514. $hsp_content['hsp_num'] = $hsp_num;
  515. $hsp_content['bit_score'] = $hsp_bit_score;
  516. $hsp_content['score'] = $hsp_score;
  517. $hsp_content['evalue'] = $hsp_evalue;
  518. $hsp_content['query_frame'] = $hsp_query_frame;
  519. $hsp_content['qseq'] = $hsp_qseq;
  520. $hsp_content['midline'] = $hsp_midline;
  521. $hsp_content['hseq'] = $hsp_hseq;
  522. $hsp_content['hit_from'] = $hsp_hit_from;
  523. $hsp_content['hit_to'] = $hsp_hit_to;
  524. $hsp_content['identity'] = $hsp_identity;
  525. $hsp_content['align_len'] = $hsp_align_len;
  526. $hsp_content['positive'] = $hsp_positive;
  527. $hsp_content['query_from'] = $hsp_query_from;
  528. $hsp_content['query_to'] = $hsp_query_to;
  529. $hsp_array[$counter] = $hsp_content;
  530. $counter ++;
  531. }
  532. }
  533. }
  534. $arrowr_url = url(drupal_get_path('theme', 'tripal')."/images/arrow_r.png");
  535. $hits_array[$hit_count]['arrowr_url'] = $arrowr_url;
  536. $hits_array[$hit_count]['accession'] = $accession;
  537. $hits_array[$hit_count]['hit_name'] = $hit_name;
  538. if($accession && $db->urlprefix){
  539. $hits_array[$hit_count]['hit_url'] = "$db->urlprefix$accession";
  540. } else {
  541. // Test if this is another feature in the database
  542. $sql = "SELECT feature_id FROM {feature} WHERE uniquename = '%s'";
  543. $previous_db = db_set_active('chado');
  544. $hit_feature_id = db_result(db_query($sql, $hit_name));
  545. db_set_active($previous_db);
  546. // If it is, add link to that feature
  547. if ($hit_feature_id) {
  548. $hits_array[$hit_count]['hit_url'] = "ID$hit_feature_id";
  549. }
  550. }
  551. $hits_array[$hit_count]['best_evalue'] = $best_evalue;
  552. if (!empty($best_len)) {
  553. $percent_identity = number_format($best_identity/$best_len*100, 2);
  554. $hits_array[$hit_count]['percent_identity'] = $percent_identity;
  555. }
  556. $hits_array[$hit_count]['description'] = $description;
  557. // if there is at least one HSP
  558. if (!empty($hsp_array[0]['query_frame'])) {
  559. $hits_array[$hit_count]['hsp'] = $hsp_array;
  560. } else {
  561. $hits_array[$hit_count]['hsp'] = array();
  562. }
  563. $hit_count ++;
  564. // if we've hit the maximum number of hits then return
  565. if($max > 0 && $hit_count >= $max){
  566. break;
  567. }
  568. }
  569. $blast_object->hits_array = $hits_array;
  570. return $blast_object;
  571. }
  572. /********************************************************************************
  573. * Parse the best hit to generate the best hit homology report
  574. */
  575. function tripal_analysis_blast_parse_best_hit ($analysis_id) {
  576. // Select all features for this blast analysis, and save them to the 'featureSet' array
  577. $sql = "SELECT feature_id
  578. FROM {analysisfeature} AF
  579. WHERE analysis_id = %d";
  580. $previous_db = tripal_db_set_active('chado');
  581. $result = db_query($sql, $analysis_id);
  582. $featureSet = array ();
  583. $counter = 0;
  584. while ($feature = db_fetch_object($result)) {
  585. $featureSet [$counter] = $feature->feature_id;
  586. $counter ++;
  587. }
  588. // Get analysis information including 'Time', 'Name', and 'DB Settings'
  589. $sql = "SELECT value, name, to_char(timeexecuted, 'MM-DD-YYYY') AS time
  590. FROM {analysis} A
  591. INNER JOIN {analysisprop} AP ON A.analysis_id = AP.analysis_id
  592. WHERE A.analysis_id = %d
  593. AND type_id= (SELECT cvterm_id
  594. FROM {cvterm}
  595. WHERE name = 'analysis_blast_settings')";
  596. $analysis = db_fetch_object(db_query($sql, $analysis_id));
  597. // Parse the blast settings
  598. $blastsettings = explode("|", $analysis->value);
  599. $db_id = $blastsettings [0];
  600. // Get the xml description parser using db_id
  601. tripal_db_set_active($previous_db);
  602. $sql = "SELECT * FROM {tripal_analysis_blast} WHERE db_id = %d";
  603. $parser = db_fetch_object(db_query($sql, $db_id));
  604. $db_name = $parser->displayname;
  605. $is_genbank = $parser->genbank_style;
  606. $regex_hit_id = $parser->regex_hit_id;
  607. $regex_hit_def = $parser->regex_hit_def;
  608. $regex_hit_accession = $parser->regex_hit_accession;
  609. // set default description parser if regular expressions have not been specified
  610. if(!$regex_hit_id){
  611. $regex_hit_id = '/^(.*?)\s.*$/';
  612. } else {
  613. $regex_hit_id = '/'.$regex_hit_id.'/';
  614. }
  615. if(!$regex_hit_def){
  616. $regex_hit_def = '/^.*?\s(.*)$/';
  617. } else {
  618. $regex_hit_def = '/'.$regex_hit_def.'/';
  619. }
  620. if(!$regex_hit_accession){
  621. $regex_hit_accession = '/^(.*?)\s.*$/';
  622. } else {
  623. $regex_hit_accession = '/'.$regex_hit_accession.'/';
  624. }
  625. $interval = intval($counter * 0.01);
  626. for ($i = 0; $i < $counter; $i ++) {
  627. if ($i !=0 && $i % $interval == 0) {
  628. $percentage = (int) ($i / $counter * 100);
  629. tripal_job_set_progress($job_id, $percentage);
  630. print $percentage."%\r";
  631. }
  632. $sql = "SELECT value
  633. FROM {analysisfeatureprop} AFP
  634. INNER JOIN {analysisfeature} AF ON AFP.analysisfeature_id = AF.analysisfeature_id
  635. WHERE analysis_id = %d
  636. AND feature_id = %d
  637. AND type_id = (SELECT cvterm_id FROM cvterm WHERE name='analysis_blast_output_iteration_hits' AND cv_id = (SELECT cv_id FROM cv WHERE name='tripal'))";
  638. $previous_db = tripal_db_set_active('chado');
  639. $xml_output = simplexml_load_string(db_result(db_query($sql, $analysis_id, $featureSet[$i])));
  640. $iteration = '';
  641. // new XML file parser has added the feature name within <Iteration_query-def> tags.
  642. if ($xml_output->getName() == 'Iteration') {
  643. $query = "";
  644. foreach ($xml_output->children() as $xml_tag) {
  645. if ($xml_tag->getName() == 'Iteration_query-def') {
  646. // Here we show the feature name again to check if we pull the correct data
  647. $query = $xml_tag;
  648. } else if ($xml_tag->getName() == 'Iteration_hits') {
  649. $iteration = $xml_tag;
  650. }
  651. }
  652. // This is for the file parsed by the old parser
  653. } else {
  654. $iteration = $xml_output;
  655. }
  656. $number_hits = 0;
  657. foreach($iteration->children() as $hits){
  658. $number_hits ++;
  659. }
  660. $query = explode(" ", $query) ;
  661. $query = $query [0];
  662. if ($number_hits == 0) {
  663. continue;
  664. }
  665. // now run through the blast hits/hsps of this iteration
  666. // and generate the rows of the table
  667. foreach($iteration->children() as $hits){
  668. $hit_count++;
  669. foreach($hits->children() as $hit){
  670. $best_evalue = 0;
  671. $best_identity = 0;
  672. $best_len = 0;
  673. $element_name = $hit->getName();
  674. if($element_name == 'Hit_id'){
  675. // if parsing "name, acc, desc" from three tags (1/3)
  676. if ($is_genbank) {
  677. $hit_name = $hit;
  678. }
  679. } else if($element_name == 'Hit_def'){
  680. if($is_genbank){
  681. $description = $hit;
  682. } else {
  683. $accession = preg_replace($regex_hit_accession,"$1",$hit);
  684. $hit_name = preg_replace($regex_hit_id,"$1",$hit);
  685. $description = preg_replace($regex_hit_def,"$1",$hit);
  686. }
  687. } else if($element_name == 'Hit_accession'){
  688. // if parsing "name, acc, desc" from three tags (3/3)
  689. if ($is_genbank){
  690. $accession = $hit;
  691. }
  692. // now run through each HSP for this hit
  693. } else if($element_name == 'Hit_hsps'){
  694. foreach($hit->children() as $hsp){
  695. foreach($hsp->children() as $hsp_info){
  696. $element_name = $hsp_info->getName();
  697. if($element_name == 'Hsp_num'){
  698. $hsp_num = $hsp_info;
  699. }
  700. if($element_name == 'Hsp_bit-score'){
  701. $hsp_bit_score = $hsp_info;
  702. }
  703. if($element_name == 'Hsp_score'){
  704. $hsp_score = $hsp_info;
  705. }
  706. if($element_name == 'Hsp_evalue'){
  707. $hsp_evalue = $hsp_info;
  708. // use the first evalue for this set of HSPs
  709. // as the best evalue. This get's shown as
  710. // info for the overall match.
  711. if(!$best_evalue){
  712. $best_evalue = $hsp_evalue;
  713. }
  714. }
  715. if($element_name == 'Hsp_query-from'){
  716. $hsp_query_from = $hsp_info;
  717. }
  718. if($element_name == 'Hsp_query-to'){
  719. $hsp_query_to = $hsp_info;
  720. }
  721. if($element_name == 'Hsp_hit-from'){
  722. $hsp_hit_from = $hsp_info;
  723. }
  724. if($element_name == 'Hsp_hit-to'){
  725. $hsp_hit_to = $hsp_info;
  726. }
  727. if($element_name == 'Hsp_query-frame'){
  728. $hsp_query_frame = $hsp_info;
  729. }
  730. if($element_name == 'Hsp_identity'){
  731. $hsp_identity = $hsp_info;
  732. // use the first evalue for this set of HSPs
  733. // as the best evalue. This get's shown as
  734. // info for the overall match.
  735. if(!$best_identity){
  736. $best_identity = $hsp_identity;
  737. }
  738. }
  739. if($element_name == 'Hsp_positive'){
  740. $hsp_positive = $hsp_info;
  741. }
  742. if($element_name == 'Hsp_align-len'){
  743. $hsp_align_len = $hsp_info;
  744. // use the first evalue for this set of HSPs
  745. // as the best evalue. This get's shown as
  746. // info for the overall match.
  747. if(!$best_len){
  748. $best_len = $hsp_align_len;
  749. }
  750. }
  751. if($element_name == 'Hsp_qseq'){
  752. $hsp_qseq = $hsp_info;
  753. }
  754. if($element_name == 'Hsp_hseq'){
  755. $hsp_hseq = $hsp_info;
  756. }
  757. if($element_name == 'Hsp_midline'){
  758. $hsp_midline = $hsp_info;
  759. }
  760. }
  761. }
  762. }
  763. }
  764. // Get analysisfeature_id
  765. $sql = "SELECT analysisfeature_id FROM {analysisfeature} WHERE analysis_id = %d AND feature_id = %d";
  766. $af_id = db_result(db_query($sql, $analysis_id, $featureSet[$i]));
  767. // Get type_id
  768. $sql = "SELECT cvterm_id FROM {cvterm} WHERE name = '%s' AND cv_id = (SELECT cv_id FROM {cv} WHERE name = 'tripal')";
  769. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_query'));
  770. $sql_test ="SELECT analysisfeatureprop_id FROM {analysisfeatureprop} WHERE analysisfeature_id = $af_id AND type_id = %d";
  771. $test_afpid = db_result(db_query($sql_test, $type_id));
  772. //Insert only if this blast query not exists.
  773. if (!$test_afpid) {
  774. $afp_sql = "INSERT INTO {analysisfeatureprop} (analysisfeature_id, type_id, value, rank) VALUES (%d, %d, '%s', 0)";
  775. //$query;
  776. db_query($afp_sql, $af_id, $type_id, $query);
  777. //$hit_name;
  778. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_match'));
  779. db_query($afp_sql, $af_id, $type_id, $hit_name);
  780. //$description;
  781. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_description'));
  782. db_query($afp_sql, $af_id, $type_id, $description);
  783. //$best_evalue;
  784. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_evalue'));
  785. $e_digit = explode("e-", $best_evalue);
  786. if (count($e_digit) == 2) {
  787. $evalue_shown = number_format($e_digit [0],1);
  788. $best_evalue = $evalue_shown."e-".$e_digit[1];
  789. }
  790. db_query($afp_sql, $af_id, $type_id, $best_evalue);
  791. //$best_identity;
  792. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_identity'));
  793. $percent_identity = number_format($best_identity/$best_len*100, 1);
  794. db_query($afp_sql, $af_id, $type_id, $percent_identity);
  795. //$best_len;
  796. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_length'));
  797. db_query($afp_sql, $af_id, $type_id, $best_len);
  798. // Otherwise, update all instead
  799. } else {
  800. $afp_sql = "UPDATE {analysisfeatureprop} SET analysisfeature_id = %d, type_id = %d, value = '%s', rank = 0 WHERE analysisfeatureprop_id = %d";
  801. //$query;
  802. db_query($afp_sql, $af_id, $type_id, $query, $test_afpid);
  803. //$hit_name;
  804. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_match'));
  805. $test_afpid = db_result(db_query($sql_test, $type_id));
  806. db_query($afp_sql, $af_id, $type_id, $hit_name, $test_afpid);
  807. //$description;
  808. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_description'));
  809. $test_afpid = db_result(db_query($sql_test, $type_id));
  810. db_query($afp_sql, $af_id, $type_id, $description, $test_afpid);
  811. //$best_evalue;
  812. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_evalue'));
  813. $test_afpid = db_result(db_query($sql_test, $type_id));
  814. $e_digit = explode("e-", $best_evalue);
  815. if (count($e_digit) == 2) {
  816. $evalue_shown = number_format($e_digit [0],1);
  817. $best_evalue = $evalue_shown."e-".$e_digit[1];
  818. }
  819. db_query($afp_sql, $af_id, $type_id, $best_evalue, $test_afpid);
  820. //$best_identity;
  821. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_identity'));
  822. $test_afpid = db_result(db_query($sql_test, $type_id));
  823. $percent_identity = number_format($best_identity/$best_len*100, 1);
  824. db_query($afp_sql, $af_id, $type_id, $percent_identity, $test_afpid);
  825. //$best_len;
  826. $type_id = db_result(db_query($sql, 'analysis_blast_besthit_length'));
  827. $test_afpid = db_result(db_query($sql_test, $type_id));
  828. db_query($afp_sql, $af_id, $type_id, $best_len, $test_afpid);
  829. }
  830. tripal_db_set_active($previous_db);
  831. break;
  832. }
  833. }
  834. print "100%\n";
  835. return;
  836. }