blast_ui.api.inc 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. <?php
  2. /**
  3. * @file
  4. * Contains more generally applicable functions, as well as some meant to help developers
  5. * plug in to the BLAST UI functionality.
  6. */
  /**
   * Get a specific BlastDB.
   *
   * The identifiers are checked in priority order (nid, then name, then path)
   * and only the first one that is set is used.
   *
   * @param $identifiers
   *   An array of identifiers used to determine which BLAST DB to retrieve.
   *   Supported keys:
   *   - nid: the node ID to load.
   *   - name: matched exactly against the name column of {blastdb}.
   *   - path: matched as a prefix of the path column of {blastdb}.
   *
   * @return
   *   The BLAST DB node as returned by node_load(), or FALSE if no supported
   *   identifier was supplied or no node ID could be resolved from it.
   */
  function get_blast_database($identifiers) {
    $nid = FALSE;

    if (isset($identifiers['nid'])) {
      $nid = $identifiers['nid'];
    }
    elseif (isset($identifiers['name'])) {
      $nid = db_query(
        'SELECT nid FROM {blastdb} WHERE name=:name',
        array(':name' => $identifiers['name'])
      )->fetchField();
    }
    elseif (isset($identifiers['path'])) {
      // db_like() escapes LIKE wildcards in the supplied path so that only the
      // trailing % added here acts as a wildcard (prefix match).
      $nid = db_query(
        'SELECT nid FROM {blastdb} WHERE path LIKE :path',
        array(':path' => db_like($identifiers['path']) . '%')
      )->fetchField();
    }

    // Do not hand an empty value to node_load() when nothing was resolved
    // (no identifier given, or the lookup found no record).
    if (empty($nid)) {
      return FALSE;
    }

    return node_load($nid);
  }
  /**
   * Returns a list of BLAST database options.
   *
   * @param $type
   *   The type of BLAST databases to restrict the list to. 'protein' also
   *   matches nodes using the obsolete 'p' code; any other value (normally
   *   'nucleotide') also matches nodes using the obsolete 'n' code.
   *
   * @return
   *   An array where the nid is the key and the value is the human-readable
   *   name of the option, sorted by name. A final 'Select a Dataset' entry
   *   keyed by 0 is always appended after the sorted options.
   */
  function get_blast_database_options($type) {
    // Use the Entity API to get the list of BLAST nodes to load. The
    // 'node_access' tag makes the query respect node access control, so
    // administrators can combine this module with a node access module of
    // their choice to limit access to specific BLAST databases.
    $query = new EntityFieldQuery();
    $query->entityCondition('entity_type', 'node')
      // Restrict to BLASTDB nodes.
      ->entityCondition('bundle', 'blastdb')
      // Restrict to published nodes.
      ->propertyCondition('status', 1)
      // Restrict to nodes the current user has permission to view.
      ->addTag('node_access');
    $entities = $query->execute();

    // The 'node' key is only present when at least one node matched; without
    // this guard array_keys() would be handed NULL.
    $nodes = array();
    if (!empty($entities['node'])) {
      $nodes = node_load_multiple(array_keys($entities['node']));
    }

    // Support the obsolete single-letter database types (n/p).
    $obs_type = ($type == 'protein') ? 'p' : 'n';

    $options = array();
    foreach ($nodes as $node) {
      if (!isset($node->db_dbtype)) {
        continue;
      }
      if ($node->db_dbtype == $type || $node->db_dbtype == $obs_type) {
        $options[$node->nid] = $node->db_name;
      }
    }

    asort($options);
    // Added after sorting, so the placeholder is the last element.
    $options[0] = 'Select a Dataset';

    return $options;
  }
  /**
   * Run BLAST (should be called from the command-line).
   *
   * Runs the BLAST program to produce an ASN.1 archive in the temporary
   * directory, then uses blast_formatter to convert that archive to XML,
   * tab-delimited and HTML files in the public files directory. Progress is
   * printed to standard output.
   *
   * @param $program
   *   Which BLAST program to run (ie: 'blastn', 'tblastn', 'tblastx', 'blastp',
   *   'blastx'). The 'blast_path' variable, if set, is prepended to it.
   * @param $query
   *   The full path and filename of the query FASTA file.
   * @param $database
   *   The full path and filename prefix (excluding .nhr, .nin, .nsq, etc.).
   * @param $output_filestub
   *   The filename (not including path) to give the results. Should not include
   *   file type suffix.
   * @param $options
   *   An array of additional options where the key is the name of the option
   *   used by BLAST (ie: 'num_alignments') and the value relates to this
   *   particular BLAST job (ie: 250). An empty value passes the option as a
   *   bare flag.
   * @param $job_id
   *   (optional) Not used by this function; presumably the ID of the Tripal job
   *   this run belongs to.
   *
   * @return
   *   FALSE if BLAST did not produce the ASN.1 archive. Nothing is returned
   *   otherwise; failed format conversions are only reported, not returned.
   */
  function run_BLAST_tripal_job($program, $query, $database, $output_filestub, $options, $job_id = NULL) {
    $output_file = file_directory_temp() . DIRECTORY_SEPARATOR . $output_filestub . '.blast.asn';

    $public_stub = variable_get('file_public_path', conf_path() . '/files') . DIRECTORY_SEPARATOR . $output_filestub;
    $output_file_xml = $public_stub . '.blast.xml';
    $output_file_tsv = $public_stub . '.blast.tsv';
    $output_file_html = $public_stub . '.blast.html';

    print "\nExecuting $program\n\n";
    print "Query: $query\n";
    print "Database: $database\n";
    print "Results File: $output_file\n";
    print "Options:\n";

    // Allow administrators to use an absolute path for these commands.
    // Defaults to using $PATH.
    $blast_path = variable_get('blast_path', '');
    $program = $blast_path . $program;
    $blast_formatter_command = $blast_path . 'blast_formatter';

    // Every argument is escaped individually so that file names and option
    // values cannot break out of the command line. The command names only
    // have shell metacharacters neutralised, so whitespace in them keeps
    // splitting arguments as before.
    $blast_cmd = escapeshellcmd($program)
      . ' -query ' . escapeshellarg($query)
      . ' -db ' . escapeshellarg($database)
      . ' -out ' . escapeshellarg($output_file)
      . ' -outfmt=11';
    if (!empty($options)) {
      foreach ($options as $opt => $val) {
        print "\t$opt: $val\n";
        $blast_cmd .= ' ' . escapeshellarg('-' . $opt);
        // An empty value means a bare flag; quoting it would pass an empty
        // argument to BLAST instead.
        $val = (string) $val;
        if ($val !== '') {
          $blast_cmd .= ' ' . escapeshellarg($val);
        }
      }
    }

    print "\nExecuting the following BLAST command:\n" . $blast_cmd . "\n";
    system($blast_cmd);

    // Success is judged by the presence of the archive file.
    if (!file_exists($output_file)) {
      tripal_report_error(
        'blast_ui',
        TRIPAL_ERROR,
        "BLAST did not complete successfully as is implied by the lack of output file (%file). The command run was @command",
        array('%file' => $output_file, '@command' => $blast_cmd),
        array('print' => TRUE)
      );
      return FALSE;
    }

    print "\nGenerating additional download formats...\n";

    // Each entry describes one blast_formatter conversion of the archive.
    $conversions = array(
      array(
        'label' => 'XML',
        'outfmt' => 5,
        'file' => $output_file_xml,
        'extra' => '',
        'severity' => TRIPAL_ERROR,
        'message' => "Unable to convert BLAST ASN.1 archive (%archive) to XML (%file).",
      ),
      array(
        'label' => 'Tab-delimited',
        'outfmt' => 7,
        'file' => $output_file_tsv,
        'extra' => '',
        'severity' => TRIPAL_WARNING,
        'message' => "Unable to convert BLAST ASN.1 archive (%archive) to Tabular Output (%file).",
      ),
      array(
        'label' => 'HTML (includes alignments)',
        'outfmt' => 0,
        'file' => $output_file_html,
        'extra' => ' -html',
        'severity' => TRIPAL_WARNING,
        'message' => "Unable to convert BLAST ASN.1 archive (%archive) to HTML Output (%file).",
      ),
    );

    $formatter = escapeshellcmd($blast_formatter_command);
    foreach ($conversions as $conversion) {
      print "\t" . $conversion['label'] . "\n";
      system(
        $formatter
        . ' -archive ' . escapeshellarg($output_file)
        . ' -outfmt ' . $conversion['outfmt']
        . ' -out ' . escapeshellarg($conversion['file'])
        . $conversion['extra']
      );

      // Each conversion is checked against its own output file (the HTML
      // check previously looked at the tab-delimited file).
      if (!file_exists($conversion['file'])) {
        tripal_report_error(
          'blast_ui',
          $conversion['severity'],
          $conversion['message'],
          array('%archive' => $output_file, '%file' => $conversion['file']),
          array('print' => TRUE)
        );
      }
    }

    print "\nDone!\n";
  }
  /**
   * FASTA validating parser.
   *
   * A sequence in FASTA format begins with a single-line description, followed
   * by lines of sequence data. The description line is distinguished from the
   * sequence data by a greater-than (">") symbol in the first column. The word
   * following the ">" symbol is the identifier of the sequence, and the rest of
   * the line is the description (both are optional). There should be no space
   * between the ">" and the first letter of the identifier. The sequence ends
   * if another line starting with a ">" appears which indicates the start of
   * another sequence.
   *
   * Note that the check is lenient: input that starts with a description line
   * (">" up to a line break) is always accepted, whatever follows it. Only
   * input without a leading description line has its characters checked.
   *
   * @param $type
   *   The type of sequence to be validated (ie: either nucleotide or protein).
   *   Any other type is never reported as invalid.
   * @param $sequence
   *   A string of characters to be validated.
   *
   * @return
   *   TRUE if the sequence does NOT pass the format validation stage and FALSE
   *   otherwise.
   */
  function validate_fasta_sequence($type, $sequence) {
    // Patterns matching any character that is not allowed for the given type.
    // Line breaks are always allowed.
    $invalid_character_regex = array(
      'nucleotide' => '/[^acgntuACGNTU\n\r]/',
      // Amino-acid codes accepted by BLAST (including the ambiguity codes
      // B, Z and X and selenocysteine U), plus "*" (stop) and "-" (gap).
      'protein' => '/[^ABCDEFGHIKLMNPQRSTUVWXYZabcdefghiklmnpqrstuvwxyz\*\-\n\r]/',
    );

    if (!is_string($type) || !isset($invalid_character_regex[$type])) {
      return FALSE;
    }

    // Input starting with a description line is accepted as-is.
    $fastaIdRegEx = '/^>.*(\\n|\\r)/';
    if (preg_match($fastaIdRegEx, $sequence)) {
      return FALSE;
    }

    return (bool) preg_match($invalid_character_regex[$type], $sequence);
  }
  /**
   * Retrieve the regex to capture the Link-out Accession from the Hit Def.
   *
   * A custom regex stored on the node takes precedence; otherwise one of the
   * predefined regexes is selected by the node's link-out regex type.
   *
   * @param $node
   *   The BLAST database node the hit is from. Its linkout->regex and
   *   linkout->regex_type properties are read.
   * @param $options
   *   (optional) An array of options. Currently unused.
   *
   * @return
   *   A PHP regex for use with preg_match() whose first capture group is the
   *   Link-out Accession, or NULL if the node has no custom regex and its regex
   *   type is not one of 'default', 'genbank', 'embl' or 'swissprot'.
   */
  function get_blastdb_linkout_regex($node, $options = array()) {
    // A regex set directly on the node wins over the predefined ones.
    if (!empty($node->linkout->regex)) {
      return $node->linkout->regex;
    }

    // The quantifier sits inside the capture group so that the whole accession
    // between the pipes is captured, not just its last character.
    $predefined = array(
      'default' => '/^(\S+).*/',
      'genbank' => '/^gb\|([^\|]*)\|.*/',
      'embl' => '/^embl\|([^\|]*)\|.*/',
      'swissprot' => '/^sp\|([^\|]*)\|.*/',
    );

    $regex_type = isset($node->linkout->regex_type) ? $node->linkout->regex_type : NULL;
    if (is_string($regex_type) && isset($predefined[$regex_type])) {
      return $predefined[$regex_type];
    }

    return NULL;
  }
  237. }