tripal_feature.api.inc 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. <?php
  2. /**
  3. * @file
  4. * Provides an application programming interface (API) for working with features
  5. */
  6. /**
  7. * @defgroup tripal_feature_api Feature API
  8. * @ingroup tripal_api
  9. * @{
  10. * Provides an application programming interface (API) for working with features
  11. * @}
  12. */
  13. /**
  14. * Performs a reverse compliment of a nucleotide sequence
  15. *
  16. * @param $sequence
  17. * The nucelotide sequence
  18. *
  19. * @return
  20. * an upper-case reverse complemented sequence
  21. *
  22. * @ingroup tripal_feature_api
  23. */
  24. function tripal_reverse_compliment_sequence($sequence) {
  25. $seq = strtoupper($sequence);
  26. $seq = strrev($seq);
  27. $seq = str_replace("A", "t", $seq);
  28. $seq = str_replace("T", "a", $seq);
  29. $seq = str_replace("G", "c", $seq);
  30. $seq = str_replace("C", "g", $seq);
  31. $seq = str_replace("Y", "r", $seq);
  32. $seq = str_replace("R", "y", $seq);
  33. $seq = str_replace("W", "w", $seq);
  34. $seq = str_replace("S", "s", $seq);
  35. $seq = str_replace("K", "m", $seq);
  36. $seq = str_replace("M", "k", $seq);
  37. $seq = str_replace("D", "h", $seq);
  38. $seq = str_replace("V", "b", $seq);
  39. $seq = str_replace("H", "d", $seq);
  40. $seq = str_replace("B", "v", $seq);
  41. return strtoupper($seq);
  42. }
  43. /**
  44. * Retrieves the sequence for a feature.
  45. *
  46. * @param $feature
  47. * An associative array describing the feature. Valid keys include:
  48. * - feature_id: The feature_id of the feature for which the sequence will be retrieved
  49. * - name: The feature name. This will appear on the FASTA definition line
  50. * - parent_id: (optional) only retrieve a sequence if 'derive_from_parent' is true
  51. * and the parent matches this ID.
  52. * - featureloc_id: (optional) only retrieve a sequence if 'derive_from_parent' is
  53. * true and the alignment is defined with this featureloc_id
  54. * @param $options
  55. * An associative array of options. Valid keys include:
  56. * - width: Indicate the number of bases to use per line. A new line will be added
  57. * after the specified number of bases on each line.
  58. * - derive_from_parent: Set to '1' if the sequence should be obtained from the parent
  59. * to which this feature is aligned.
  60. * - aggregate: Set to '1' if the sequence should only contain sub features, excluding
  61. * intro sub feature sequence. For example, set this option to obtain just
  62. * the coding sequence of an mRNA.
  63. * - output_format: The type of format. Valid formats include 'fasta_html', 'fasta_txt' and
  64. * 'raw'. The format 'fasta_txt' outputs line breaks as <br> tags and the entire
  65. * return value is in a <span> tag with a fixed-width font definition. 'fasta_txt'
  66. * outputs line breaks with windows format carriage returns (e.g. \r\n) with no other
  67. * formatting. The raw format is simply the sequence with no FASTA formatting and no
  68. * line breaks.
  69. * - num_upstream: An integer specifing the number of upstream bases to include in the output
  70. * - num_downstream: An integer specifying the number of downstream bases to include in the
  71. * output.
  72. * - sub_feature_types: Only include sub features (or child features) of the types
  73. * provided in the array
  74. * - relationship_type: If a relationship name is provided (e.g. sequence_of) then any
  75. * sequences that are in relationships of this type with matched sequences are also included
  76. * - relationship_part: If a relationship is provided in the preceeding argument then
  77. * the rel_part must be either 'object' or 'subject' to indicate which side of the
  78. * relationship the matched features belong
  79. *
  80. * @return
  81. * an array of matching sequence formated as requested.
  82. *
  83. * @ingroup tripal_feature_api
  84. */
  85. function tripal_get_sequence($feature, $options) {
  86. // default values for finding the feature
  87. $feature_id = array_key_exists('feature_id', $feature) ? $feature['feature_id'] : 0;
  88. $parent_id = array_key_exists('parent_id', $feature) ? $feature['parent_id'] : 0;
  89. $featureloc_id = array_key_exists('featureloc_id', $feature) ? $feature['featureloc_id'] : 0;
  90. $feature_name = array_key_exists('name', $feature) ? $feature['name'] : '';
  91. // default values for building the sequence
  92. $num_bases_per_line = array_key_exists('width', $options) ? $options['width'] : 50;
  93. $derive_from_parent = array_key_exists('derive_from_parent', $options) ? $options['derive_from_parent'] : 0;
  94. $aggregate = array_key_exists('aggregate', $options) ? $options['aggregate'] : 0;
  95. $output_format = array_key_exists('output_format', $options) ? $options['output_format'] : 'fasta_txt';
  96. $upstream = array_key_exists('num_upstream', $options) ? $options['num_upstream'] : 0;
  97. $downstream = array_key_exists('num_downstream', $options) ? $options['num_downstream'] : 0;
  98. $sub_features = array_key_exists('sub_feature_types', $options) ? $options['sub_feature_types'] : array();
  99. $relationship = array_key_exists('relationship_type', $options) ? $options['relationship_type'] : '';
  100. $rel_part = array_key_exists('relationship_part', $options) ? $options['relationship_part'] : '';
  101. // make sure the sub_features variable is an array
  102. if (!is_array($sub_features)) {
  103. tripal_report_error('tripal_deprecated', TRIPAL_ERROR,
  104. "'sub_features' option must be an array for function tripal_get_sequence().",
  105. array()
  106. );
  107. return array();
  108. }
  109. // if a relationship was specified then retreive and the sequences that
  110. // have the given relationship and the recurse to extract the appropriate
  111. // sequence
  112. if ($rel_part == "object" or $rel_part == "subject") {
  113. if ($rel_part == "subject") {
  114. $sql = '
  115. SELECT FO.feature_id, FO.name, FO.uniquename, CVTO.name as feature_type, O.genus, O.species
  116. FROM feature FS
  117. INNER JOIN feature_relationship FR ON FR.subject_id = FS.feature_id
  118. INNER JOIN cvterm CVTFR ON CVTFR.cvterm_id = FR.type_id
  119. INNER JOIN feature FO ON FO.feature_id = FR.object_id
  120. INNER JOIN cvterm CVTO ON CVTO.cvterm_id = FO.type_id
  121. INNER JOIN organism O ON O.organism_id = FO.organism_id
  122. WHERE
  123. FS.feature_id = :feature_id AND
  124. CVTFR.name = :relationship
  125. ';
  126. $features = chado_query($sql, array(':feature_id' => $feature_id, ':relationship' => $relationship));
  127. }
  128. if ($rel_part == "object") {
  129. $sql = '
  130. SELECT FS.feature_id, FS.name, FS.uniquename, CVTO.name as feature_type, O.genus, O.species
  131. FROM feature FO
  132. INNER JOIN feature_relationship FR ON FR.object_id = FO.feature_id
  133. INNER JOIN cvterm CVTFR ON CVTFR.cvterm_id = FR.type_id
  134. INNER JOIN feature FS ON FS.feature_id = FR.subject_id
  135. INNER JOIN cvterm CVTO ON CVTO.cvterm_id = FS.type_id
  136. INNER JOIN organism O ON O.organism_id = FS.organism_id
  137. WHERE
  138. FO.feature_id = :feature_id AND
  139. CVTFR.name = :relationship
  140. ';
  141. $features = chado_query($sql, array(':feature_id' => $feature_id, ':relationship' => $relationship));
  142. }
  143. $sequences = '';
  144. while ($feature = $features->fetchObject()) {
  145. // recurse and get the sequences for these in the relationship
  146. if ($rel_part == "subject") {
  147. $defline = "$feature_name, $relationship, $feature->uniquename $feature->feature_type ($feature->genus $feature->species)";
  148. }
  149. if ($rel_part == "object") {
  150. $defline = "$feature->uniquename $feature->feature_type ($feature->genus $feature->species), $relationship, $feature_name";
  151. }
  152. return tripal_get_sequence(
  153. array(
  154. 'feature_id' => $feature->feature_id,
  155. 'name' => $defline,
  156. 'parent_id' => $parent_id,
  157. ),
  158. array(
  159. 'width' => $num_bases_per_line,
  160. 'derive_from_pareht' => $derive_from_parent,
  161. 'aggregate' => $aggregate,
  162. 'output_format' => $output_format,
  163. 'upstream' => $upstream,
  164. 'downstream' => $downstream,
  165. 'sub_features' => $sub_features,
  166. )
  167. );
  168. }
  169. }
  170. // prepare the queries we're going to use later during the render phase
  171. // This SQL statement uses conditionals in the select clause to handle
  172. // cases cases where the alignment is in the reverse direction and when
  173. // the upstream and downstream extensions go beyond the lenght of the
  174. // parent sequence.
  175. $sql ='
  176. SELECT featureloc_id, srcname, srcfeature_id, strand, srctypename, typename,
  177. fmin, fmax, upstream, downstream, adjfmin, adjfmax,
  178. substring(residues from (adjfmin + 1) for (upstream + (fmax - fmin) + downstream)) as residues,
  179. genus, species
  180. FROM (
  181. SELECT
  182. FL.featureloc_id, OF.name srcname, FL.srcfeature_id, FL.strand,
  183. OCVT.name as srctypename, SCVT.name as typename,
  184. FL.fmin, FL.fmax, OO.genus, OO.species,
  185. CASE
  186. WHEN FL.strand >= 0 THEN
  187. CASE
  188. WHEN FL.fmin - :upstream <= 0 THEN 0
  189. ELSE FL.fmin - :upstream
  190. END
  191. WHEN FL.strand < 0 THEN
  192. CASE
  193. WHEN FL.fmin - :downstream <= 0 THEN 0
  194. ELSE FL.fmin - :downstream
  195. END
  196. END as adjfmin,
  197. CASE
  198. WHEN FL.strand >= 0 THEN
  199. CASE
  200. WHEN FL.fmax + :downstream > OF.seqlen THEN OF.seqlen
  201. ELSE FL.fmax + :downstream
  202. END
  203. WHEN FL.strand < 0 THEN
  204. CASE
  205. WHEN FL.fmax + :upstream > OF.seqlen THEN OF.seqlen
  206. ELSE FL.fmax + :upstream
  207. END
  208. END as adjfmax,
  209. CASE
  210. WHEN FL.strand >= 0 THEN
  211. CASE
  212. WHEN FL.fmin - :upstream <= 0 THEN FL.fmin
  213. ELSE :upstream
  214. END
  215. ELSE
  216. CASE
  217. WHEN FL.fmax + :upstream > OF.seqlen THEN OF.seqlen - FL.fmax
  218. ELSE :upstream
  219. END
  220. END as upstream,
  221. CASE
  222. WHEN FL.strand >= 0 THEN
  223. CASE
  224. WHEN FL.fmax + :downstream > OF.seqlen THEN OF.seqlen - FL.fmax
  225. ELSE :downstream
  226. END
  227. ELSE
  228. CASE
  229. WHEN FL.fmin - :downstream <= 0 THEN FL.fmin
  230. ELSE :downstream
  231. END
  232. END as downstream,
  233. OF.residues
  234. FROM {featureloc} FL
  235. INNER JOIN {feature} SF on FL.feature_id = SF.feature_id
  236. INNER JOIN {cvterm} SCVT on SF.type_id = SCVT.cvterm_id
  237. INNER JOIN {feature} OF on FL.srcfeature_id = OF.feature_id
  238. INNER JOIN {cvterm} OCVT on OF.type_id = OCVT.cvterm_id
  239. INNER JOIN {organism} OO on OF.organism_id = OO.organism_id
  240. WHERE SF.feature_id = :feature_id and NOT (OF.residues = \'\' or OF.residues IS NULL)) as tbl1
  241. ';
  242. // this query is meant to get all of the sub features of any given
  243. // feature (arg #1) and order them as they appear on the reference
  244. // feature (arg #2).
  245. $sfsql = '
  246. SELECT SF.feature_id, CVT.name as type_name, SF.type_id
  247. FROM {feature_relationship} FR
  248. INNER JOIN {feature} SF on SF.feature_id = FR.subject_id
  249. INNER JOIN {cvterm} CVT on CVT.cvterm_id = SF.type_id
  250. INNER JOIN {featureloc} FL on FL.feature_id = FR.subject_id
  251. INNER JOIN {feature} PF on PF.feature_id = FL.srcfeature_id
  252. WHERE FR.object_id = :feature_id and PF.feature_id = :srcfeature_id
  253. ORDER BY FL.fmin ASC
  254. ';
  255. // for counting the number of children
  256. $fsql ='
  257. SELECT count(*) as num_children
  258. FROM {feature_relationship} FR
  259. INNER JOIN {feature} SF on SF.feature_id = FR.subject_id
  260. INNER JOIN {cvterm} CVT on CVT.cvterm_id = SF.type_id
  261. INNER JOIN {featureloc} FL on FL.feature_id = FR.subject_id
  262. INNER JOIN {feature} PF on PF.feature_id = FL.srcfeature_id
  263. WHERE FR.object_id = :feature_id and PF.feature_id = :srcfeature_id
  264. ';
  265. // the array to be returned
  266. $sequences = array();
  267. // if we need to get the sequence from the parent then do so now.
  268. if ($derive_from_parent) {
  269. // execute the query to get the sequence from the parent
  270. $parents = chado_query($sql, array(':upstream' => $upstream, ':downstream' => $downstream, ':feature_id' => $feature_id));
  271. while ($parent = $parents->fetchObject()) {
  272. // if the user specified a particular parent and this one doesn't match then skip it
  273. if ($parent_id and $parent_id != $parent->srcfeature_id) {
  274. continue;
  275. }
  276. // if the user specified a particular featureloc_id and this one doesn't match then skip it
  277. if ($featureloc_id and $featureloc_id != $parent->featureloc_id) {
  278. continue;
  279. }
  280. $seq = ''; // initialize the sequence for each parent
  281. // if we are to aggregate then we will ignore the feature returned
  282. // by the query above and rebuild it using the sub features
  283. if ($aggregate) {
  284. // now get the sub features that are located on the parent.
  285. $children = chado_query($sfsql, array(':feature_id' => $feature_id, ':srcfeature_id' => $parent->srcfeature_id));
  286. $num_children = chado_query($fsql, array(':feature_id' => $feature_id, ':srcfeature_id' => $parent->srcfeature_id))->fetchObject();
  287. // iterate through the sub features and concat their sequences. They
  288. // should already be in order.
  289. $types = array();
  290. $i = 0;
  291. while ($child = $children->fetchObject()) {
  292. // if the callee has specified that only certain sub features should be
  293. // included then continue if this child is not one of those allowed
  294. // subfeatures
  295. if (count($sub_features) > 0 and !in_array($child->type_name, $sub_features)) {
  296. continue;
  297. }
  298. // keep up with the types
  299. if (!in_array($child->type_name, $types)) {
  300. $types[] = $child->type_name;
  301. }
  302. // if the first sub feature we need to include the upstream bases. first check if
  303. // the feature is in the foward direction or the reverse.
  304. if ($i == 0 and $parent->strand >= 0) { // forward direction
  305. // -------------------------- ref
  306. // ....----> ---->
  307. // up 1 2
  308. $q = chado_query($sql, array(':upstream' => $upstream, ':downstream' => 0, ':feature_id' => $child->feature_id));
  309. }
  310. elseif ($i == 0 and $parent->strand < 0) { // reverse direction
  311. // -------------------------- ref
  312. // ....<---- <----
  313. // down 1 2
  314. $q = chado_query($sql, array(':upstream' => 0, ':downstream' => $downstream, ':feature_id' => $child->feature_id));
  315. }
  316. // Next, if the last sub feature we need to include the downstream bases. first check if
  317. // the feature is in teh forward direction or the reverse
  318. if ($i == $num_children->num_children - 1 and $parent->strand >= 0) { // forward direction
  319. // -------------------------- ref
  320. // ----> ---->....
  321. // 1 2 down
  322. $q = chado_query($sql, array(':upstream' => 0, ':downstream' => $downstream, ':feature_id' => $child->feature_id));
  323. }
  324. elseif ($i == $num_children->num_children - 1 and $parent->strand < 0) { // reverse direction
  325. // -------------------------- ref
  326. // <---- <----....
  327. // 1 2 up
  328. $q = chado_query($sql, array(':upstream' => $upstream, ':downstream' => 0, ':feature_id' => $child->feature_id));
  329. }
  330. // for internal sub features we don't want upstream or downstream bases
  331. else {
  332. $q = chado_query($sql, array(':upstream' => 0, ':downstream' => 0, ':feature_id' => $child->feature_id));
  333. }
  334. while ($subseq = $q->fetchObject()) {
  335. // concatenate the sequences of all the sub features
  336. if ($subseq->srcfeature_id == $parent->srcfeature_id) {
  337. $seq .= $subseq->residues;
  338. }
  339. }
  340. $i++;
  341. }
  342. }
  343. // if this isn't an aggregate then use the parent residues
  344. else {
  345. $seq = $parent->residues;
  346. }
  347. // get the reverse compliment if feature is on the reverse strand
  348. $dir = 'forward';
  349. if ($parent->strand < 0) {
  350. $seq = tripal_feature_reverse_complement($seq);
  351. $dir = 'reverse';
  352. }
  353. // now format for display
  354. if ($output_format == 'fasta_html') {
  355. $seq = wordwrap($seq, $num_bases_per_line, "<br>", TRUE);
  356. }
  357. elseif ($output_format == 'fasta_txt') {
  358. $seq = wordwrap($seq, $num_bases_per_line, "\r\n", TRUE);
  359. }
  360. if (!$seq) {
  361. if ($output_format == 'fasta_html') {
  362. $notes .= "No sequence available.</br>";
  363. }
  364. else {
  365. $notes .= "No sequence available.\r\n";
  366. }
  367. }
  368. $notes = "Sequence derived from feature of type, '$parent->srctypename', of $parent->genus $parent->species: $parent->srcname:" . ($parent->adjfmin + 1) . ".." . $parent->adjfmax . " ($dir). ";
  369. /*
  370. if (count($types) > 0) {
  371. $notes .= "Excludes all bases but those of type(s): " . implode(', ', $types) . ". " ;
  372. }
  373. if ($parent->upstream > 0) {
  374. $notes .= "Includes " . $parent->upstream . " bases upstream. ";
  375. }
  376. if ($parent->downstream > 0) {
  377. $notes .= "Includes " . $parent->downstream . " bases downstream. ";
  378. }
  379. */
  380. $sequences[] = array(
  381. 'types' => $types,
  382. 'upstream' => $parent->upstream,
  383. 'downstream' => $parent->downstream,
  384. 'notes' => $notes,
  385. 'residues' => $seq,
  386. 'featureloc_id' => $parent->featureloc_id,
  387. );
  388. }
  389. }
  390. // if we are not getting the sequence from the parent sequence then
  391. // use what comes through from the feature record
  392. else {
  393. $sql = "SELECT * FROM {feature} F WHERE feature_id = :feature_id";
  394. $values = chado_query($sql, array(':feature_id' => $feature_id))->fetchObject();
  395. $residues = $values->residues;
  396. if ($output_format == 'fasta_html') {
  397. $residues = wordwrap($residues, $num_bases_per_line, "<br>", TRUE);
  398. }
  399. elseif ($output_format == 'fasta_txt') {
  400. $residues = wordwrap($residues, $num_bases_per_line, "\r\n", TRUE);
  401. }
  402. $sequences[] = array(
  403. 'types' => $values->type,
  404. 'upstream' => 0,
  405. 'downstream' => 0,
  406. 'notes' => '',
  407. 'residues' => $residues,
  408. );
  409. }
  410. return $sequences;
  411. }
  412. /**
  413. * Returns a fasta record for the passed in feature
  414. *
  415. * @param $feature
  416. * A single feature object containing all the fields from the chado.feature table
  417. * @param $desc
  418. * A string containing any additional details you want added to the definition line of
  419. * the fasta record.
  420. *
  421. * Note: the feature accession and name separated by a | will be added
  422. * before the description but on the same line
  423. *
  424. * @ingroup tripal_feature_api
  425. */
  426. function tripal_format_fasta_sequence($feature, $desc) {
  427. $fasta = ">" . variable_get('chado_feature_accession_prefix', 'FID') . "$feature->feature_id|$feature->name";
  428. $fasta .= " $desc\n";
  429. $fasta .= wordwrap($feature->residues, 50, "\n", TRUE);
  430. $fasta .= "\n\n";
  431. return $fasta;
  432. }
  433. /**
  434. * Returns a definition line that can be used in a FASTA file
  435. *
  436. * @param $feature
  437. * A single feature object containing all the fields from the chado.feature table
  438. * @param $featureloc
  439. * Optional: A single featureloc object generated using chado_generate_var that
  440. * contains a record from the chado.featureloc table.
  441. * @param $type
  442. * Optional: the type of sequence. By default the feature type is used.
  443. *
  444. * @return
  445. * A string of the format: uniquename|name|type|feature_id
  446. * or if an alignment: srcfeature_name:fmin..fmax[+-]; alignment of uniquename|name|type|feature_id
  447. */
  448. function tripal_get_fasta_defline($feature, $featureloc = NULL, $type = '') {
  449. if (!$type) {
  450. $type = $feature->type_id->name;
  451. }
  452. $defline = $feature->uniquename . "|" . $feature->name . "|" . $type . "|" . $feature->feature_id;
  453. if ($featureloc) {
  454. $defline = $defline . "; derived from alignment at " .tripal_get_location_string($featureloc);
  455. }
  456. return $defline;
  457. }
  458. /**
  459. * Returns a string representing a feature location in an alignment
  460. *
  461. * @param unknown $featureloc
  462. * A single featureloc object generated using chado_generate_var that
  463. * contains a record from the chado.featureloc table.
  464. */
  465. function tripal_get_location_string($featureloc) {
  466. $feature = $featureloc->feature_id;
  467. if ($featureloc->strand < 0) {
  468. $residues = tripal_feature_reverse_complement($residues);
  469. }
  470. $strand = '.';
  471. if ($featureloc->strand == 1) {
  472. $strand = '+';
  473. }
  474. elseif ($featureloc->strand == -1) {
  475. $strand = '-';
  476. }
  477. return $featureloc->srcfeature_id->name . ":" . ($featureloc->fmin + 1) . ".." . $featureloc->fmax . $strand;
  478. }