tripal_chado.phylotree_newick.inc 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. <?php
  2. /**
  3. * This code parses the grammer for the Newick format as per the following
  4. * grammar:
  5. *
  6. * Tree --> Subtree ";" | Branch ";"
  7. * Subtree --> Leaf | Internal
  8. * Leaf --> Name
  9. * Internal --> "(" BranchSet ")" Name
  10. * BranchSet --> Branch | BranchSet "," Branch
  11. * Branch --> Subtree Length
  12. * Name --> empty | string
  13. * Length --> empty | ":" number
  14. *
  15. */
  16. /**
  17. *
  18. * @param unknown $file_name
  19. */
  20. function tripal_phylogeny_parse_newick_file($file_name) {
  21. // Initialize the bootstrap value and index
  22. global $tripal_phylogeny_bootstrap;
  23. $tripal_phylogeny_bootstrap = 1;
  24. $tripal_phylogeny_index = 1;
  25. $tree = [];
  26. $fp = fopen($file_name, 'r');
  27. if ($fp) {
  28. $tree = tripal_phylogeny_parse_newick_tree($fp);
  29. }
  30. else {
  31. // ERROR
  32. }
  33. return $tree;
  34. }
  35. /**
  36. *
  37. * @param unknown $fp
  38. * @param number $depth
  39. *
  40. * @return boolean
  41. */
  42. function tripal_phylogeny_parse_newick_tree($fp, $depth = 0) {
  43. $subtree = tripal_phylogeny_parse_newick_subtree($fp, $depth);
  44. $subtree['is_root'] = 1;
  45. // this subtree may also be a branch. A branch is a subtree with a length,
  46. // so see if there is a length
  47. $token = tripal_phylogeny_parse_newick_get_token($fp);
  48. if ($token == ";") {
  49. // we're done!
  50. return $subtree;
  51. }
  52. tripal_phylogeny_parse_newick_replace_token($fp);
  53. // Get the length.
  54. $length = tripal_phylogeny_parse_newick_length($fp, $depth);
  55. $subtree['length'] = $length;
  56. // Now if we're missing the semicolon we have a syntax error.
  57. $token = tripal_phylogeny_parse_newick_get_token($fp);
  58. if ($token != ';') {
  59. print "Syntax Error: missing trailing semicolon.\n";
  60. exit;
  61. }
  62. return $subtree;
  63. }
  64. /**
  65. *
  66. * @param unknown $fp
  67. * @param unknown $depth
  68. *
  69. * @return Ambigous|unknown
  70. */
  71. function tripal_phylogeny_parse_newick_subtree($fp, $depth) {
  72. $internal = tripal_phylogeny_parse_newick_internal($fp, $depth + 1);
  73. if (!is_array($internal)) {
  74. $leaf_node = tripal_phylogeny_parse_newick_leaf($fp, $depth);
  75. return [
  76. 'name' => $leaf_node,
  77. 'depth' => $depth,
  78. 'is_leaf' => TRUE,
  79. 'descendents' => 0,
  80. ];
  81. }
  82. else {
  83. $internal['depth'] = $depth;
  84. }
  85. return $internal;
  86. }
  87. /**
  88. *
  89. * @param unknown $fp
  90. * @param unknown $depth
  91. *
  92. * @return boolean|multitype:unknown Ambigous <Ambigous, unknown>
  93. */
  94. function tripal_phylogeny_parse_newick_branch($fp, $depth) {
  95. $subtree = tripal_phylogeny_parse_newick_subtree($fp, $depth);
  96. $length = tripal_phylogeny_parse_newick_length($fp, $depth);
  97. $subtree['length'] = $length;
  98. return $subtree;
  99. }
  100. /**
  101. *
  102. * @param unknown $fp
  103. * @param unknown $parent
  104. * @param unknown $depth
  105. */
  106. function tripal_phylogeny_parse_newick_internal($fp, $depth) {
  107. // If the next character is not an open paren then this is an internal node
  108. if (tripal_phylogeny_parse_newick_get_token($fp) != '(') {
  109. tripal_phylogeny_parse_newick_replace_token($fp);
  110. return FALSE;
  111. }
  112. $branches = tripal_phylogeny_parse_newick_branchset($fp, $depth);
  113. if (!is_array($branches)) {
  114. return FALSE;
  115. }
  116. // If we don't have a closing parent then this is a syntax error.
  117. if (tripal_phylogeny_parse_newick_get_token($fp) != ')') {
  118. tripal_phylogeny_parse_newick_replace_token($fp);
  119. return FALSE;
  120. }
  121. $internal_node = tripal_phylogeny_parse_newick_name($fp, $depth);
  122. $descendent_count = 0;
  123. for ($i = 0; $i < count($branches); $i++) {
  124. $branches[$i]['parent'] = $internal_node;
  125. $descendent_count += 1 + $branches[$i]['descendents'];
  126. }
  127. return [
  128. 'name' => $internal_node,
  129. 'depth' => $depth,
  130. 'branch_set' => $branches,
  131. 'is_internal' => TRUE,
  132. 'descendents' => $descendent_count,
  133. ];
  134. }
  135. /**
  136. *
  137. * @param unknown $fp
  138. * @param unknown $parent
  139. * @param unknown $depth
  140. */
  141. function tripal_phylogeny_parse_newick_branchset($fp, $depth) {
  142. $branches = [];
  143. $num_read = 0;
  144. $branch = tripal_phylogeny_parse_newick_branch($fp, $depth);
  145. $branches[] = $branch;
  146. // If it's not a branch then return false, a branchset will
  147. // always appear as a branch.
  148. if (!is_array($branch)) {
  149. return FALSE;
  150. }
  151. // If we have a comma as the next token then this is
  152. // a branchset and we should recurse.
  153. $token = tripal_phylogeny_parse_newick_get_token($fp);
  154. if ($token == ',') {
  155. $rbranches = tripal_phylogeny_parse_newick_branchset($fp, $depth);
  156. foreach ($rbranches as $branch) {
  157. $branches[] = $branch;
  158. }
  159. }
  160. else {
  161. tripal_phylogeny_parse_newick_replace_token($fp);
  162. }
  163. return $branches;
  164. }
  165. /**
  166. *
  167. * @param unknown $fp
  168. * @param unknown $depth
  169. *
  170. * @return Ambigous <string, boolean, unknown>
  171. */
  172. function tripal_phylogeny_parse_newick_leaf($fp, $depth) {
  173. return tripal_phylogeny_parse_newick_name($fp, $depth);
  174. }
  175. /**
  176. *
  177. * @param unknown $fp
  178. * @param unknown $depth
  179. *
  180. * @return string|boolean|Ambigous <string, unknown>
  181. */
  182. function tripal_phylogeny_parse_newick_name($fp, $depth) {
  183. global $tripal_phylogeny_bootstrap;
  184. $token = tripal_phylogeny_parse_newick_get_token($fp);
  185. // If the next token is a colon, semicolon, close paren, or comma
  186. // then the name is empty.
  187. if ($token == ':' or $token == ',' or $token == ';' or $token == ')') {
  188. tripal_phylogeny_parse_newick_replace_token($fp);
  189. // create a bootstrap value
  190. return $tripal_phylogeny_bootstrap++;
  191. }
  192. // If the next token is an open paren then this is a syntax error:
  193. if ($token == '(') {
  194. tripal_phylogeny_parse_newick_replace_token($fp);
  195. return FALSE;
  196. }
  197. return $token;
  198. }
  199. /**
  200. *
  201. * @param unknown $fp
  202. * @param unknown $depth
  203. *
  204. * @return string|boolean|unknown
  205. */
  206. function tripal_phylogeny_parse_newick_length($fp, $depth) {
  207. $length = '';
  208. $token = tripal_phylogeny_parse_newick_get_token($fp);
  209. // If the next token is a semicolon, close paren, or comma
  210. // then the length is empty.
  211. if ($token == ',' or $token == ';' or $token == ')') {
  212. tripal_phylogeny_parse_newick_replace_token($fp);
  213. return '';
  214. }
  215. // If the next token is a colon then we are parsing the length.
  216. // Otherwise we are not.
  217. if ($token != ':') {
  218. tripal_phylogeny_parse_newick_replace_token($fp);
  219. return FALSE;
  220. }
  221. // Now get the length.
  222. $token = tripal_phylogeny_parse_newick_get_token($fp);
  223. // If the next token is an open paren then this is a syntax error:
  224. if ($token == '(') {
  225. exit();
  226. }
  227. return $token;
  228. }
  229. /**
  230. *
  231. * @param unknown $fp
  232. *
  233. * @return string
  234. */
  235. function tripal_phylogeny_parse_newick_get_token($fp) {
  236. // Keep track of the file position that we start with
  237. global $tripal_phylogeny_fp_pos;
  238. $tripal_phylogeny_fp_pos = ftell($fp);
  239. $token = '';
  240. $in_quote = FALSE;
  241. $num_read = 0;
  242. $c = fgetc($fp);
  243. while (!feof($fp)) {
  244. $num_read++;
  245. switch ($c) {
  246. // If the first character is a reserved character and we
  247. // we have not encountered any other charcters then return
  248. // it as the token. Otherwise, return the collected token.
  249. case ';':
  250. case '(':
  251. case ')':
  252. case ',':
  253. case ':':
  254. if (!$token) {
  255. return $c;
  256. }
  257. else {
  258. // put the character back and return the token
  259. fseek($fp, $tripal_phylogeny_fp_pos + $num_read - 1);
  260. return $token;
  261. }
  262. break;
  263. // Quotes are allowed around names and if a name is in
  264. // quotes then allow spaces. Otherwise, spaces are ignored.
  265. case '\'':
  266. case '"':
  267. if (!$in_quote) {
  268. $in_quote = TRUE;
  269. }
  270. else {
  271. $in_quote = FALSE;
  272. }
  273. break;
  274. case " ":
  275. case "\t":
  276. case "\r":
  277. case "\n":
  278. if ($in_quote) {
  279. $token .= $c;
  280. }
  281. break;
  282. // All other characters get saved as the token
  283. default:
  284. $token .= $c;
  285. }
  286. $c = fgetc($fp);
  287. }
  288. return $token;
  289. }
  290. /**
  291. *
  292. * @param unknown $fp
  293. */
  294. function tripal_phylogeny_parse_newick_replace_token($fp) {
  295. global $tripal_phylogeny_fp_pos;
  296. fseek($fp, $tripal_phylogeny_fp_pos);
  297. $tripal_phylogeny_fp_pos = ftell($fp);
  298. }