tripal_cv.owl_loader.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. <?php
/**
 * @file
 * OWL loader for Tripal controlled vocabularies.
 *
 * This file (tripal_cv.owl_loader.php), together with the OWLStanza class in
 * OWLStanza.inc, reads and parses a scientific ontology stored as an OWL XML
 * file so that its vocabulary terms can be inserted into the Chado database
 * used by the open source Tripal project (tripal.info).
 */
  9. require_once('OWLStanza.inc');
  10. /**
  11. * Parses an OWL XML file and imports the CV terms into Chado
  12. *
  13. * @param $filename The
  14. * full path to the OWL XML file.
  15. *
  16. * @return No return value.
  17. *
  18. * @throws Exception
  19. */
  20. function tripal_cv_parse_owl($filename) {
  21. // TODO: need to pass in the $db_name as the cv name into the function when you see the
  22. // oboInOwl:default-namespace that is not the same as $db_name.
  23. //
  24. // TODO: this all should occur inside of a transaction.
  25. // Opening the OWL file for parsing.
  26. $owl = new XMLReader();
  27. // This command will open and read the any Owl file.
  28. if (!$owl->open($filename)) {
  29. print "ERROR opening OWL file: '$filename'\n";
  30. exit();
  31. }
  32. // Get the RDF stanza. We pass FALSE as the second parameter to prevent
  33. // the object from reading the entire file into memory.
  34. $rdf = new OWLStanza($owl, FALSE);
  35. // Get the ontology stanza. It will contain the values for the database
  36. // name for this ontology.
  37. $ontology = new OWLStanza($owl);
  38. // Look for the db name in using the 'oboInOwl:default-namespace term. If it's
  39. // not present then we'll use the 'about' element to get the namespace.
  40. $namespace = $ontology->getChild('oboInOwl:default-namespace');
  41. if ($namespace) {
  42. $db_name = $namespace->getValue();
  43. }
  44. else {
  45. // Insert the database record into Chado using the owl:Ontology stanza of the owl file.
  46. $about = $ontology->getAttribute('rdf:about');
  47. // We wrote the regular expression on the rdf.about line to get the database name for any particular Owl file.
  48. if (preg_match('/^.*\/(.*?)\..*$/', $about, $matches)) {
  49. $db_name = strtoupper($matches[1]);
  50. }
  51. }
  52. //
  53. // Step 1: Make sure that all dependencies (database names) are met for each Owl ontology file.
  54. //
  55. // loop through each stanza, one at a time, and handle each one
  56. // based on the tag name.
  57. $stanza = new OWLStanza($owl);
  58. // Set an empty array for the dependencies to go in.
  59. $deps = [
  60. 'db' => [],
  61. 'dbxref' => [],
  62. ];
  63. // Start looping and parsing through the owl:Class stanza section of the Owl file.
  64. while (!$stanza->isFinished()) {
  65. // Use the tag name from OWLStanza.inc to identify which function should be called.
  66. switch ($stanza->getTagName()) {
  67. case 'owl:Class':
  68. tripal_owl_check_class_depedencies($stanza, $vocab_db_name, $deps);
  69. break;
  70. }
  71. // Get to the next stanza in the OWL file.
  72. $stanza = new OWLStanza($owl);
  73. }
  74. if (count(array_keys($deps['db'])) > 0 or count(array_keys($deps['dbxref'])) > 0) {
  75. // We have unmet dependencies. Print those out and return.
  76. // The deps array will have DB’s, then terms' . "\n");
  77. if (count($deps['db']) > 0) {
  78. drupal_set_message('Cannot import ontology, "' . $db_name . '", as the following ' . 'dependent vocabularies must first be imported: ' . print_r(array_keys($deps['db']), TRUE) . '\n', 'error');
  79. }
  80. if (count($deps['dbxref']) > 0) {
  81. drupal_set_message('Cannot import ontology, "' . $db_name . '", as the following ' . 'dependent terms must first be imported: ' . print_r(array_keys($deps['dbxref']), TRUE) . '\n', 'error');
  82. }
  83. return;
  84. }
  85. //
  86. // Step 2: If we pass the dependency check in step 1 then we can insert
  87. // the terms.
  88. //
  89. // Holds an array of CV and DB records that have already been
  90. // inserted (reduces number of queires).
  91. $vocabs = [
  92. 'db' => [],
  93. 'cv' => [],
  94. 'this' => [],
  95. ];
  96. // Reload the ontology to reposition at the beginning of the OWl file for inserting the
  97. // new terms into Chado.
  98. $owl = new XMLReader();
  99. if (!$owl->open($filename)) {
  100. print "ERROR opening OWL file: '$filename'\n";
  101. exit();
  102. }
  103. $rdf = new OWLStanza($owl, FALSE);
  104. $ontology = new OWLStanza($owl);
  105. // Insert the database record into Chado using the
  106. // owl:Ontology stanza.
  107. $url = '';
  108. $homepage = $ontology->getChild('foaf:homepage');
  109. if ($homepage) {
  110. $url = $homepage->getValue();
  111. }
  112. $db = [
  113. 'url' => $url,
  114. 'name' => $db_name,
  115. ];
  116. // Using the Tripal API function to insert the term into the Chado database.
  117. $db = chado_insert_db($db);
  118. // Get the description for this vocabulary. This should be in the
  119. // dc:description element. If that element is missing then the
  120. // description should default to the empty string.
  121. $cv_description = '';
  122. $description = $ontology->getChild('dc:description');
  123. if ($description) {
  124. $cv_description = $description->getValue();
  125. }
  126. // Get the name for the CV. This should be in the 'dc:title' element. If the
  127. // title is not present then the cv name should default to the database name.
  128. $cv_name == $namespace = $ontology->getChild('oboInOwl:default-namespace');
  129. if ($namespace) {
  130. $cv_name = $namespace->getValue();
  131. }
  132. $title = $ontology->getChild('dc:title');
  133. if ($title) {
  134. $cv_name = preg_replace("/[^\w]/", "_", strtolower($title->getValue()));
  135. }
  136. // Insert the CV record into Chado.
  137. $cv = chado_insert_cv($cv_name, $cv_description);
  138. // Add this CV and DB to our vocabs array so we can reuse it later.
  139. $vocabs[$db_name]['cv'] = $namespace_cv;
  140. $vocabs[$db_name]['db'] = $db;
  141. $vocabs['this'] = $db_name;
  142. // loop through each stanza of the owl file, one at a time, and handle each one
  143. // based on the tag name from the OWLStanza.inc file.
  144. $stanza = new OWLStanza($owl);
  145. while (!$stanza->isFinished()) {
  146. // Use the tag name to identify which function should be called.
  147. switch ($stanza->getTagName()) {
  148. case 'owl:AnnotationProperty':
  149. // tripal_owl_handle_annotation_property($stanza, $vocabs);
  150. break;
  151. case 'rdf:Description':
  152. // tripal_owl_handle_description($stanza, $vocabs);
  153. break;
  154. case 'owl:ObjectProperty':
  155. // tripal_owl_handle_object_property($stanza, $vocabs);
  156. break;
  157. case 'owl:Class':
  158. tripal_owl_handle_class($stanza, $vocabs);
  159. break;
  160. case 'owl:Axiom':
  161. break;
  162. case 'owl:Restriction':
  163. break;
  164. default:
  165. throw new Exception("Unhandled stanza: " . $stanza->getTagName());
  166. exit();
  167. break;
  168. }
  169. // Get the next stanza in the OWL file.
  170. $stanza = new OWLStanza($owl);
  171. }
  172. // Close the XMLReader $owl object.
  173. $owl->close();
  174. }
  175. /**
  176. * Checks for required vocabularies that are not loaded into Chado.
  177. *
  178. * Some vocabularies use terms from other ontologies. If this is happens
  179. * we need to ensure that the dependent vocabularies are present in the
  180. * database prior to loading this one. This function adds to the $deps
  181. * array all of the database names and term accessions that are missing in
  182. * Chado.
  183. *
  184. * @param $stanza The
  185. * OWLStanza object for the current stanza from the OWL file.
  186. * @param $vocab_db_name The
  187. * name of the database for the vocabulary being loded.
  188. * @param $deps The
  189. * dependencies array. The missing databases are provided in array
  190. * using a 'db' key, and missing terms are in a second array using a
  191. * 'dbxref' key.
  192. */
  193. function tripal_owl_check_class_depedencies(OWLStanza $stanza, $vocab_db_name, &$deps) {
  194. // Initialize the variables.
  195. $db_name = '';
  196. $accession = '';
  197. $db = NULL;
  198. // Get the DB name and accession from the "rdf:about" attribute.
  199. $about = $stanza->getAttribute('rdf:about');
  200. if (!$about) {
  201. // TODO: some owl:Class stanzas do not have an about. What are these?
  202. // how should we handle them.
  203. return;
  204. }
  205. // We wrote the regular expression on the rdf.about line to get the database
  206. // name and accession term for any particular Owl file.
  207. if (preg_match('/.*\/(.+)_(.+)/', $about, $matches)) {
  208. $db_name = strtoupper($matches[1]);
  209. $accession = $matches[2];
  210. }
  211. else {
  212. throw new Exception("owl:Class stanza 'rdf:about' attribute is not formated as expected: '$about'. " . "This is necessary to determine the term's accession: \n\n" . $stanza->getXML());
  213. }
  214. // If the database name for this term is the same as the vocabulary
  215. // we are trying to load, then don't include it in the $deps array.
  216. if ($db_name !== $vocab_db_name) {
  217. return;
  218. }
  219. // Check if the db_name does not exist in the chado.db table. If it
  220. // does not exist then add it to our $deps array. If the query fails then
  221. // throw an exception.
  222. $db = chado_select_record('db', [
  223. 'db_id',
  224. ], [
  225. 'name' => $db_name,
  226. ]);
  227. if ($db === FALSE) {
  228. throw new Exception("Failed to execute query to find vocabulary in chado.db table\n\n" . $stanza->getXML());
  229. }
  230. else {
  231. if (count($db) == 0) {
  232. $deps['db'][$db_name] = TRUE;
  233. // Does this stanza provide the URL for the OWL file of this missing
  234. // dependency. If so then add it to our deps array.
  235. $imported_from = $stanza->getChild('obo:IAO_0000412');
  236. if ($imported_from == NULL) {
  237. return;
  238. }
  239. $url = $imported_from->getAttribute('rdf:resource');
  240. if ($url) {
  241. $deps['db'][$db_name] = $url;
  242. }
  243. return;
  244. }
  245. }
  246. // If the db_name exists, then check if the accession exists in
  247. // the chado.dbxref table. If it doesn't exist then add an entry to the
  248. // $deps array. If the query fails then throw an exception.
  249. $values = [
  250. 'db_id' => $db[0]->db_id,
  251. 'accession' => $accession,
  252. ];
  253. $dbxref = chado_select_record('dbxref', [
  254. 'dbxref_id',
  255. 'db_id',
  256. ], $values);
  257. if ($dbxref === FALSE) {
  258. throw new Exception("Failed to execute query to find vocabulary term in chado.dbxref table\n\n" . $stanza->getXML());
  259. }
  260. elseif (count($accession) == 0) {
  261. $deps['dbxref'][$db_name . ':' . $accesson] = TRUE;
  262. }
  263. return;
  264. }
  265. /**
  266. *
  267. * @param
  268. * $stanza
  269. * @param
  270. * $vocabs
  271. *
  272. * @throws Exception
  273. */
  274. function tripal_owl_handle_object_property($stanza, $vocabs) {
  275. }
  276. /**
  277. *
  278. * @param
  279. * $stanza
  280. * @param
  281. * $vocabs
  282. *
  283. * @throws Exception
  284. */
  285. function tripal_owl_handle_annotation_property($stanza, $vocabs) {
  286. // $matches = array();
  287. // $db_name = '';
  288. // $accession = '';
  289. // $about = $stanza->getAttribute('rdf:about');
  290. // // Get the DB name and accession from the about attribute using the preg match function.
  291. // if (preg_match('/.*\/(.+)_(.+)/', $about, $matches)) {
  292. // $db_name = ($matches[1]);
  293. // $accession = $matches[2];
  294. // }
  295. // else {
  296. // throw new Exception("owl:Class stanza 'rdf:about' attribute is not formated as expected: '$about'. " . "This is necessary to determine the term's accession: \n\n" . $stanza->getXML());
  297. // }
  298. // // Insert a DB Record
  299. // if (array_key_exists($db_name, $vocabs)) {
  300. // $db = $vocabs[$db_name]['db'];
  301. // $default_namespace_cv = $vocabs[$db_name]['cv'];
  302. // }
  303. // else {
  304. // // Unfortunately, all we have is the name. The OWL format
  305. // // doesn't provides us the URL, description, etc.
  306. // $values = array(
  307. // 'name' => $db_name
  308. // );
  309. // $db = chado_insert_db($values);
  310. // // Insert a dbxref record.
  311. // $values = array(
  312. // 'db_id' => $db->db_id,
  313. // 'accession' => $accession
  314. // );
  315. // $dbxref = chado_insert_dbxref($values);
  316. // $imported_from = $stanza->getChild('obo:IAO_0000114');
  317. // if ($imported_from == NULL) {
  318. // return;
  319. // }
  320. // $url = $imported_from->getAttribute('rdf:resource');
  321. // if ($url) {
  322. // $vocabs['db'][$db_name] = $url;
  323. // }
  324. // return;
  325. // }
  326. // // Insert a new cvterm record.
  327. // $cvterm_name = '';
  328. // $definition = '';
  329. // $cvterm_name = $stanza->getChild('rdfs:label');
  330. // if ($cvterm_name) {
  331. // $cvterm_name = $stanza->getValue();
  332. // }
  333. // $definition = $stanza->getChild('obo:IAO_0000115');
  334. // if ($definition) {
  335. // $definition = $stanza->getValue();
  336. // }
  337. // $term = array(
  338. // 'id' => $db->name .':'. $dbxref->accession,
  339. // 'name' => $cvterm_name,
  340. // 'cv_name' => $cv->name,
  341. // 'definition' => $definition,
  342. // );
  343. // $option =array();
  344. // if ($vocabs['this'] != $db->name){
  345. // $option['update_existing'] = FALSE;
  346. // }
  347. // $cvterm = chado_insert_cvterm($term, $option);
  348. // }
  349. }
  350. /**
  351. *
  352. * @param
  353. * $stanza
  354. * @param
  355. * $vocabs
  356. *
  357. * @throws Exception
  358. */
  359. function tripal_owl_handle_description($stanza, $vocabs) {
  360. }
  361. /**
  362. *
  363. * The function goes through owl:Class stanza to insert new vocabularies.
  364. *
  365. * @param $stanza The
  366. * OWLStanza object for the current stanza from the OWL file.
  367. * @param
  368. * $vocabs
  369. *
  370. * @throws Exception
  371. */
  372. function tripal_owl_handle_class(OWLStanza $stanza, $vocabs) {
  373. // Initialize the database and cv variables.
  374. $db_name = $vocabs['this'];
  375. $accession = '';
  376. $is_a = '';
  377. $namespace_cv = $vocabs[$db_name]['cv'];
  378. $db = $vocabs[$db_name]['db'];
  379. // Insert the dbxref record into Chado using the owl:Class stanza of the owl file.
  380. // Any oboInOwl:id supercedes what we find in the rdf:about in the owl file.
  381. $obo_id = $stanza->getChild('oboInOwl:id');
  382. if ($obo_id) {
  383. if (preg_match('/.*>(.+):(.+)<.*/', $about, $matches)) {
  384. $db_name = strtoupper($matches[1]);
  385. $accession = $matches[2];
  386. }
  387. else {
  388. $about = $stanza->getAttribute('rdf:about');
  389. // We wrote the regular expression on the rdf.about line to get the
  390. // db_name and accession for any particular Owl file.
  391. if (preg_match('/.*\/(.+)_(.+)/', $about, $matches)) {
  392. $db_name = strtoupper($matches[1]);
  393. $accession = $matches[2];
  394. }
  395. else {
  396. throw new Exception("owl:Class stanza 'rdf:about' attribute is not formated as expected: '$about'. " . "This is necessary to determine the term's accession: \n\n" . $stanza->getXML());
  397. }
  398. }
  399. }
  400. // If the database name for this term is the same as the vocabulary
  401. // we are trying to load, then do include it in the $vocabs array.
  402. if ($db_name == $vocabs['this']) {
  403. return;
  404. }
  405. // insert dbxref
  406. $values = [
  407. 'db_id' => $db->db_id,
  408. 'accession' => $accession,
  409. ];
  410. $dbxref = chado_insert_dbxref($values);
  411. $cvterm_name = $stanza->getChild('rdfs:label');
  412. if ($cvterm_name) {
  413. $cvterm_name = $stanza->getValue();
  414. }
  415. $definition = $stanza->getChild('obo:IAO_0000115');
  416. if ($definition) {
  417. $definition = $stanza->getValue();
  418. }
  419. $term = [
  420. 'id' => $db->name . ':' . $dbxref->accession,
  421. 'name' => $db->name,
  422. 'cv_name' => $stanza->getValue(),
  423. 'definition' => $stanza->getValue(),
  424. ];
  425. $options = [];
  426. if ($vocabs['this'] != $db->name) {
  427. $options['update_existing'] = FALSE;
  428. }
  429. $cvterm = chado_insert_cvterm($term, $options);
  430. // // Add a record to the chado relationship table if an ‘rdfs:subClassOf’ child exists.
  431. // $cvterm_name = $stanza->getChild('rdfs:subClassOf');
  432. // Insert a new cvterm record.
  433. // $cvterm_name = '';
  434. // $definition = '';
  435. // $cvterm_name = $stanza->getChild('rdfs:label');
  436. // if ($cvterm_name) {
  437. // $cvterm_name = $stanza->getValue();
  438. //}
  439. // $definition = $stanza->getChild('obo:IAO_0000115');
  440. // if ($definition) {
  441. //$definition = $stanza->getValue();
  442. //}
  443. // $term = array (
  444. // 'id' => $db->name . ':' . $dbxref->accession,
  445. // 'name' => $cvterm_name,
  446. // 'cv_name' => $cv->name,
  447. // 'definition' => $definition
  448. // );
  449. // $option = array ();
  450. // if ($vocabs['this'] != $db->name) {
  451. // $option['update_existing'] = FALSE;
  452. // }
  453. // $cvterm = chado_insert_cvterm($term, $option);
  454. }