obo_loader.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. <?php
  /**
   * Loads an OBO v1.2 file into the Chado controlled-vocabulary tables.
   *
   * The file is parsed into memory, a CV named after the header's
   * 'default-namespace' value is created or updated, and then every [Typedef]
   * stanza followed by every [Term] stanza is written to the database.
   * Problems are reported with print statements.
   *
   * @param $file
   *   Path of the OBO file to load.
   *
   * @return
   *   Whatever tripal_cv_obo_loader_done() returns; that function is called on
   *   every exit path so the search path is always restored.
   */
  function tripal_cv_load_obo_v1_2($file) {
    $header = array();
    $obo = array();

    print "Opening File $file\n";

    // set the search path
    db_query("set search_path to chado,public");  // TODO: fix this

    // make sure we have an 'internal' and a '_global' database
    if (!tripal_cv_obo_add_db('internal')) {
      return tripal_cv_obo_loader_done();
    }
    if (!tripal_cv_obo_add_db('_global')) {
      return tripal_cv_obo_loader_done();
    }

    // refuse to go on when the file cannot be read at all
    if (!is_readable($file)) {
      print "Cannot read the OBO file: $file\n";
      return tripal_cv_obo_loader_done();
    }

    // parse the obo file
    tripal_cv_obo_parse($file, $obo, $header);

    // the CV is named after the default namespace, so it has to be present
    if (!isset($header['default-namespace'][0]) or $header['default-namespace'][0] === '') {
      print "The OBO file has no 'default-namespace' header; cannot name the CV.\n";
      return tripal_cv_obo_loader_done();
    }

    // add the CV for this ontology to the database
    $cv = tripal_cv_obo_add_cv($header['default-namespace'][0], '');
    if (!$cv) {
      return tripal_cv_obo_loader_done();
    }

    // add any typedefs to the vocabulary first. The whole list of stanzas is
    // handed over (the callee iterates it) together with the parsed file and
    // the relationship flag.
    if (isset($obo['Typedef']) and is_array($obo['Typedef'])) {
      if (!tripal_cv_obo_process_stanza($obo['Typedef'], $cv, $obo, 1)) {
        return tripal_cv_obo_loader_done();
      }
    }

    // next add terms to the vocabulary
    if (isset($obo['Term']) and is_array($obo['Term'])) {
      tripal_cv_obo_process_stanza($obo['Term'], $cv, $obo);
    }

    return tripal_cv_obo_loader_done();
  }
  /**
   * Writes a list of parsed OBO stanzas to the database.
   *
   * For each stanza the cvterm is added (or updated) and then its alt_ids,
   * synonyms, comments, xrefs, is_a links and relationships are stored.
   * Tags that are recognised but not supported only produce a warning.
   *
   * @param $terms
   *   List of stanzas; each stanza maps a tag name to a list of values.
   * @param $cv
   *   The cv record the terms belong to.
   * @param $obo
   *   The complete parsed OBO structure, used to resolve relationship objects.
   * @param $is_relationship
   *   1 when the stanzas are relationship types, 0 otherwise.
   *
   * @return
   *   1 when every stanza was stored, 0 as soon as one step fails.
   */
  function tripal_cv_obo_process_stanza($terms, $cv, $obo, $is_relationship = 0) {
    foreach ($terms as $term) {

      // add the cvterm
      $cvterm = tripal_cv_obo_add_cv_term($term, $cv, $is_relationship, 1);
      if (!$cvterm) {
        return 0;
      }

      // now handle other properties
      _tripal_cv_obo_warn_unhandled_tags($term, array('is_anonymous'));

      if (isset($term['alt_id'])) {
        foreach ($term['alt_id'] as $alt_id) {
          if (!tripal_cv_obo_add_cvterm_dbxref($cvterm, $alt_id)) {
            return 0;
          }
        }
      }

      _tripal_cv_obo_warn_unhandled_tags($term, array('subset'));

      // add synonyms for this cvterm
      if (isset($term['synonym'])) {
        if (!tripal_cv_obo_add_synonyms($term, $cvterm)) {
          return 0;
        }
      }

      // deprecated synonym tags
      _tripal_cv_obo_warn_unhandled_tags($term, array('exact_synonym', 'narrow_synonym', 'broad_synonym'));

      // add the comment to the cvtermprop table
      if (isset($term['comment'])) {
        $rank = 0;
        foreach ($term['comment'] as $comment) {
          if (!tripal_cv_obo_add_cvterm_prop($cvterm, 'comment', $comment, $rank)) {
            return 0;
          }
          $rank++;
        }
      }

      // add any other external dbxrefs. Each of the three xref tags is read
      // from its own key, so a stanza that only has xref_analog or xref_unk
      // is handled as well.
      foreach (array('xref', 'xref_analog', 'xref_unk') as $xref_tag) {
        if (!isset($term[$xref_tag])) {
          continue;
        }
        foreach ($term[$xref_tag] as $xref) {
          if (!tripal_cv_obo_add_cvterm_dbxref($cvterm, $xref)) {
            return 0;
          }
        }
      }

      // add is_a relationships for this cvterm
      if (isset($term['is_a'])) {
        foreach ($term['is_a'] as $is_a) {
          if (!tripal_cv_obo_add_relationship($cvterm, $cv, $obo, 'is_a', $is_a)) {
            return 0;
          }
        }
      }

      _tripal_cv_obo_warn_unhandled_tags($term, array('intersection_of', 'union_of', 'disjoint_from'));

      if (isset($term['relationship'])) {
        foreach ($term['relationship'] as $value) {
          // the value is "<relationship type> <object id>"
          $rel = preg_replace('/^(.+?)\s.+?$/', '\1', $value);
          $object = preg_replace('/^.+?\s(.+?)$/', '\1', $value);
          if (!tripal_cv_obo_add_relationship($cvterm, $cv, $obo, $rel, $object)) {
            return 0;
          }
        }
      }

      _tripal_cv_obo_warn_unhandled_tags($term, array('replaced_by', 'consider', 'use_term', 'builtin'));
    }
    return 1;
  }

  /**
   * Prints one warning for each listed tag that is present in the stanza.
   *
   * @param $term
   *   A stanza mapping tag names to lists of values.
   * @param $tags
   *   Tag names to check, in the order the warnings should appear.
   */
  function _tripal_cv_obo_warn_unhandled_tags($term, $tags) {
    foreach ($tags as $tag) {
      if (isset($term[$tag])) {
        print "WARNING: unhandled tag: $tag\n";
      }
    }
  }
  /**
   * Returns the db record with the given name, inserting it when absent.
   *
   * @param $dbname
   *   Name of the database record to find or create.
   *
   * @return
   *   The db record as fetched by db_fetch_object(), or 0 when the insert
   *   failed.
   */
  function tripal_cv_obo_add_db($dbname) {
    $lookup = "SELECT * FROM {db} WHERE name ='%s'";

    // nothing to do when the record is already there
    $record = db_fetch_object(db_query($lookup, $dbname));
    if ($record) {
      return $record;
    }

    $inserted = db_query("INSERT INTO {db} (name) VALUES ('%s')", $dbname);
    if (!$inserted) {
      print "Cannot create '$dbname' db in Chado.";
      return 0;
    }

    // read the new row back so the caller gets its db_id
    return db_fetch_object(db_query($lookup, $dbname));
  }
  /**
   * Creates the cv record with the given name, or updates its definition.
   *
   * @param $name
   *   Name of the controlled vocabulary.
   * @param $comment
   *   Text stored in the cv's definition column; an existing definition is
   *   overwritten with this value.
   *
   * @return
   *   The cv record as fetched after the write, or 0 when the write failed.
   */
  function tripal_cv_obo_add_cv($name, $comment) {
    $lookup = "SELECT * FROM {cv} WHERE name = '%s'";

    // see if the CV (default-namespace) exists already in the database
    $existing = db_fetch_object(db_query($lookup, $name));

    // if the CV exists then update it, otherwise insert
    if ($existing) {
      $written = db_query("UPDATE {cv} SET definition = '%s' WHERE name ='%s'", $comment, $name);
      $failure = "Failed to update the CV record";
    }
    else {
      $written = db_query("INSERT INTO {cv} (name,definition) VALUES ('%s','%s')", $name, $comment);
      $failure = "Failed to create the CV record";
    }

    if (!$written) {
      print $failure;
      return 0;
    }

    // hand back the row as it now stands in the database
    return db_fetch_object(db_query($lookup, $name));
  }
  /**
   * Stores one property value for a cvterm in the cvtermprop table.
   *
   * The property type is itself a cvterm in the 'cvterm_property_type' CV;
   * both the CV and the type term are created when they do not exist yet.
   *
   * @param $cvterm
   *   The cvterm record the property belongs to (cvterm_id is read).
   * @param $property
   *   Name of the property type, e.g. 'comment'.
   * @param $value
   *   The property value.
   * @param $rank
   *   Rank of the value. A rank of 0 first removes every property already
   *   stored for the term.
   *
   * @return
   *   1 on success, 0 on failure.
   */
  function tripal_cv_obo_add_cvterm_prop($cvterm, $property, $value, $rank) {

    // make sure the 'cvterm_property_type' CV exists. It is only created when
    // missing, so an existing definition of that CV is left untouched.
    $cv = db_fetch_object(db_query("SELECT * FROM {cv} WHERE name = '%s'", 'cvterm_property_type'));
    if (!$cv) {
      $cv = tripal_cv_obo_add_cv('cvterm_property_type', '');
    }
    if (!$cv) {
      return 0;
    }

    // get the property type cvterm. If it doesn't exist then we want to add it
    $sql = "
      SELECT *
      FROM {cvterm} CVT INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
      WHERE CVT.name = '%s' and CV.name = '%s'
    ";
    $cvproptype = db_fetch_object(db_query($sql, $property, 'cvterm_property_type'));
    if (!$cvproptype) {
      // 'def' is the key tripal_cv_obo_add_cv_term() reads the definition from
      $term = array(
        'name' => array($property),
        'id' => array("internal:$property"),
        'def' => array(''),
        'is_obsolete' => array(0),
      );
      $cvproptype = tripal_cv_obo_add_cv_term($term, $cv, 0, 0);
      if (!$cvproptype) {
        return 0;
      }
    }

    // remove any properties that currently exist for this term. We'll reset them
    // NOTE(review): this deletes properties of every type, not only $property.
    if ($rank == 0) {
      $sql = "DELETE FROM {cvtermprop} WHERE cvterm_id = %d";
      db_query($sql, $cvterm->cvterm_id);
    }

    // now add the property
    $sql = "INSERT INTO {cvtermprop} (cvterm_id,type_id,value,rank) " .
           "VALUES (%d, %d, '%s',%d)";
    if (!db_query($sql, $cvterm->cvterm_id, $cvproptype->cvterm_id, $value, $rank)) {
      print "Could not add property $property for term\n";
      return 0;
    }
    return 1;
  }
  /**
   * Links a cvterm to another term through a relationship type.
   *
   * The relationship type term is created in the given CV when missing, the
   * object term is looked up in the parsed OBO data and added or updated, and
   * a cvterm_relationship row is inserted unless it already exists.
   *
   * @param $cvterm
   *   The subject cvterm record (cvterm_id is read).
   * @param $cv
   *   The cv record the terms belong to (name is read).
   * @param $obo
   *   The complete parsed OBO structure.
   * @param $rel
   *   Name of the relationship type, e.g. 'is_a'.
   * @param $objname
   *   Id of the object term as written in the OBO file.
   *
   * @return
   *   1 on success, 0 on failure.
   */
  function tripal_cv_obo_add_relationship($cvterm, $cv, $obo, $rel, $objname) {

    // make sure the relationship cvterm exists
    $sql = "
      SELECT *
      FROM {cvterm} CVT INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
      WHERE CVT.name = '%s' and CV.name = '%s'
    ";
    $cvisa = db_fetch_object(db_query($sql, $rel, $cv->name));
    if (!$cvisa) {
      // 'def' is the key tripal_cv_obo_add_cv_term() reads the definition from
      $term = array(
        'name' => array($rel),
        'id' => array($rel),
        'def' => array(''),
        'is_obsolete' => array(0),
      );
      $cvisa = tripal_cv_obo_add_cv_term($term, $cv, 1, 1);
      if (!$cvisa) {
        print "Cannot find or insert the relationship term: $rel.\n";
        return 0;
      }
    }

    // get the object term
    $objterm = tripal_cv_obo_get_term($obo, $objname);
    if (!$objterm) {
      print "Could not find object term $objname\n";
      return 0;
    }

    // The object is only a relationship type when it is one of the file's
    // [Typedef] stanzas; ordinary terms must not be flagged as such.
    $obj_is_relationship = 0;
    if (isset($obo['Typedef']) and is_array($obo['Typedef'])) {
      foreach ($obo['Typedef'] as $typedef) {
        if (isset($typedef['id'][0]) and strcmp($typedef['id'][0], $objname) == 0) {
          $obj_is_relationship = 1;
          break;
        }
      }
    }
    $objcvterm = tripal_cv_obo_add_cv_term($objterm, $cv, $obj_is_relationship, 1);
    if (!$objcvterm) {
      return 0;
    }

    // check to see if the cvterm_relationship already exists, if not add it
    $cvrsql = "SELECT * FROM {cvterm_relationship} WHERE type_id = %d and subject_id = %d and object_id = %d";
    if (!db_fetch_object(db_query($cvrsql, $cvisa->cvterm_id, $cvterm->cvterm_id, $objcvterm->cvterm_id))) {
      $sql = "INSERT INTO {cvterm_relationship} " .
             "(type_id,subject_id,object_id) VALUES (%d,%d,%d)";
      if (!db_query($sql, $cvisa->cvterm_id, $cvterm->cvterm_id, $objcvterm->cvterm_id)) {
        print "Cannot add $rel relationship";
        return 0;
      }
    }
    return 1;
  }
  /**
   * Finds a stanza by id in the parsed OBO structure.
   *
   * Every stanza type is searched in order and the first stanza whose first
   * 'id' value equals $id is returned. Input that is not an array, and
   * stanzas without an id, are skipped instead of raising PHP warnings.
   *
   * @param $obo
   *   Parsed OBO data: stanza type => list of stanzas.
   * @param $id
   *   The id to look for.
   *
   * @return
   *   The matching stanza array, or NULL when there is none.
   */
  function tripal_cv_obo_get_term($obo, $id) {
    if (!is_array($obo)) {
      return NULL;
    }
    foreach ($obo as $stanzas) {
      if (!is_array($stanzas)) {
        continue;
      }
      foreach ($stanzas as $term) {
        if (isset($term['id'][0]) and strcmp($term['id'][0], $id) == 0) {
          return $term;
        }
      }
    }
    return NULL;
  }
  /**
   * Stores the synonyms of a stanza in the cvtermsynonym table.
   *
   * Synonym scopes are kept as cvterms of the 'synonym_type' CV; the CV and
   * any missing scope term are created on demand. Synonyms that are already
   * stored for the term with the same type are left alone.
   *
   * @param $term
   *   The stanza; its 'synonym' values are read.
   * @param $cvterm
   *   The cvterm record the synonyms belong to (cvterm_id and name are read).
   *
   * @return
   *   1 on success, 0 on failure.
   */
  function tripal_cv_obo_add_synonyms($term, $cvterm) {

    // make sure we have a 'synonym_type' vocabulary
    $cvsql = "SELECT * FROM {cv} WHERE name='synonym_type'";
    $syncv = db_fetch_object(db_query($cvsql));
    if (!$syncv) {
      $sql = "INSERT INTO {cv} (name,definition) VALUES ('synonym_type','')";
      if (!db_query($sql)) {
        print "Failed to add the synonyms type vocabulary";
        return 0;
      }
      // read the new row back: its cv_id is needed to add scope terms below
      $syncv = db_fetch_object(db_query($cvsql));
      if (!$syncv) {
        print "Failed to add the synonyms type vocabulary";
        return 0;
      }
    }

    if (!isset($term['synonym'])) {
      return 1;
    }

    $cvtsql = "
      SELECT *
      FROM {cvterm} CVT
        INNER JOIN {cv} CV ON CVT.cv_id = CV.cv_id
      WHERE CVT.name = '%s' and CV.name = '%s'
    ";

    // now add the synonyms
    foreach ($term['synonym'] as $synonym) {

      // separate out the synonym definition and the synonym type
      $def = preg_replace('/^\s*"(.*)"\s*.*$/', '\1', $synonym);

      // the scope is optional in OBO 1.2 and defaults to RELATED; a capture
      // that starts with '[' is the dbxref list, not a scope
      $type = 'related';
      if (preg_match('/^.*"\s+(.*?)\s+.*$/', $synonym, $matches)
          and $matches[1] !== '' and $matches[1][0] !== '[') {
        $type = strtolower($matches[1]);
      }

      // make sure the synonym type exists in the 'synonym_type' vocabulary
      $syntype = db_fetch_object(db_query($cvtsql, $type, 'synonym_type'));
      if (!$syntype) {
        // build a 'term' object so we can add the missing term
        $type_term = array(
          'name' => array($type),
          'id' => array("internal:$type"),
          'def' => array(''),
          'is_obsolete' => array(0),
        );
        $syntype = tripal_cv_obo_add_cv_term($type_term, $syncv, 0, 1);
        if (!$syntype) {
          return 0;
        }
      }

      // make sure the synonym doesn't already exist
      $sql = "
        SELECT *
        FROM {cvtermsynonym}
        WHERE cvterm_id = %d and synonym = '%s' and type_id = %d
      ";
      $syn = db_fetch_object(db_query($sql, $cvterm->cvterm_id, $def, $syntype->cvterm_id));
      if (!$syn) {
        $sql = "INSERT INTO {cvtermsynonym} (cvterm_id,synonym,type_id)
                VALUES(%d,'%s',%d)";
        if (!db_query($sql, $cvterm->cvterm_id, $def, $syntype->cvterm_id)) {
          $name = isset($cvterm->name) ? $cvterm->name : '';
          print "Failed to insert the synonym for term: $name ($def)\n";
          return 0;
        }
      }
    }
    return 1;
  }
  /**
   * Adds a cvterm (and its dbxref) for a stanza, or updates an existing one.
   *
   * The id is split at its first colon into database name and accession. Ids
   * without a colon are only accepted for relationship types, which go to the
   * 'OBO_REL' database. An existing cvterm is found by name within the CV.
   *
   * @param $term
   *   The stanza. 'id', 'name', 'def' (or 'definition') and 'is_obsolete' are
   *   read; only the first value of each tag is used.
   * @param $cv
   *   The cv record the term belongs to (cv_id is read).
   * @param $is_relationship
   *   1 when the term is a relationship type, 0 otherwise.
   * @param $update
   *   When true an existing cvterm is updated, otherwise it is returned as is.
   *
   * @return
   *   The cvterm record, or 0 on failure.
   */
  function tripal_cv_obo_add_cv_term($term, $cv, $is_relationship = 0, $update = 1) {

    // get the term properties
    $id = isset($term['id'][0]) ? $term['id'][0] : '';
    $name = isset($term['name'][0]) ? $term['name'][0] : '';

    // the OBO tag is 'def'; 'definition' is accepted for hand-built terms
    $raw_definition = '';
    if (isset($term['def'][0])) {
      $raw_definition = $term['def'][0];
    }
    elseif (isset($term['definition'][0])) {
      $raw_definition = $term['definition'][0];
    }
    $definition = preg_replace('/^\"(.*)\"/', '\1', $raw_definition);

    $is_obsolete = 0;
    if (isset($term['is_obsolete'][0]) and strcmp($term['is_obsolete'][0], 'true') == 0) {
      $is_obsolete = 1;
    }

    // get the accession and the database from the cvterm
    if ((string) $id === '') {
      print "Cannot add a term without an id (name: '$name').\n";
      return 0;
    }
    if (preg_match('/^(.+?):(.*)$/', $id, $matches)) {
      $dbname = $matches[1];
      $accession = $matches[2];
    }
    elseif ($is_relationship) {
      $accession = $id;
      $dbname = 'OBO_REL';
    }
    else {
      // without this guard an empty database name would be looked up/created
      print "Cannot determine the database for term '$id': the id has no 'DB:' prefix and the term is not a relationship type.\n";
      return 0;
    }

    // check to see if the database exists
    $db = tripal_cv_obo_add_db($dbname);
    if (!$db) {
      print "Cannot find database '$dbname' in Chado.\n";
      return 0;
    }

    // check to see if the cvterm already exists
    $cvtermsql = "SELECT * from {cvterm} WHERE name = '%s' and cv_id = %d";
    $cvterm = db_fetch_object(db_query($cvtermsql, $name, $cv->cv_id));

    // if the cvterm doesn't exist then add it otherwise just update it
    if (!$cvterm) {
      // check to see if the dbxref exists if not, add it
      $dbxref = tripal_cv_obo_add_dbxref($db->db_id, $accession);
      if (!$dbxref) {
        print "Failed to find or insert the dbxref record for cvterm, $name (id: $accession), for database $dbname\n";
        return 0;
      }

      // now add the cvterm
      $sql = "
        INSERT INTO {cvterm} (cv_id, name, definition, dbxref_id,
          is_obsolete, is_relationshiptype)
        VALUES (%d,'%s','%s',%d,%d,%d)
      ";
      if (!db_query($sql, $cv->cv_id, $name, $definition,
          $dbxref->dbxref_id, $is_obsolete, $is_relationship)) {
        print "Failed to insert the term: $id\n";
        return 0;
      }
      print "Added CV term: $id\n";
      $cvterm = db_fetch_object(db_query($cvtermsql, $name, $cv->cv_id));
    }
    elseif ($update) { // update the cvterm
      $sql = "
        UPDATE {cvterm} SET name='%s', definition='%s',
          is_obsolete = %d, is_relationshiptype = %d
        WHERE cvterm_id = %d
      ";
      if (!db_query($sql, $name, $definition,
          $is_obsolete, $is_relationship, $cvterm->cvterm_id)) {
        print "Failed to update the term: $name\n";
        return 0;
      }
      print "Updated CV term: $id\n";
      $cvterm = db_fetch_object(db_query($cvtermsql, $name, $cv->cv_id));
    }

    // return the cvterm
    return $cvterm;
  }
  /**
   * Attaches an external cross reference to a cvterm.
   *
   * The xref is split at its first colon into database name and accession.
   * Web links (http, https, ftp) are stored whole under the 'URL' database.
   * The database and dbxref records are created when missing, and the
   * cvterm_dbxref row is only inserted when it does not exist yet.
   *
   * @param $cvterm
   *   The cvterm record (cvterm_id is read).
   * @param $xref
   *   The cross reference as written in the OBO file, e.g. 'DB:12345'.
   *
   * @return
   *   1 on success, 0 on failure.
   */
  function tripal_cv_obo_add_cvterm_dbxref($cvterm, $xref) {
    $accession = preg_replace('/^.+?:(.*)$/', '\1', $xref);
    $dbname = preg_replace('/^(.+?):.*$/', '\1', $xref);

    // if the xref is a database link, handle that specially: the "database"
    // parsed from a URL is only its scheme, so keep the full link instead
    if (in_array($dbname, array('http', 'https', 'ftp'))) {
      $accession = $xref;
      $dbname = 'URL';
    }

    // check to see if the database exists
    $db = tripal_cv_obo_add_db($dbname);
    if (!$db) {
      print "Cannot find database '$dbname' in Chado.";
      return 0;
    }

    // now add the dbxref
    $dbxref = tripal_cv_obo_add_dbxref($db->db_id, $accession);
    if (!$dbxref) {
      return 0;
    }

    // finally add the cvterm_dbxref but first check to make sure it exists
    $sql = "SELECT * from {cvterm_dbxref} WHERE cvterm_id = %d and dbxref_id = %d";
    if (!db_fetch_object(db_query($sql, $cvterm->cvterm_id, $dbxref->dbxref_id))) {
      $sql = "INSERT INTO {cvterm_dbxref} (cvterm_id,dbxref_id) " .
             "VALUES (%d,%d)";
      if (!db_query($sql, $cvterm->cvterm_id, $dbxref->dbxref_id)) {
        print "Cannot add cvterm_dbxref: $accession\n";
        return 0;
      }
    }
    return 1;
  }
  /**
   * Returns the dbxref for a database/accession pair, inserting it if absent.
   *
   * @param $db_id
   *   Id of the db record the accession belongs to.
   * @param $accession
   *   The accession within that database.
   * @param $version
   *   Version stored with a newly inserted record.
   * @param $description
   *   Description stored with a newly inserted record.
   *
   * @return
   *   An object carrying dbxref_id, or 0 when the insert failed.
   */
  function tripal_cv_obo_add_dbxref($db_id, $accession, $version = '', $description = '') {
    $lookup = "SELECT dbxref_id FROM {dbxref} WHERE db_id = %d and accession = '%s'";

    // reuse the record when it is already present
    $found = db_fetch_object(db_query($lookup, $db_id, $accession));
    if ($found) {
      return $found;
    }

    $insert = "
      INSERT INTO {dbxref} (db_id, accession, version, description)
      VALUES (%d,'%s','%s','%s')
    ";
    $inserted = db_query($insert, $db_id, $accession, $version, $description);
    if (!$inserted) {
      print "Failed to insert the dbxref record $accession\n";
      return 0;
    }

    // fetch the freshly inserted row to learn its dbxref_id
    return db_fetch_object(db_query($lookup, $db_id, $accession));
  }
  /**
   * Parses an OBO file into a header array and a stanza array.
   *
   * Lines before the first "[Type]" line are header tags. Every later tag
   * belongs to the stanza opened by the most recent "[Type]" line. Tags map
   * to lists of values because a tag may occur more than once. Text after an
   * unescaped '!' is a comment and is dropped; colons escaped as "\:" do not
   * split a tag from its value.
   *
   * @param $obo_file
   *   Path of the OBO file.
   * @param $obo
   *   Filled by reference: stanza type => stanza id => tag => list of values.
   *   Stanzas repeating an id are merged, later tags replacing earlier ones.
   * @param $header
   *   Filled by reference: header tag => list of values.
   *
   * @return
   *   TRUE when the file was read, FALSE when it could not be opened.
   */
  function tripal_cv_obo_parse($obo_file, &$obo, &$header) {
    $in_header = 1;
    $stanza = array();
    $type = '';

    if (!is_readable($obo_file)) {
      print "Cannot open the OBO file: $obo_file\n";
      return FALSE;
    }
    $fh = fopen($obo_file, 'r');
    if (!$fh) {
      print "Cannot open the OBO file: $obo_file\n";
      return FALSE;
    }

    // iterate through the lines in the OBO file and parse the stanzas.
    // Comparing with FALSE keeps a line consisting of "0" from ending the loop.
    while (($line = fgets($fh)) !== FALSE) {

      // remove newlines and skip empty lines
      $line = rtrim($line);
      if ($line === '') {
        continue;
      }

      // remove comments from end of lines, leaving escaped "\!" alone
      $line = preg_replace('/(?<!\\\\)!.*$/', '', $line);
      if (trim($line) === '') {
        // the whole line was a comment
        continue;
      }

      // at the first stanza we're out of header
      if (preg_match('/^\s*\[/', $line)) {
        $in_header = 0;

        // load the stanza we just finished reading
        _tripal_cv_obo_parse_store_stanza($obo, $type, $stanza);

        // get the stanza type: Term, Typedef or Instance
        $type = preg_replace('/^\s*\[\s*(.+?)\s*\]\s*$/', '\1', $line);

        // start fresh with a new array
        $stanza = array();
        continue;
      }

      // break apart the line into the tag and value but ignore any escaped
      // colons: they are swapped for a placeholder while the line is split
      $protected = str_replace('\\:', '|-|-|', $line);
      $pair = explode(':', $protected, 2);
      $tag = str_replace('|-|-|', '\\:', trim($pair[0]));
      $value = isset($pair[1]) ? str_replace('|-|-|', '\\:', trim($pair[1])) : '';

      if ($in_header) {
        if (!isset($header[$tag])) {
          $header[$tag] = array();
        }
        $header[$tag][] = $value;
      }
      else {
        if (!isset($stanza[$tag])) {
          $stanza[$tag] = array();
        }
        $stanza[$tag][] = $value;
      }
    }
    fclose($fh);

    // now add the last term in the file
    _tripal_cv_obo_parse_store_stanza($obo, $type, $stanza);

    return TRUE;
  }

  /**
   * Files a finished stanza under its type and id, merging repeated ids.
   *
   * @param $obo
   *   The structure being built, modified by reference.
   * @param $type
   *   Stanza type taken from the "[Type]" line.
   * @param $stanza
   *   Tag => list of values; empty stanzas and stanzas without an id are
   *   ignored.
   */
  function _tripal_cv_obo_parse_store_stanza(&$obo, $type, $stanza) {
    if (sizeof($stanza) == 0) {
      return;
    }
    if (!isset($stanza['id'][0])) {
      print "WARNING: skipping [$type] stanza without an id\n";
      return;
    }
    $id = $stanza['id'][0];
    if (!isset($obo[$type])) {
      $obo[$type] = array();
    }
    if (!isset($obo[$type][$id])) {
      $obo[$type][$id] = $stanza;
    }
    else {
      // array_merge() returns the merged array; it must be assigned back
      $obo[$type][$id] = array_merge($obo[$type][$id], $stanza);
    }
  }
  /**
   * Finishes a load by resetting the database search path to 'public'.
   *
   * @return
   *   Always an empty string.
   */
  function tripal_cv_obo_loader_done() {
    $reset_search_path = "set search_path to public";

    // return the search path to normal
    db_query($reset_search_path);

    return '';
  }