<?php
// obo_loader.php
  2. /*************************************************************************
  3. *
  4. */
  5. function tripal_core_load_obo_job (){
  6. global $user;
  7. # $file = 'ro.obo';
  8. $file = 'so_2_4_4.obo';
  9. $args = array($file);
  10. tripal_add_job("Load OBO $file",'tripal_core',
  11. "tripal_core_load_obo_v1_2",$args,$user->uid);
  12. return '';
  13. }
  14. /*************************************************************************
  15. *
  16. */
  17. function tripal_core_load_obo_v1_2($file) {
  18. $header = array();
  19. $obo = array();
  20. $obo_file = drupal_get_path('module', 'tripal_core')."/$file";
  21. print "Opening File $obo_file\n";
  22. // set the search path
  23. db_query("set search_path to chado,public"); // TODO: fix this
  24. // make sure we have an 'internal' and a '_global' database
  25. if(!tripal_core_obo_add_db('internal')){
  26. return tripal_core_obo_loader_done();
  27. }
  28. if(!tripal_core_obo_add_db('_global')){
  29. return tripal_core_obo_loader_done();
  30. }
  31. // parse the obo file
  32. tripal_core_obo_parse($obo_file,$obo,$header);
  33. // add the CV for this ontology to the database
  34. $cv = tripal_core_obo_add_cv($header['default-namespace'][0],'');
  35. if(!$cv){
  36. return tripal_core_obo_loader_done();
  37. }
  38. // add any typedefs to the vocabulary first
  39. $typedefs = $obo['Typedef'];
  40. foreach($typedefs as $typedef){
  41. tripal_core_obo_add_cv_term($typedef,$cv,1);
  42. }
  43. // next add terms to the vocabulary
  44. $terms = $obo['Term'];
  45. if(!tripal_core_obo_process_terms($terms,$cv,$obo)){
  46. return tripal_core_obo_loader_done();
  47. }
  48. return tripal_core_obo_loader_done();
  49. }
  50. /*************************************************************************
  51. *
  52. */
  53. function tripal_core_obo_process_terms($terms,$cv,$obo){
  54. foreach ($terms as $term){
  55. // add the cvterm
  56. $cvterm = tripal_core_obo_add_cv_term($term,$cv);
  57. if(!$cvterm){ return 0; }
  58. if(isset($term['is_anonymous'])){
  59. }
  60. if(isset($term['alt_id'])){
  61. }
  62. if(isset($term['subset'])){
  63. }
  64. // add synonyms for this cvterm
  65. if(isset($term['synonym'])){
  66. if(!tripal_core_obo_add_synonyms($term,$cvterm)){
  67. return 0;
  68. }
  69. }
  70. if(isset($term['exact_synonym'])){
  71. }
  72. if(isset($term['narrow_synonym'])){
  73. }
  74. if(isset($term['broad_synonym'])){
  75. }
  76. if(isset($term['xref'])){
  77. }
  78. if(isset($term['xref_analog'])){
  79. }
  80. if(isset($term['xref_unk'])){
  81. }
  82. // add is_a relationships for this cvterm
  83. if(isset($term['is_a'])){
  84. foreach($term['is_a'] as $is_a){
  85. if(!tripal_core_obo_add_relationship($cvterm,$cv,$obo,'is_a',$is_a)){
  86. return 0;
  87. }
  88. }
  89. }
  90. if(isset($term['intersection_of'])){
  91. }
  92. if(isset($term['union_of'])){
  93. }
  94. if(isset($term['disjoint_from'])){
  95. }
  96. if(isset($term['relationship'])){
  97. foreach($term['relationship'] as $value){
  98. $rel = preg_replace('/^(.+?)\s.+?$/','\1',$value);
  99. $object = preg_replace('/^.+?\s(.+?)$/','\1',$value);
  100. if(!tripal_core_obo_add_relationship($cvterm,$cv,$obo,$rel,$object)){
  101. return 0;
  102. }
  103. }
  104. }
  105. if(isset($term['replaced_by'])){
  106. }
  107. if(isset($term['consider'])){
  108. }
  109. if(isset($term['use_term'])){
  110. }
  111. if(isset($term['builtin'])){
  112. }
  113. }
  114. return 1;
  115. }
  116. /*************************************************************************
  117. *
  118. */
  119. function tripal_core_obo_add_db($dbname){
  120. $db_sql = "SELECT * FROM {db} WHERE name ='%s'";
  121. $db = db_fetch_object(db_query($db_sql,$dbname));
  122. if(!$db){
  123. if(!db_query("INSERT INTO {db} (name) VALUES ('%s')",$dbname)){
  124. print "Cannot create '$dbname' db in Chado.";
  125. return 0;
  126. }
  127. $db = db_fetch_object(db_query($db_sql,$dbname));
  128. }
  129. return $db;
  130. }
  131. /*************************************************************************
  132. *
  133. */
  134. function tripal_core_obo_add_cv($name,$comment){
  135. // see if the CV (default-namespace) exists already in the database
  136. $vocab = $name;
  137. $remark = $comment;
  138. $cv_sql = "SELECT * FROM {cv} WHERE name = '%s'";
  139. $cv = db_fetch_object(db_query($cv_sql,$vocab));
  140. // if the CV exists then update it, otherwise insert
  141. if(!$cv){
  142. $sql = "INSERT INTO {cv} (name,definition) VALUES ('%s','%s')";
  143. if(!db_query($sql,$vocab,$remark)){
  144. print "Failed to create the CV record";
  145. return 0;
  146. }
  147. $cv = db_fetch_object(db_query($cv_sql,$vocab));
  148. } else {
  149. $sql = "UPDATE {cv} SET definition = '%s' WHERE name ='%s'";
  150. if(!db_query($sql,$remark,$vocab)){
  151. print "Failed to update the CV record";
  152. return 0;
  153. }
  154. $cv = db_fetch_object(db_query($cv_sql,$vocab));
  155. }
  156. return $cv;
  157. }
  158. /*************************************************************************
  159. *
  160. */
  161. function tripal_core_obo_add_cvterm_prop($cvterm,$property,$value,$rank){
  162. // make sure the 'cvterm_property_type' CV exists
  163. $cv = tripal_core_obo_add_cv($property,'');
  164. if(!$cv){ return 0; }
  165. // get the property type cvterm. If it doesn't exist then we want to add it
  166. $sql = "
  167. SELECT *
  168. FROM {cvterm} CVT INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
  169. WHERE CVT.name = '%s' and CV.name = '%s'
  170. ";
  171. $cvproptype = db_fetch_object(db_query($sql,$property,'cvterm_property_type'));
  172. if(!$cvproptype){
  173. $term = array(
  174. 'name' => array($property),
  175. 'id' => array("internal:$property"),
  176. 'definition' => array(''),
  177. 'is_obsolete' => array(0),
  178. );
  179. $cvproptype = tripal_core_obo_add_cv_term($term,$cv,0,0);
  180. if(!$cvproptype){ return 0; }
  181. }
  182. // remove any properties that currently exist for this term. We'll reset them
  183. if($rank == 0){
  184. $sql = "DELETE FROM {cvtermprop} WHERE cvterm_id = %d";
  185. db_query($sql,$cvterm->cvterm_id);
  186. }
  187. // now add the property
  188. $sql = "INSERT INTO {cvtermprop} (cvterm_id,type_id,value,rank) ".
  189. "VALUES (%d, %d, '%s',%d)";
  190. if(!db_query($sql,$cvterm->cvterm_id,$cvproptype->cvterm_id,$value,$rank)){
  191. print "Could not add property $property for term\n";
  192. return 0;
  193. }
  194. return 1;
  195. }
  196. /*************************************************************************
  197. *
  198. */
  199. function tripal_core_obo_add_relationship($cvterm,$cv,$obo,$rel,$objname){
  200. // make sure the relationship cvterm exists
  201. $sql = "
  202. SELECT *
  203. FROM {cvterm} CVT INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
  204. WHERE CVT.name = '%s' and CV.name = '%s'
  205. ";
  206. if(strcmp($rel,'is_a')==0){ // is_a is part of the OBO format and is in the 'relationship' ontology
  207. $cvisa = db_fetch_object(db_query($sql,$rel,'relationship'));
  208. } else {
  209. $cvisa = db_fetch_object(db_query($sql,$rel,$cv->name));
  210. }
  211. if(!$cvisa){
  212. print "Cannot find the relationship term: $rel\n";
  213. return 0;
  214. }
  215. // get the object term
  216. $objterm = tripal_core_obo_get_term($obo,$objname);
  217. if(!$objterm) {
  218. print "Could not find object term $objname\n";
  219. return 0;
  220. }
  221. $objcvterm = tripal_core_obo_add_cv_term($objterm,$cv);
  222. if(!$objcvterm){ return 0; }
  223. // check to see if the cvterm_relationship already exists, if not add it
  224. $cvrsql = "SELECT * FROM {cvterm_relationship} WHERE type_id = %d and subject_id = %d and object_id = %d";
  225. if(!db_fetch_object(db_query($cvrsql,$cvisa->cvterm_id,$cvterm->cvterm_id,$objcvterm->cvterm_id))){
  226. $sql = "INSERT INTO {cvterm_relationship} ".
  227. "(type_id,subject_id,object_id) VALUES (%d,%d,%d)";
  228. if(!db_query($sql,$cvisa->cvterm_id,$cvterm->cvterm_id,$objcvterm->cvterm_id)){
  229. print "Cannot add $rel relationship";
  230. return 0;
  231. }
  232. // print " $rel $objname\n";
  233. }
  234. return 1;
  235. }
  236. /*************************************************************************
  237. *
  238. */
  239. function tripal_core_obo_get_term($obo,$id){
  240. foreach ($obo as $type){
  241. foreach ($type as $term){
  242. $accession = $term['id'][0];
  243. if(strcmp($accession,$id)==0){
  244. return $term;
  245. }
  246. }
  247. }
  248. return;
  249. }
  250. /*************************************************************************
  251. *
  252. */
  253. function tripal_core_obo_add_synonyms($term,$cvterm){
  254. // make sure we have a 'synonym_type' vocabulary
  255. $sql = "SELECT * FROM {cv} WHERE name='synonym_type'";
  256. $syncv = db_fetch_object(db_query($sql));
  257. if(!$syncv){
  258. $sql = "INSERT INTO {cv} (name,definition) VALUES ('synonym_type','')";
  259. if(!db_query($sql)){
  260. print "Failed to add the synonyms type vocabulary";
  261. return 0;
  262. }
  263. }
  264. // now add the synonyms
  265. if(isset($term['synonym'])){
  266. foreach($term['synonym'] as $synonym){
  267. // separate out the synonym definition and the synonym type
  268. $def = preg_replace('/^\s*"(.*)"\s*.*$/','\1',$synonym);
  269. $type = strtolower(preg_replace('/^.*"\s+(.*?)\s+.*$/','\1',$synonym));
  270. // make sure the synonym type exists in the 'synonym_type' vocabulary
  271. $cvtsql = "
  272. SELECT *
  273. FROM {cvterm} CVT
  274. INNER JOIN {cv} CV ON CVT.cv_id = CV.cv_id
  275. WHERE CVT.name = '%s' and CV.name = '%s'
  276. ";
  277. $syntype = db_fetch_object(db_query($cvtsql,$type,'synonym_type'));
  278. if(!$syntype){
  279. // build a 'term' object so we can add the missing term
  280. $term = array(
  281. 'name' => array($type),
  282. 'id' => array("internal:$type"),
  283. 'definition' => array(''),
  284. 'is_obsolete' => array(0),
  285. );
  286. if(!tripal_core_obo_add_cv_term($term,$syncv)){
  287. return 0;
  288. }
  289. $syntype = db_fetch_object(db_query($cvtsql,$type,'synonym_type'));
  290. }
  291. // make sure the synonym doesn't already exists
  292. $sql = "
  293. SELECT *
  294. FROM {cvtermsynonym}
  295. WHERE cvterm_id = %d and synonym = '%s' and type_id = %d
  296. ";
  297. $syn = db_fetch_object(db_query($sql,$cvterm->cvterm_id,$def,$syntype->cvterm_id));
  298. if(!$syn){
  299. $sql = "INSERT INTO {cvtermsynonym} (cvterm_id,synonym,type_id)
  300. VALUES(%d,'%s',%d)";
  301. if(!db_query($sql,$cvterm->cvterm_id,$def,$syntype->cvterm_id)){
  302. print "Failed to insert the synonym for term: $name ($def)\n";
  303. return 0;
  304. }
  305. }
  306. }
  307. }
  308. return 1;
  309. }
  310. /*************************************************************************
  311. *
  312. */
  313. function tripal_core_obo_add_cv_term($term,$cv,$is_relationship = 0,$update = 1){
  314. // get the term properties
  315. $name = $term['name'][0];
  316. $definition = preg_replace('/^\"(.*)\"/','\1',$term['def'][0]);
  317. $is_obsolete = 0;
  318. if(isset($term['is_obsolete'][0]) and strcmp($term['is_obsolete'][0],'true')==0){
  319. $is_obsolete = 1;
  320. }
  321. // get the accession and the database from the cvterm
  322. $accession = preg_replace('/^.+?:(.*)$/','\1',$term['id'][0]);
  323. $db = preg_replace('/^(.+?):.*$/','\1',$term['id'][0]);
  324. // check to see if the database exists
  325. $db = tripal_core_obo_add_db($db);
  326. if(!$db){
  327. print "Cannot find database '$db' in Chado.";
  328. return 0;
  329. }
  330. // check to see if the cvterm already exists
  331. $cvtermsql = "SELECT * from {cvterm} WHERE name = '%s' and cv_id = %d";
  332. $cvterm = db_fetch_object(db_query($cvtermsql,$name,$cv->cv_id));
  333. // if the cvterm doesn't exist then add it otherwise just update it
  334. if(!$cvterm){
  335. // check to see if the dbxref exists if not, add it
  336. $dbxsql = tripal_core_obo_add_dbxref($db->db_id,$accession);
  337. if(!$dbxref){
  338. print "Failed to find or insert the dbxref record for cvterm: $name ($accession)";
  339. return 0;
  340. }
  341. // now add the cvterm
  342. $sql = "
  343. INSERT INTO {cvterm} (cv_id, name, definition, dbxref_id,
  344. is_obsolete, is_relationshiptype)
  345. VALUES (%d,'%s','%s',%d,%d,%d)
  346. ";
  347. if(!db_query($sql,$cv->cv_id,$name,$definition,
  348. $dbxref->dbxref_id,$is_obsolete,$is_relationship)){
  349. print "Failed to insert the term: " . $term['name'][0];
  350. return 0;
  351. }
  352. print "Added CV term: $name\n";
  353. $cvterm = db_fetch_object(db_query($cvtermsql,$name,$cv->cv_id));
  354. }
  355. elseif($update) { // update the cvterm
  356. $sql = "
  357. UPDATE {cvterm} SET name='%s', definition='%s',
  358. is_obsolete = %d, is_relationshiptype = %d
  359. WHERE cvterm_id = %d
  360. ";
  361. if(!db_query($sql,$term['name'][0],$definition,
  362. $is_obsolete,$is_relationship,$cvterm->cvterm_id)){
  363. print "Failed to update the term: $name\n";
  364. return 0;
  365. }
  366. print "Updated CV term: $name\n";
  367. $cvterm = db_fetch_object(db_query($cvtermsql,$name,$cv->cv_id));
  368. }
  369. // add the comment to the cvtermprop table
  370. if(isset($term['comment'])){
  371. $comments = $term['comment'];
  372. $j = 0;
  373. foreach($comments as $comment){
  374. if(!tripal_core_obo_add_cvterm_prop($cvterm,'comment',$comment,$j)){
  375. return 0;
  376. }
  377. $j++;
  378. }
  379. }
  380. // add any other external dbxrefs
  381. if(isset($term['xref'])){
  382. foreach($term['xref'] as $xref){
  383. $accession = preg_replace('/^.+?:(.*)$/','\1',$xref);
  384. $dbname = preg_replace('/^(.+?):.*$/','\1',$xref);
  385. // if the xref is a database link, handle that specially
  386. if(strcmp($db,'http')==0){
  387. $accession = $xref;
  388. $dbname = 'URL';
  389. }
  390. // check to see if the database exists
  391. $db = tripal_core_obo_add_db($db);
  392. if(!$db){
  393. print "Cannot find database '$db' in Chado.";
  394. return 0;
  395. }
  396. // now add the dbxref
  397. $dbxref = tripal_core_obo_add_dbxref($db->db_id,$accession);
  398. if(!$dbxref){ return 0;}
  399. // finally add the cvterm_dbxref but first check to make sure it exists
  400. $sql = "SELECT * from {cvterm_dbxref} WHERE cvterm_id = %d and dbxref_id = %d";
  401. if(!db_fetch_object(db_query($sql,$cvterm->cvterm_id,$dbxref->dbxref_id))){
  402. $sql = "INSERT INTO {cvterm_dbxref} (cvterm_id,dbxref_id)".
  403. "VALUES (%d,%d)";
  404. if(!db_query($sql,$cvterm->cvterm_id,$dbxref->dbxref_id)){
  405. print "Cannot add cvterm_dbxref: $accession\n";
  406. return 0;
  407. }
  408. }
  409. }
  410. }
  411. // return the cvterm
  412. return $cvterm;
  413. }
  414. /*************************************************************************
  415. *
  416. */
  417. function tripal_core_obo_add_dbxref($db_id,$accession,$version='',$description=''){
  418. // check to see if the dbxref exists if not, add it
  419. $dbxsql = "SELECT dbxref_id FROM {dbxref} WHERE db_id = %d and accession = '%s'";
  420. $dbxref = db_fetch_object(db_query($dbxsql,$db_id,$accession));
  421. if(!$dbxref){
  422. $sql = "
  423. INSERT INTO {dbxref} (db_id, accession, version, description)
  424. VALUES (%d,'%s','%s','%s')
  425. ";
  426. if(!db_query($sql,$db_id,$accession,$version,$description)){
  427. print "Failed to insert the dbxref record $accession\n";
  428. return 0;
  429. }
  430. $dbxref = db_fetch_object(db_query($dbxsql,$db_id,$accession));
  431. }
  432. return $dbxref;
  433. }
  434. /*************************************************************************
  435. *
  436. */
  437. function tripal_core_obo_parse($obo_file,&$obo,&$header){
  438. $i = 0;
  439. $in_header = 1;
  440. $stanza = array();
  441. $lines = file($obo_file,FILE_SKIP_EMPTY_LINES);
  442. // iterate through the lines in the OBO file and parse the stanzas
  443. foreach ($lines as $line_num => $line) {
  444. $i++;
  445. // remove newlines
  446. $line = rtrim($line);
  447. // skip empty lines
  448. if(strcmp($line,'')==0) { continue; }
  449. //remove comments from end of lines
  450. $line = preg_replace('/^(.*?)\!.*$/','\1',$line); // TODO: if the explamation is escaped
  451. if(preg_match('/^\s*\[/',$line)){ // at the first stanza we're out of header
  452. $in_header = 0;
  453. // load the stanza we just finished reading
  454. if(sizeof($stanza) > 0){
  455. if(!isset($obo[$type])){
  456. $obo[$type] = array();
  457. }
  458. if(!isset($obo[$type][$stanza['id'][0]])){
  459. $obo[$type][$stanza['id'][0]] = $stanza;
  460. } else {
  461. array_merge($obo[$type][$stanza['id'][0]],$stanza);
  462. }
  463. }
  464. // get the stanza type: Term, Typedef or Instance
  465. $type = preg_replace('/^\s*\[\s*(.+?)\s*\]\s*$/','\1',$line);
  466. // start fresh with a new array
  467. $stanza = array();
  468. continue;
  469. }
  470. // break apart the line into the tag and value but ignore any escaped colons
  471. preg_replace("/\\:/","|-|-|",$line); // temporarily replace escaped colons
  472. $pair = explode(":",$line,2);
  473. $tag = $pair[0];
  474. $value = ltrim(rtrim($pair[1]));// remove surrounding spaces
  475. $tag = preg_replace("/\|-\|-\|/","\:",$tag); // return the escaped colon
  476. $value = preg_replace("/\|-\|-\|/","\:",$value);
  477. if($in_header){
  478. if(!isset($header[$tag])){
  479. $header[$tag] = array();
  480. }
  481. $header[$tag][] = $value;
  482. } else {
  483. if(!isset($stanza[$tag])){
  484. $stanza[$tag] = array();
  485. }
  486. $stanza[$tag][] = $value;
  487. }
  488. }
  489. // now add the last term in the file
  490. if(sizeof($stanza) > 0){
  491. if(!isset($obo[$type])){
  492. $obo[$type] = array();
  493. }
  494. if(!isset($obo[$type][$stanza['id'][0]])){
  495. $obo[$type][$stanza['id'][0]] = $stanza;
  496. } else {
  497. array_merge($obo[$type][$stanza['id'][0]],$stanza);
  498. }
  499. }
  500. }
  501. /*************************************************************************
  502. *
  503. */
  504. function tripal_core_obo_loader_done (){
  505. // return the search path to normal
  506. db_query("set search_path to public");
  507. return '';
  508. }