gff_loader.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. <?php
  2. /*************************************************************************
  3. *
  4. */
  5. function tripal_core_gff3_load_form (){
  6. $form['gff_file']= array(
  7. '#type' => 'textfield',
  8. '#title' => t('GFF3 File'),
  9. '#description' => t('Please enter the full system path for the GFF file, or a path within the Drupal
  10. installation (e.g. /sites/default/files/xyz.obo). The path must be accessible to the
  11. server on which this Drupal instance is running.'),
  12. '#required' => TRUE,
  13. '#weight' => 1
  14. );
  15. $form['add_only']= array(
  16. '#type' => 'checkbox',
  17. '#title' => t('Import only new features'),
  18. '#required' => FALSE,
  19. '#default_value' => 'checked',
  20. '#description' => t('The job will skip features in the GFF file that already
  21. exist in the database and import only new features.'),
  22. '#weight' => 2
  23. );
  24. $form['update']= array(
  25. '#type' => 'checkbox',
  26. '#title' => t('Import all and update'),
  27. '#required' => FALSE,
  28. '#description' => t('Existing features will be updated and new features will be added. Attributes
  29. for a feature that are not present in the GFF but which are present in the
  30. database will not be altered.'),
  31. '#weight' => 3
  32. );
  33. $form['refresh']= array(
  34. '#type' => 'checkbox',
  35. '#title' => t('Import all and replace'),
  36. '#required' => FALSE,
  37. '#description' => t('Existing features will be updated and feature properties not
  38. present in the GFF file will be removed.'),
  39. '#weight' => 4
  40. );
  41. $form['remove']= array(
  42. '#type' => 'checkbox',
  43. '#title' => t('Delete features'),
  44. '#required' => FALSE,
  45. '#description' => t('Features present in the GFF file that exist in the database
  46. will be removed rather than imported'),
  47. '#weight' => 5
  48. );
  49. $form['button'] = array(
  50. '#type' => 'submit',
  51. '#value' => t('Import GFF3 file'),
  52. '#weight' => 10,
  53. );
  54. return $form;
  55. }
  56. /*************************************************************************
  57. *
  58. */
  59. function tripal_core_gff3_load_form_submit ($form, &$form_state){
  60. global $user;
  61. $gff_file = $form_state['values']['gff_file'];
  62. $add_only = $form_state['values']['add_only'];
  63. $update = $form_state['values']['update'];
  64. $refresh = $form_state['values']['refresh'];
  65. $remove = $form_state['values']['remove'];
  66. $args = array($gff_file,$add_only,$update,$refresh,$remove);
  67. $type = '';
  68. if($add_only){
  69. $type = 'import only new features';
  70. }
  71. if($update){
  72. $type = 'import all and update';
  73. }
  74. if($refresh){
  75. $type = 'import all and replace';
  76. }
  77. if($remove){
  78. $type = 'delete features';
  79. }
  80. tripal_add_job("Import GFF3 file: $type",'tripal_core',
  81. 'tripal_core_load_gff3',$args,$user->uid);
  82. return '';
  83. }
  84. /*************************************************************************
  85. *
  86. */
  87. function tripal_core_load_gff3($gff_file, $add_only =0, $update = 0, $refresh = 0, $remove = 0, $job = NULL){
  88. // check to see if the file is located local to Drupal
  89. $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
  90. if(!file_exists($dfile)){
  91. // if not local to Drupal, the file must be someplace else, just use
  92. // the full path provided
  93. $dfile = $gff_file;
  94. }
  95. if(!file_exists($dfile)){
  96. print "ERROR: cannot find the file: $dfile\n";
  97. return 0;
  98. }
  99. $previous_db = tripal_db_set_active('chado');
  100. print "Opening $gff_file\n";
  101. $lines = file($dfile,FILE_SKIP_EMPTY_LINES);
  102. $i = 0;
  103. // get the controlled vocaubulary that we'll be using. The
  104. // default is the 'sequence' ontology
  105. $vocab = 'sequence';
  106. $sql = "SELECT * FROM cv WHERE name = '%s'";
  107. $cv = db_fetch_object(db_query($sql,$vocab));
  108. if(!$cv){
  109. print "ERROR: cannot find the '$vocab' ontology\n";
  110. return '';
  111. }
  112. // get the organism for which this GFF3 file belongs
  113. $sql = "SELECT * FROM organism WHERE common_name = 'fruitfly'";
  114. $organism = db_fetch_object(db_query($sql));
  115. foreach ($lines as $line_num => $line) {
  116. $i++; // update the line count
  117. if(preg_match('/^#/',$line)){
  118. continue; // skip comments
  119. }
  120. // TODO: remove URL encoding
  121. $cols = explode("\t",$line);
  122. if(sizeof($cols) != 9){
  123. print "ERROR: improper number of columns on line $i\n";
  124. return '';
  125. }
  126. // get the column values
  127. $landmark = $cols[0];
  128. $source = $cols[1];
  129. $type = $cols[2];
  130. $start = $cols[3];
  131. $end = $cols[4];
  132. $score = $cols[5];
  133. $strand = $cols[6];
  134. $phase = $cols[7];
  135. $attrs = explode(";",$cols[8]); // split by a semi-colon
  136. // ready the start and stop for chado
  137. $fmin = $start;
  138. $fmax = $end;
  139. if($end < $start){
  140. $fmin = $end;
  141. $fmax = $start;
  142. }
  143. // format the strand for chado
  144. if(strcmp($strand,'.')==0){
  145. $strand = 0;
  146. }
  147. elseif(strcmp($strand,'+')==0){
  148. $strand = 1;
  149. }
  150. elseif(strcmp($strand,'-')==0){
  151. $strand = -1;
  152. }
  153. if(strcmp($phase,'.')==0){
  154. $phase = '';
  155. }
  156. // get the type record
  157. $sql = "SELECT * from cvterm CVT WHERE name = '%s' and cv_id = %d";
  158. $cvterm = db_fetch_object(db_query($sql,$type,$cv->cv_id));
  159. if(!$cvterm){
  160. print "ERROR: cannot find ontology term '$type' on line $i.\n";
  161. return '';
  162. }
  163. // break apart each of the attributes
  164. $tags = array();
  165. $attr_name = '';
  166. $attr_uniquename = '';
  167. $attr_residue_info = '';
  168. $attr_locgroup = 0;
  169. $attr_fmin_partial = 'f';
  170. $attr_fmax_partial = 'f';
  171. $attr_is_obsolete = 'f';
  172. $attr_is_analysis = 'f';
  173. $residues = '';
  174. foreach($attrs as $attr){
  175. $attr = rtrim($attr);
  176. $attr = ltrim($attr);
  177. if(strcmp($attr,'')==0){
  178. continue;
  179. }
  180. if(!preg_match('/^[^\=]+\=[^\=]+$/',$attr)){
  181. print "ERROR: attribute is not correctly formatted on line $i: $attr\n";
  182. return '';
  183. }
  184. // break apart each tag
  185. $tag = explode("=",$attr); // split by equals sign
  186. // multiple instances of an attribute are separated by commas
  187. $tags[$tag[0]] = explode(",",$tag[1]); // split by comma
  188. if(strcmp($tag[0],'ID')==0){
  189. $attr_uniquename = $tag[1];
  190. }
  191. if(strcmp($tag[0],'Name')==0){
  192. $attr_name = $tag[1];
  193. }
  194. }
  195. if(strcmp($attr_name,'')==0){
  196. $attr_name = $attr_uniquename;
  197. }
  198. // skip features that have no ID attribute
  199. if(!$attr_uniquename){
  200. continue;
  201. }
  202. // if the option is to remove or refresh then we want to remove
  203. // the feature from the database.
  204. if($remove or $refresh){
  205. print "Removing feature '$attr_uniquename'\n";
  206. $sql = "DELETE FROM {feature}
  207. WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
  208. $result = db_query($sql,$organism->organism_id,$attr_uniquename,$cvterm->cvterm_id);
  209. if(!$result){
  210. print "ERROR: cannot delete feature $attr_uniquename\n";
  211. }
  212. $feature = 0;
  213. }
  214. // add or update the feature and all properties
  215. if($update or $refresh or $add_only){
  216. // add/update the feature
  217. $feature = tripal_core_load_gff3_feature($organism,$cvterm,
  218. $attr_uniquename,$attr_name,$residues,$attr_is_analysis,$attr_is_obsolete,
  219. $add_only);
  220. if($feature){
  221. // add/update the featureloc if the landmark and the feature ID are not the same
  222. // if they are the same then this entry in the GFF is probably a landmark identifier
  223. if(strcmp($landmark,$attr_uniquename)!=0){
  224. tripal_core_load_gff3_featureloc($feature,$organism,
  225. $landmark,$fmin,$fmax,$strand,$phase,$attr_fmin_partial,
  226. $attr_fmax_partial,$attr_residue_info,$attr_locgroup);
  227. }
  228. // add any aliases for this feature
  229. if(array_key_exists('Alias',$tags)){
  230. tripal_core_load_gff3_alias($feature,$tags['Alias']);
  231. }
  232. // add any aliases for this feature
  233. if(array_key_exists('Dbxref',$tags)){
  234. tripal_core_load_gff3_dbxref($feature,$tags['Dbxref']);
  235. }
  236. }
  237. }
  238. }
  239. tripal_db_set_active($previous_db);
  240. return '';
  241. }
  242. /*************************************************************************
  243. *
  244. */
  245. function tripal_core_load_gff3_dbxref($feature,$dbxrefs){
  246. foreach($dbxrefs as $dbxref){
  247. print "Adding: $dbxref\n";
  248. }
  249. }
  250. /*************************************************************************
  251. *
  252. */
  253. function tripal_core_load_gff3_alias($feature,$aliases){
  254. // make sure we have a 'synonym_type' vocabulary
  255. $sql = "SELECT * FROM {cv} WHERE name='synonym_type'";
  256. $syncv = db_fetch_object(db_query($sql));
  257. if(!$syncv){
  258. $sql = "INSERT INTO {cv} (name,definition) VALUES ('synonym_type','')";
  259. if(!db_query($sql)){
  260. print("ERROR: Failed to add the synonyms type vocabulary");
  261. return 0;
  262. }
  263. $syncv = db_fetch_object(db_query($sql));
  264. }
  265. // get the 'exact' cvterm, which is the type of synonym we're adding
  266. $cvtsql = "
  267. SELECT * FROM {cvterm} CVT
  268. INNER JOIN {cv} CV ON CVT.cv_id = CV.cv_id
  269. WHERE CVT.name = '%s' and CV.name = '%s'
  270. ";
  271. $syntype = db_fetch_object(db_query($cvtsql,'exact','synonym_type'));
  272. if(!$syntype){
  273. $term = array(
  274. 'name' => array('exact'),
  275. 'id' => array("internal:exact"),
  276. 'definition' => array(''),
  277. 'is_obsolete' => array(0),
  278. );
  279. $syntype = tripal_cv_obo_add_cv_term($term,$syncv,0,1);
  280. if(!$syntype){
  281. tripal_cv_obo_quiterror("Cannot add synonym type: internal:$type");
  282. }
  283. }
  284. // iterate through all of the aliases and add each one
  285. foreach($aliases as $alias){
  286. print " Adding Alias $alias\n";
  287. // check to see if the alias already exists in the synonym table
  288. // if not, then add it
  289. $synsql = "SELECT * FROM {synonym}
  290. WHERE name = '%s' and type_id = %d";
  291. $synonym = db_fetch_object(db_query($synsql,$alias,$syntype->cvterm_id));
  292. if(!$synonym){
  293. $sql = "INSERT INTO {synonym}
  294. (name,type_id,synonym_sgml)
  295. VALUES ('%s',%d,'%s')";
  296. $result = db_query($sql,$alias,$syntype->cvterm_id,'');
  297. if(!$result){
  298. print "ERROR: cannot add alias $alias to synonym table\n";
  299. }
  300. }
  301. $synonym = db_fetch_object(db_query($synsql,$alias,$syntype->cvterm_id));
  302. // check to see if we have a NULL publication in the pub table. If not,
  303. // then add one.
  304. $pubsql = "SELECT * FROM {pub} WHERE uniquename = 'null'";
  305. $pub = db_fetch_object(db_query($pubsql));
  306. if(!$pub){
  307. $sql = "INSERT INTO pub (uniquename,type_id) VALUES ('%s',
  308. (SELECT cvterm_id
  309. FROM cvterm CVT
  310. INNER JOIN dbxref DBX on DBX.dbxref_id = CVT.dbxref_id
  311. INNER JOIN db DB on DB.db_id = DBX.db_id
  312. WHERE CVT.name = 'null' and DB.name = 'null')";
  313. $result = db_query($sql,'null');
  314. if(!$result){
  315. print "ERROR: cannot add null publication needed for setup of alias\n";
  316. return 0;
  317. }
  318. }
  319. $pub = db_fetch_object(db_query($pubsql));
  320. // check to see if the synonym exists in the feature_synonym table
  321. // if not, then add it.
  322. $synsql = "SELECT * FROM {feature_synonym}
  323. WHERE synonym_id = %d and feature_id = %d and pub_id = %d";
  324. $fsyn = db_fetch_object(db_query($synsql,$synonym->synonym_id,$feature->feature_id,$pub->pub_id));
  325. if(!$fsyn){
  326. $sql = "INSERT INTO {feature_synonym}
  327. (synonym_id,feature_id,pub_id)
  328. VALUES (%d,%d,%d)";
  329. $result = db_query($sql,$synonym->synonym_id,$feature->feature_id,$pub->pub_id);
  330. if(!$result){
  331. print "ERROR: cannot add alias $alias to feature synonym table\n";
  332. return 0;
  333. }
  334. }
  335. $fsyn = db_fetch_object(db_query($synsql,$synonym->synonym_id,$feature->feature_id,$pub->pub_id));
  336. }
  337. return 1;
  338. }
  339. /*************************************************************************
  340. *
  341. */
  342. function tripal_core_load_gff3_feature($organism,$cvterm,$uniquename,$name,
  343. $residues,$is_analysis='f',$is_obsolete='f',$add_only) {
  344. // check to see if the feature already exists
  345. $feature_sql = "SELECT * FROM {feature}
  346. WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
  347. $feature = db_fetch_object(db_query($feature_sql,$organism->organism_id,$uniquename,$cvterm->cvterm_id));
  348. if(strcmp($is_obsolete,'f')==0){
  349. $is_obsolete = 'false';
  350. }
  351. if(strcmp($is_analysis,'f')==0){
  352. $is_analysis = 'false';
  353. }
  354. // insert the feature if it does not exist otherwise perform an update
  355. if(!$feature){
  356. print "Adding feature '$uniquename'\n";
  357. $sql = "INSERT INTO {feature} (organism_id, name, uniquename, residues, seqlen,
  358. md5checksum, type_id,is_analysis,is_obsolete)
  359. VALUES(%d,'%s','%s','%s',%d, '%s', %d, %s, %s)";
  360. $result = db_query($sql,$organism->organism_id,$name,$uniquename,$residues,strlen($residues),
  361. md5($residues),$cvterm->cvterm_id,$is_analysis,$is_obsolete);
  362. if(!$result){
  363. print "ERROR: failed to insert feature '$uniquename'\n";
  364. return 0;
  365. }
  366. }
  367. elseif(!$add_only) {
  368. print "Updating feature '$uniquename'\n";
  369. $sql = "UPDATE {feature}
  370. SET name = '%s', residues = '%s', seqlen = '%s', md5checksum = '%s',
  371. is_analysis = %s, is_obsolete = %s
  372. WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
  373. $result = db_query($sql,$name,$residues,strlen($residues),md5($residues),$is_analysis,$is_obsolete);
  374. if(!$result){
  375. print "ERROR: failed to update feature '$uniquename'\n";
  376. return 0;
  377. }
  378. }
  379. else {
  380. // the feature exists and we don't want to update it so return
  381. // a value of 0. This will stop all downstream property additions
  382. print "Skipping existing feature: '$uniquename'.\n";
  383. return 0;
  384. }
  385. $feature = db_fetch_object(db_query($feature_sql,$organism->organism_id,$uniquename,$cvterm->cvterm_id));
  386. return $feature;
  387. }
  388. /*************************************************************************
  389. *
  390. */
  391. function tripal_core_load_gff3_featureloc($feature,$organism,$landmark,$fmin,$fmax,$strand,$phase,
  392. $is_fmin_partial,$is_fmax_partial,$residue_info,$locgroup) {
  393. // get the source feature
  394. $sql = "SELECT * FROM {feature}
  395. WHERE organism_id = %d and uniquename = '%s'";
  396. $srcfeature = db_fetch_object(db_query($sql,$organism->organism_id,$landmark));
  397. if(!$srcfeature){
  398. print "ERROR: cannot find source feature $landmark.\n";
  399. return 0;
  400. }
  401. // TODO: create an attribute that recognizes the residue_info,locgroup, is_fmin_partial and is_fmax_partial, right now these are
  402. // hardcoded to be false and 0 below.
  403. // check to see if this featureloc already exists, but also keep track of the
  404. // last rank value
  405. $rank = 0;
  406. $exists = 0;
  407. $featureloc_sql = "SELECT FL.featureloc_id,FL.fmin,FL.fmax, FL.is_fmin_partial,
  408. FL.is_fmax_partial, FL.strand, FL.phase, FL.residue_info,
  409. FL.locgroup, F.uniquename as srcname
  410. FROM {featureloc} FL
  411. INNER JOIN {feature} F on F.feature_id = FL.srcfeature_id
  412. WHERE FL.feature_id = %d
  413. ORDER BY rank ASC";
  414. $recs = db_query($featureloc_sql,$feature->feature_id);
  415. while ($featureloc = db_fetch_object($recs)){
  416. if(strcmp($featureloc->srcname,$landmark)==0 and
  417. $featureloc->fmin == $fmin and strcmp($featureloc->is_fmin_partial,$is_fmin_partial)==0 and
  418. $featureloc->fmax == $fmax and strcmp($featureloc->is_fmax_partial,$is_fmax_partial)==0 and
  419. $featureloc->phase == $phase and $featureloc->strand == $strand and
  420. strcmp($featureloc->residue_info,$residue_info)==0 and
  421. $featureloc->locgroup == $locgroup){
  422. // this is the same featureloc, so do nothing... no need to update
  423. //TODO: need more checks here
  424. print " No change to featureloc\n";
  425. $exists = 1;
  426. }
  427. $rank = $featureloc->rank;
  428. }
  429. if(!$exists){
  430. // this feature does not have a feature loc so add it
  431. if(!$phase){
  432. $phase = 'NULL';
  433. }
  434. if(strcmp($is_fmin_partial,'f')==0){
  435. $is_fmin_partial = 'false';
  436. }
  437. elseif(strcmp($is_fmin_partial,'t')==0){
  438. $is_fmin_partial = 'true';
  439. }
  440. if(strcmp($is_fmax_partial,'f')==0){
  441. $is_fmax_partial = 'false';
  442. }
  443. elseif(strcmp($is_fmax_partial,'t')==0){
  444. $is_fmax_partial = 'true';
  445. }
  446. print " Adding featureloc $srcfeature->uniquename fmin: $fmin, fmax: $fmax, strand: $strand, phase: $phase, rank: $rank\n";
  447. $fl_isql = "INSERT INTO {featureloc}
  448. (feature_id, srcfeature_id, fmin, is_fmin_partial, fmax, is_fmax_partial,
  449. strand, phase, residue_info, locgroup, rank)
  450. VALUES (%d,%d,%d,%s,%d,%s,%d,%s,'%s',%d,%d)";
  451. $result = db_query($fl_isql,$feature->feature_id,$srcfeature->feature_id,$fmin,$is_fmin_partial,$fmax,$is_fmax_partial,
  452. $strand,$phase,$residue_info,$locgroup,$rank);
  453. if(!$result){
  454. print "ERROR: failed to insert featureloc\n";
  455. return 0;
  456. }
  457. }
  458. return 1;
  459. }
  460. /*************************************************************************
  461. *
  462. */