gaf_loader.inc 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. <?php
  /**
   * Builds the form used to submit a GAF file for import.
   *
   * The form collects the path of the GAF file, the organism and Sequence
   * Ontology type of the annotated features, whether GO terms are added or
   * deleted, the analysis the annotations came from and an optional regular
   * expression for pulling the feature name out of column 2.
   *
   * @return
   *   The form array.
   */
  function tripal_analysis_go_gaf_load_form (){
    $form = array();

    $form['notice']= array(
      '#type' => 'item',
      '#value' => t('Note: currently, the GAF loader only uses column 2 (Object ID) and 5 (GO ID)
        from the GAF file, and simply imports GO terms for the features.
        Further support for this file format will be provided later.'),
    );
    $form['gaf_file']= array(
      '#type' => 'textfield',
      '#title' => t('GAF File'),
      '#description' => t('Please enter the full system path for the GAF file, or a path within the Drupal
        installation (e.g. /sites/default/files/xyz.txt). The path must be accessible to the
        server on which this Drupal instance is running.'),
      '#required' => TRUE,
    );

    // get the list of organisms
    $sql = "SELECT * FROM {organism} ORDER BY genus, species";
    $previous_db = tripal_db_set_active('chado'); // use chado database
    $org_rset = db_query($sql);
    tripal_db_set_active($previous_db); // now use drupal database
    $organisms = array();
    $organisms[''] = '';
    while($organism = db_fetch_object($org_rset)){
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
    }
    $form['organism_id'] = array (
      '#title' => t('Organism'),
      // the element type is a machine name and must not be translated
      '#type' => 'select',
      '#description' => t("Choose the organism to which these sequences are associated "),
      '#required' => TRUE,
      '#options' => $organisms,
    );
    $form['seq_type']= array(
      '#type' => 'textfield',
      '#title' => t('Sequence Type'),
      '#required' => TRUE,
      '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the GAF file.'),
    );
    $form['query_uniquename'] = array(
      '#title' => t('Use Unique Name'),
      '#type' => 'checkbox',
      '#description' => t('Select this checkbox if the feature name in the GAF file '.
        'matches the uniquename in the database. By default, the feature will '.
        'be mapped to the "name" of the feature.'),
      // unchecked by default: features are matched on their name
      '#default_value' => FALSE,
    );

    // NOTE(review): '#collapsed' is set on the fieldsets below without
    // '#collapsible'; confirm against the Forms API whether they are meant
    // to render collapsed.
    $form['import_options'] = array(
      '#type' => 'fieldset',
      '#title' => t('Import Options'),
      '#collapsed' => TRUE
    );
    $form['import_options']['add_only']= array(
      '#type' => 'checkbox',
      '#title' => t('Add GO terms'),
      '#required' => FALSE,
      '#description' => t('GO terms in the GAF file will be added to each feature.'),
    );
    // $form['import_options']['replace']= array(
    //   '#type' => 'checkbox',
    //   '#title' => t('Replace GO terms'),
    //   '#required' => FALSE,
    //   '#description' => t('All GO terms for features in the GAF file will be replaced with terms in the GAF file.'),
    // );
    $form['import_options']['remove']= array(
      '#type' => 'checkbox',
      '#title' => t('Delete GO terms'),
      '#required' => FALSE,
      '#description' => t('GO terms for features in the GAF file will be removed. Other terms will remain.'),
    );

    $form['analysis'] = array(
      '#type' => 'fieldset',
      '#title' => t('Analysis Used to Derive GO terms'),
      '#collapsed' => TRUE
    );
    // get the list of analyses
    $sql = "SELECT * FROM {analysis} ORDER BY name";
    $previous_db = tripal_db_set_active('chado'); // use chado database
    $analysis_rset = db_query($sql);
    tripal_db_set_active($previous_db); // now use drupal database
    $analyses = array();
    $analyses[''] = '';
    while($analysis = db_fetch_object($analysis_rset)){
      $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
    }
    $form['analysis']['analysis_id'] = array (
      '#title' => t('Analysis'),
      // the element type is a machine name and must not be translated
      '#type' => 'select',
      '#description' => t("Choose the analysis that defines how the GO annotations in the GAF file were created. "),
      '#required' => TRUE,
      '#options' => $analyses,
    );

    // Advanced Options
    $form['advanced'] = array(
      '#type' => 'fieldset',
      '#title' => t('Advanced Options'),
      '#collapsed' => TRUE
    );
    $form['advanced']['re_help']= array(
      '#type' => 'item',
      '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
        By default, this loader will use the first word in the second column of the GAF file
        as the uniquename for the sequences. If this is not desired, you may use the following regular
        expressions to define the location of the name or unique name within the text of column 2.'),
    );
    $form['advanced']['re_name']= array(
      '#type' => 'textfield',
      '#title' => t('Regular expression for the name'),
      '#required' => FALSE,
      '#description' => t('Enter the regular expression that will extract the '.
        'feature name from the GAF file. This option '.
        'is only required when the feature identifier does not identically match a feature '.
        'in the database.'),
    );

    $form['button'] = array(
      '#type' => 'submit',
      '#value' => t('Import GAF file'),
    );
    return $form;
  }
  /**
   * Validates the GAF loader form.
   *
   * Checks that the GAF file can be found (either relative to the Drupal
   * installation or as a full system path), that no more than one import
   * option is selected and that the sequence type is a term, or a synonym of
   * a term, in the 'sequence' vocabulary.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state; the submitted values are read from
   *   $form_state['values'].
   *
   * @ingroup gff3_loader
   */
  function tripal_analysis_go_gaf_load_form_validate ($form, &$form_state){
    $values = $form_state['values'];
    $gaf_file = $values['gaf_file'];
    $add_only = $values['add_only'];
    $remove = $values['remove'];
    // the 'replace' checkbox is commented out in the form, so the value may
    // be absent from the submission
    $replace = isset($values['replace']) ? $values['replace'] : 0;
    $type = trim($values['seq_type']);

    // check to see if the file is located local to Drupal
    $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gaf_file;
    if(!file_exists($dfile)){
      // if not local to Drupal, the file must be someplace else, just use
      // the full path provided
      $dfile = $gaf_file;
    }
    if(!file_exists($dfile)){
      form_set_error('gaf_file',t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
    }

    // the import options are mutually exclusive
    $num_selected = ($add_only ? 1 : 0) + ($replace ? 1 : 0) + ($remove ? 1 : 0);
    if($num_selected > 1){
      form_set_error('add_only',t("Please select only one checkbox from the import options section"));
    }

    // check to make sure the type exists. The lookup is wrapped in a switch
    // to the chado database like the other chado queries in this file.
    $cvtermsql = "SELECT CVT.cvterm_id
                  FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                  WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
    $previous_db = tripal_db_set_active('chado');
    $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$type,$type));
    tripal_db_set_active($previous_db);
    if(!$cvterm){
      form_set_error('seq_type',t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
    }
  }
  /**
   * Submit handler for the GAF loader form.
   *
   * Queues a Tripal job that calls tripal_analysis_go_load_gaf() with the
   * submitted values. The job is owned by the current user.
   *
   * @param $form
   *   The form array.
   * @param $form_state
   *   The form state; the submitted values are read from
   *   $form_state['values'].
   *
   * @return
   *   An empty string.
   *
   * @ingroup gff3_loader
   */
  function tripal_analysis_go_gaf_load_form_submit ($form, &$form_state){
    global $user;

    $values = $form_state['values'];
    $gaf_file = $values['gaf_file'];
    $organism_id = $values['organism_id'];
    $add_only = $values['add_only'];
    $remove = $values['remove'];
    // the 'replace' checkbox is commented out in the form, so the value may
    // be absent from the submission
    $replace = isset($values['replace']) ? $values['replace'] : 0;
    $analysis_id = $values['analysis_id'];
    $type = trim($values['seq_type']);
    $re_name = trim($values['re_name']);
    $query_uniquename = $values['query_uniquename'];

    // the order must match the parameters of tripal_analysis_go_load_gaf()
    $args = array($gaf_file,$organism_id,$analysis_id,$add_only,$replace,
      $remove,$re_name,$type,$query_uniquename);

    // describe the action for the job name. This is kept apart from $type,
    // which holds the sequence type. When no option is ticked the loader
    // adds terms, so that is the default description.
    $action = 'add GO terms';
    if($replace){
      $action = 'replace and add GO terms';
    }
    if($remove){
      $action = 'remove GO terms';
    }

    tripal_add_job("Import GAF 2.0 file $gaf_file and $action",'tripal_analysis_go',
      'tripal_analysis_go_load_gaf',$args,$user->uid);
    return '';
  }
  /**
   * Loads the GO annotations of a GAF file.
   *
   * Only column 2 (object ID) and column 5 (GO ID) of each row are used. For
   * every row the feature is looked up by type, organism and name (or
   * uniquename) and handed to tripal_analysis_go_load_gaff_go_term() together
   * with the GO ID. Lines starting with '!' and blank lines are skipped.
   *
   * @param $gaf_file
   *   Path of the GAF file: a full system path or a path within the Drupal
   *   installation.
   * @param $organism_id
   *   The organism_id of the features.
   * @param $analysis_id
   *   The analysis_id the annotations are linked to.
   * @param $add_only
   *   Not used by the loader.
   * @param $replace
   *   Not used by the loader.
   * @param $remove
   *   When set, the GO terms are removed from the features instead of added.
   * @param $re_name
   *   Optional regular expression (without delimiters) applied to column 2;
   *   its first capture group, or the whole match when it has no group, is
   *   used as the feature name.
   * @param $type
   *   Name of the term in the 'sequence' vocabulary that types the features.
   * @param $query_uniquename
   *   When set, features are matched on uniquename rather than name.
   * @param $job
   *   Optional job id used to report progress.
   *
   * @return
   *   1 when the whole file was processed, an empty string when the file
   *   cannot be read or a row has too few columns.
   *
   * @ingroup gff3_loader
   */
  function tripal_analysis_go_load_gaf($gaf_file, $organism_id,$analysis_id,$add_only =0,
    $replace = 0, $remove = 0, $re_name, $type, $query_uniquename,
    $job = NULL)
  {
    // the form validator also accepts a path within the Drupal installation,
    // so try that location when the path does not exist as given
    $path = $gaf_file;
    if(!file_exists($path) and isset($_SERVER['DOCUMENT_ROOT'])){
      $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gaf_file;
      if(file_exists($dfile)){
        $path = $dfile;
      }
    }

    print "Opening GAF file $gaf_file\n";
    if(!is_readable($path)){
      print "ERROR: cannot read the GAF file $gaf_file\n";
      return '';
    }
    // strip the line endings so the last column does not carry a newline;
    // blank lines are skipped in the loop to keep the line numbers reported
    // in messages in step with the file
    $lines = file($path,FILE_IGNORE_NEW_LINES);
    if($lines === FALSE){
      print "ERROR: cannot read the GAF file $gaf_file\n";
      return '';
    }

    $i = 0;
    $num_lines = sizeof($lines);
    $interval = intval($num_lines * 0.01);
    if($interval == 0){
      $interval = 1;
    }

    foreach ($lines as $line) {
      $i++; // update the line count

      // update the job status every 1% of the lines
      if($job and $i % $interval == 0){
        tripal_job_set_progress($job,intval(($i/$num_lines)*100));
      }

      // drop a carriage return left over from Windows line endings
      $line = rtrim($line,"\r");

      // skip blank lines and comments
      if(trim($line) === '' or preg_match('/^\!/',$line)){
        continue;
      }

      // split the line into its columns. The code requires the first 15 of
      // them; any further columns are optional and are not read.
      $cols = explode("\t",$line);
      if(sizeof($cols) < 15){
        print "ERROR: improper number of columns on line $i\n";
        print_r($cols);
        return '';
      }
      // only the object ID (column 2) and the GO ID (column 5) are used
      $object = $cols[1];
      $go_id = trim($cols[4]);

      // get the name or uniquename for the feature
      $name = $object;
      if($re_name){
        if(!preg_match("/$re_name/",$object,$matches)){
          print "Regular expression for the feature name finds nothing\n";
        } else {
          // fall back on the whole match when the expression has no group
          $name = trim(isset($matches[1]) ? $matches[1] : $matches[0]);
        }
      } else {
        // by default use the first word of the object ID
        if(preg_match("/^\s*(.*?)[\s\|].*$/",$object,$matches)){
          $name = trim($matches[1]);
        }
      }

      // get the feature
      $values = array(
        'type_id' => array(
          'cv_id' => array(
            'name' => 'sequence'
          ),
          'name' => $type,
        ),
        'organism_id' => $organism_id,
      );
      if(!$query_uniquename){
        $values['name'] = $name;
      } else {
        $values['uniquename'] = $name;
      }
      $feature = tripal_core_chado_select('feature',array('*'),$values);

      // empty() also covers a failed select that does not return an array
      if(empty($feature)){
        print "WARNING: Cannot find the feature: '$name'\n";
      } else {
        // add the GO term to the feature
        tripal_analysis_go_load_gaff_go_term($feature[0],$go_id,$remove,$analysis_id);
      }
    }
    return 1;
  }
  /**
   * Adds a GO term to a feature, or removes it from the feature.
   *
   * The term is resolved from its 'DB:accession' identifier through the db,
   * dbxref and cvterm (or cvterm_dbxref) tables. When adding, the feature is
   * linked to the term in feature_cvterm and the term is recorded as a
   * property of the feature's analysisfeature record. When removing, only
   * the feature_cvterm link is deleted.
   *
   * @param $feature
   *   The feature record (object) as returned by tripal_core_chado_select().
   * @param $dbxref
   *   The term identifier in the form 'DB:accession', e.g. 'GO:0005634'.
   * @param $remove
   *   When set, the term is removed from the feature instead of added.
   * @param $analysis_id
   *   The analysis_id the annotation is linked to.
   *
   * @return
   *   1 on success, 0 when the term cannot be resolved or a record cannot be
   *   created.
   */
  function tripal_analysis_go_load_gaff_go_term($feature,$dbxref,$remove,$analysis_id){

    // split the reference into the database name and the accession
    $ref = explode(":",$dbxref);
    if(sizeof($ref) < 2){
      print "ERROR: improperly formatted reference, expecting 'DB:accession': $dbxref\n";
      return 0;
    }
    $dbname = $ref[0];
    $accession = $ref[1];

    // first look for the database name
    $db = tripal_core_chado_select('db',array('db_id'),array('name' => "DB:$dbname"));
    if(empty($db)){
      $db = tripal_core_chado_select('db',array('db_id'),array('name' => "$dbname"));
    }
    if(empty($db)){
      print "ERROR: Database, $dbname is missing for reference: $dbname:$accession\n";
      return 0;
    }
    $db = $db[0];

    // now check to see if the accession exists
    $xref = tripal_core_chado_select('dbxref',array('dbxref_id'),array(
      'accession' => $accession,'db_id' => $db->db_id));
    if(empty($xref)){
      print "ERROR: Accession, $accession is missing for reference: $dbname:$accession\n";
      return 0;
    }
    $xref = $xref[0];

    // now check to see if the cvterm exists
    $cvterm = tripal_core_chado_select('cvterm',array('cvterm_id'),array(
      'dbxref_id' => $xref->dbxref_id));
    // if it doesn't exist in the cvterm table, look for an alternate id
    if(empty($cvterm)){
      $cvterm = tripal_core_chado_select('cvterm_dbxref',array('cvterm_id'),array(
        'dbxref_id' => $xref->dbxref_id));
    }
    if(empty($cvterm)){
      print "ERROR: CVTerm is missing for reference: $dbname:$accession\n";
      return 0;
    }
    $cvterm = $cvterm[0];

    // check to see if this feature cvterm already exists
    $link = array('cvterm_id' => $cvterm->cvterm_id,'feature_id' => $feature->feature_id);
    $fcvt = tripal_core_chado_select('feature_cvterm',array('feature_cvterm_id'),$link);
    $is_linked = !empty($fcvt);

    // removal only touches an existing link; a term that is not linked must
    // not be added as a side effect
    if($remove){
      if(!$is_linked){
        print "   Ontology term $dbname:$accession is not associated to feature $feature->uniquename, nothing to delete\n";
        return 1;
      }
      $status = tripal_core_chado_delete('feature_cvterm',$link);
      if(!$status){
        print "ERROR: Failed to delete ontology term $dbname:$accession from feature $feature->uniquename\n";
      } else {
        print "   Deleted ontology term $dbname:$accession from feature $feature->uniquename\n";
      }
      return 1;
    }

    // now associate this feature with the cvterm if it doesn't already exist
    if(!$is_linked){
      $values = array(
        'feature_id' => $feature->feature_id,
        'cvterm_id' => $cvterm->cvterm_id,
        'pub_id' => array(
          'uniquename' => 'null',
        ),
      );
      $ret = tripal_core_chado_insert('feature_cvterm',$values);
      if($ret){
        print "   Added ontology term $dbname:$accession to feature $feature->uniquename\n";
      } else {
        print "ERROR: failed to insert ontology term '$dbname:$accession' for feature: $feature->uniquename\n";
        return 0;
      }
    } else {
      print "   Ontology term already associated to feature $feature->uniquename, skipping $dbname:$accession\n";
    }

    print "   Associating feature $feature->name to analysis\n";

    // Insert into analysisfeature table only if it doesn't already exist.
    // The id is read back with a select after the insert rather than taken
    // from the value returned by the insert.
    $values = array('feature_id' => $feature->feature_id, 'analysis_id' => $analysis_id);
    $analysisfeature = tripal_core_chado_select('analysisfeature',array('analysisfeature_id'),$values);
    if(empty($analysisfeature)){
      tripal_core_chado_insert('analysisfeature',$values);
      $analysisfeature = tripal_core_chado_select('analysisfeature',array('analysisfeature_id'),$values);
    }
    if(empty($analysisfeature)){
      print "ERROR: failed to associate feature $feature->uniquename to the analysis\n";
      return 0;
    }
    $analysisfeature_id = $analysisfeature[0]->analysisfeature_id;

    // Insert GO terms into analysisfeatureprop table
    $values = array(
      'analysisfeature_id' => $analysisfeature_id,
      'type_id' => $cvterm->cvterm_id,
      'rank' => 0,
    );
    $analysisfeatureprop = tripal_core_chado_select('analysisfeatureprop',array('*'),$values);
    if(empty($analysisfeatureprop)){
      // NOTE(review): the value used to come from an undefined $matches[1],
      // so NULL was stored. The accession is stored instead; confirm this is
      // what the code that reads the property expects.
      $values['value'] = $accession;
      tripal_core_chado_insert('analysisfeatureprop',$values);
    }
    return 1;
  }
  /**
   * Adds a property to the analysisfeature record of a feature and analysis.
   *
   * The analysisfeature record is created when it does not exist yet. The
   * property is inserted with the next free rank for its type; no check is
   * made for an existing property with the same value.
   *
   * @param $feature_id
   *   The feature_id of the feature.
   * @param $analysis_id
   *   The analysis_id of the analysis.
   * @param $brite_id
   *   The cvterm_id stored as the type_id of the property.
   * @param $keggterm
   *   The value of the property.
   *
   * @return
   *   The return value of tripal_core_chado_insert() for the property, or 0
   *   when the analysisfeature record cannot be found or created.
   */
  function tripal_analysis_go_load_gaff_insert_analysisfeatureprop ($feature_id, $analysis_id,
    $brite_id,$keggterm)
  {
    // add the analysisfeature record if it doesn't already exist.
    $values = array('feature_id' => $feature_id,'analysis_id' => $analysis_id);
    $analysisfeature_arr = tripal_core_chado_select('analysisfeature',
      array('analysisfeature_id'),$values);
    if(empty($analysisfeature_arr)){
      tripal_core_chado_insert('analysisfeature',$values);
      $analysisfeature_arr = tripal_core_chado_select('analysisfeature',
        array('analysisfeature_id'),$values);
    }
    // the record is still missing when the insert failed
    if(empty($analysisfeature_arr)){
      return 0;
    }
    $analysisfeature_id = $analysisfeature_arr[0]->analysisfeature_id;
    if(!$analysisfeature_id){
      return 0;
    }

    // Get the highest rank for this analysisfeature and type in the
    // analysisfeatureprop table
    $sql = "SELECT rank FROM analysisfeatureprop WHERE analysisfeature_id = %d and type_id = %d ORDER BY rank DESC";
    $previous_db = tripal_db_set_active('chado');
    $result = db_fetch_object(db_query($sql,$analysisfeature_id,$brite_id));
    // restore the connection that was active before the query
    tripal_db_set_active($previous_db);

    // use the next rank whenever a property already exists, including when
    // the highest existing rank is 0
    $rank = 0;
    if ($result) {
      $rank = $result->rank + 1;
    }

    $values = array(
      'analysisfeature_id' => $analysisfeature_id,
      'type_id' => $brite_id,
      'value' => $keggterm,
      'rank' => $rank,
    );
    return tripal_core_chado_insert('analysisfeatureprop',$values);
  }