gaf_loader.inc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. <?php
  2. function tripal_analysis_go_gaf_load_form (){
  3. $form['notice']= array(
  4. '#type' => 'item',
  5. '#value' => t('Note: currently, the GAF loader only uses column 2 (Object ID) and 5 (GO ID)
  6. from the GAF file, and simply imports GO terms for the features.
  7. Further support for this file format will be provided later.'),
  8. );
  9. $form['gaf_file']= array(
  10. '#type' => 'textfield',
  11. '#title' => t('GAF File'),
  12. '#description' => t('Please enter the full system path for the GAF file, or a path within the Drupal
  13. installation (e.g. /sites/default/files/xyz.txt). The path must be accessible to the
  14. server on which this Drupal instance is running.'),
  15. '#required' => TRUE,
  16. );
  17. // get the list of organisms
  18. $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  19. $previous_db = tripal_db_set_active('chado'); // use chado database
  20. $org_rset = db_query($sql);
  21. tripal_db_set_active($previous_db); // now use drupal database
  22. $organisms = array();
  23. $organisms[''] = '';
  24. while($organism = db_fetch_object($org_rset)){
  25. $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  26. }
  27. $form['type']= array(
  28. '#type' => 'textfield',
  29. '#title' => t('Sequence Type'),
  30. '#required' => TRUE,
  31. '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the GAF file.'),
  32. );
  33. $form['organism_id'] = array (
  34. '#title' => t('Organism'),
  35. '#type' => t('select'),
  36. '#description' => t("Choose the organism to which these sequences are associated "),
  37. '#required' => TRUE,
  38. '#options' => $organisms,
  39. );
  40. $form['import_options'] = array(
  41. '#type' => 'fieldset',
  42. '#title' => t('Import Options'),
  43. '#collapsed' => TRUE
  44. );
  45. $form['import_options']['add_only']= array(
  46. '#type' => 'checkbox',
  47. '#title' => t('Add GO terms'),
  48. '#required' => FALSE,
  49. '#description' => t('GO terms in the GAF file will be added to each feature.'),
  50. );
  51. // $form['import_options']['replace']= array(
  52. // '#type' => 'checkbox',
  53. // '#title' => t('Replace GO terms'),
  54. // '#required' => FALSE,
  55. // '#description' => t('All GO terms for features in the GAF file will be replaced with terms in the GAF file.'),
  56. // );
  57. $form['import_options']['remove']= array(
  58. '#type' => 'checkbox',
  59. '#title' => t('Delete GO terms'),
  60. '#required' => FALSE,
  61. '#description' => t('GO terms for features in the GAF file will be removed. Other terms will remain.'),
  62. );
  63. $form['analysis'] = array(
  64. '#type' => 'fieldset',
  65. '#title' => t('Analysis Used to Derive GO terms'),
  66. '#collapsed' => TRUE
  67. );
  68. // get the list of organisms
  69. $sql = "SELECT * FROM {analysis} ORDER BY name";
  70. $previous_db = tripal_db_set_active('chado'); // use chado database
  71. $org_rset = db_query($sql);
  72. tripal_db_set_active($previous_db); // now use drupal database
  73. $analyses = array();
  74. $analyses[''] = '';
  75. while($analysis = db_fetch_object($org_rset)){
  76. $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
  77. }
  78. $form['analysis']['analysis_id'] = array (
  79. '#title' => t('Analysis'),
  80. '#type' => t('select'),
  81. '#description' => t("Choose the analysis that defines how the GO annotations in the GAF file were created. "),
  82. '#required' => TRUE,
  83. '#options' => $analyses,
  84. );
  85. // Advanced Options
  86. $form['advanced'] = array(
  87. '#type' => 'fieldset',
  88. '#title' => t('Advanced Options'),
  89. '#collapsed' => TRUE
  90. );
  91. $form['advanced']['re_help']= array(
  92. '#type' => 'item',
  93. '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
  94. By default, this loader will use the first word in the second column of the GAF file
  95. as the uniquename for the sequences. If this is not desired, you may use the following regular
  96. expressions to define the location of the name or unique name within the text of column 2.'),
  97. );
  98. $form['advanced']['re_name']= array(
  99. '#type' => 'textfield',
  100. '#title' => t('Regular expression for the name'),
  101. '#required' => FALSE,
  102. '#description' => t('Enter the regular expression that will extract the feature name. For example, for text with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^(.*?)\|.*$". the name must be unique for this organism and sequence type.' ),
  103. );
  104. $form['advanced']['re_uname']= array(
  105. '#type' => 'textfield',
  106. '#title' => t('Regular expression for the unique name'),
  107. '#required' => FALSE,
  108. '#description' => t('Enter the regular expression that will extract the unique feature name. For example, for text with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^.*?\|(.*)$". the name must be unique for this organism and sequence type.' ),
  109. );
  110. $form['button'] = array(
  111. '#type' => 'submit',
  112. '#value' => t('Import GAF file'),
  113. );
  114. return $form;
  115. }
  116. /**
  117. *
  118. *
  119. * @ingroup gff3_loader
  120. */
  121. function tripal_analysis_go_gaf_load_form_validate ($form, &$form_state){
  122. $gaf_file = $form_state['values']['gaf_file'];
  123. $organism_id = $form_state['values']['organism_id'];
  124. $add_only = $form_state['values']['add_only'];
  125. $remove = $form_state['values']['remove'];
  126. $replace = $form_state['values']['replace'];
  127. $analysis_id = $form_state['values']['analysis_id'];
  128. $type = trim($form_state['values']['type']);
  129. $re_name = trim($form_state['values']['re_name']);
  130. $re_uname = trim($form_state['values']['re_uname']);
  131. // check to see if the file is located local to Drupal
  132. $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gaf_file;
  133. if(!file_exists($dfile)){
  134. // if not local to Drupal, the file must be someplace else, just use
  135. // the full path provided
  136. $dfile = $gaf_file;
  137. }
  138. if(!file_exists($dfile)){
  139. form_set_error('gff_file',t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  140. }
  141. if (($add_only and ($remove or $replace)) or
  142. ($replace and ($add_only or $remove)) or
  143. ($remove and ($replace or $add_only))){
  144. form_set_error('add_only',t("Please select only one checkbox from the import options section"));
  145. }
  146. if($re_name and $re_uname){
  147. form_set_error('re_name',t("Please provide a regular expression for the name or the unique name only, not both."));
  148. }
  149. // check to make sure the types exists
  150. $cvtermsql = "SELECT CVT.cvterm_id
  151. FROM {cvterm} CVT
  152. INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
  153. LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
  154. WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  155. $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$type,$type));
  156. if(!$cvterm){
  157. form_set_error('type',t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
  158. }
  159. if($rel_type){
  160. $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$parent_type,$parent_type));
  161. if(!$cvterm){
  162. form_set_error('parent_type',t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
  163. }
  164. }
  165. }
  166. /**
  167. *
  168. * @ingroup gff3_loader
  169. */
  170. function tripal_analysis_go_gaf_load_form_submit ($form, &$form_state){
  171. global $user;
  172. $gaf_file = $form_state['values']['gaf_file'];
  173. $organism_id = $form_state['values']['organism_id'];
  174. $add_only = $form_state['values']['add_only'];
  175. $remove = $form_state['values']['remove'];
  176. $replace = $form_state['values']['replace'];
  177. $analysis_id = $form_state['values']['analysis_id'];
  178. $type = trim($form_state['values']['type']);
  179. $re_name = trim($form_state['values']['re_name']);
  180. $re_uname = trim($form_state['values']['re_uname']);
  181. $args = array($gaf_file,$organism_id,$analysis_id,$add_only,$replace,$remove,$re_name,$re_uname,$type);
  182. if($add_only){
  183. $type = 'add GO terms';
  184. }
  185. if($replace){
  186. $type = 'replace and add GO terms';
  187. }
  188. if($remove){
  189. $type = 'remove GO terms';
  190. }
  191. tripal_add_job("Import GAF 2.0 file $gaf_file and $type",'tripal_analysis_go',
  192. 'tripal_analysis_go_load_gaf',$args,$user->uid);
  193. return '';
  194. }
  195. /**
  196. *
  197. *
  198. * @ingroup gff3_loader
  199. */
  200. function tripal_analysis_go_load_gaf($gaf_file, $organism_id,$analysis_id,$add_only =0,
  201. $replace = 0, $remove = 0, $re_name, $re_uname, $type, $job = NULL)
  202. {
  203. print "Opening GAF file $gaf_file\n";
  204. $lines = file($gaf_file,FILE_SKIP_EMPTY_LINES);
  205. $i = 0;
  206. $name = '';
  207. $residues = '';
  208. $num_lines = sizeof($lines);
  209. $interval = intval($num_lines * 0.01);
  210. if($interval == 0){
  211. $interval = 1;
  212. }
  213. foreach ($lines as $line_num => $line) {
  214. $i++; // update the line count
  215. // skip comments
  216. if(preg_match('/^\!/',$line)){
  217. continue;
  218. }
  219. // update the job status every 1% features
  220. if($job and $i % $interval == 0){
  221. tripal_job_set_progress($job,intval(($i/$num_lines)*100));
  222. }
  223. // split the line into it's columns
  224. $cols = explode("\t",$line);
  225. if(sizeof($cols) < 15){
  226. print "ERROR: improper number of columns on line $i\n";
  227. print_r($cols);
  228. return '';
  229. }
  230. $db = $cols[0];
  231. $object = $cols[1];
  232. $symbol = $cols[2];
  233. $qualifier = $cols[3];
  234. $go_id = $cols[4];
  235. $dbxref = $cols[5];
  236. $ecode = $cols[6];
  237. $with = $cols[7];
  238. $aspect = $cols[8];
  239. $obj_name = $cols[9];
  240. $obj_syn = $cols[10];
  241. $obj_type = $cols[11];
  242. $taxon = $cols[12];
  243. $date = $cols[13];
  244. $assigned = $cols[14];
  245. $exten = $cols[15];
  246. $product = $cols[16];
  247. // get the name or uniquename for the feature
  248. $uname = $object;
  249. $name = '';
  250. if($re_name){
  251. if(!preg_match("/$re_name/",$object,$matches)){
  252. print "Regular expression for the feature name finds nothing\n";
  253. } else {
  254. $name = trim($matches[1]);
  255. }
  256. } else {
  257. preg_match("/^\s*(.*?)[\s\|].*$/",$object,$matches);
  258. $name = trim($matches[1]);
  259. }
  260. if($re_uname){
  261. if(!preg_match("/$re_uname/",$object,$matches)){
  262. print "Regular expression for the feature unique name finds nothing\n";
  263. } else {
  264. $uname = trim($matches[1]);
  265. }
  266. }
  267. // get the feature
  268. $values = array(
  269. 'type_id' => array(
  270. 'cv_id' => array(
  271. 'name' => 'sequence'
  272. ),
  273. 'name' => $type,
  274. ),
  275. 'organism_id' => $organism_id,
  276. );
  277. if($name){
  278. $values['name'] = $name;
  279. }
  280. if($uname){
  281. $values['uniquename'] = $uname;
  282. }
  283. $feature = tripal_core_chado_select('feature',array('*'),$values);
  284. // add the GO term to the feature
  285. tripal_analysis_go_load_gaff_go_term($feature[0],$go_id,$remove);
  286. }
  287. return 1;
  288. }
  289. /**
  290. *
  291. */
  292. function tripal_analysis_go_load_gaff_go_term($feature,$dbxref,$remove){
  293. // get the database name from the reference. If it doesn't exist then create one.
  294. $ref = explode(":",$dbxref);
  295. $dbname = $ref[0];
  296. $accession = $ref[1];
  297. // first look for the database name
  298. $db = tripal_core_chado_select('db',array('db_id'),array('name' => "DB:$dbname"));
  299. if(sizeof($db) == 0){
  300. $db = tripal_core_chado_select('db',array('db_id'),array('name' => "$dbname"));
  301. }
  302. if(sizeof($db) == 0){
  303. print "ERROR: Database, $dbname is missing for reference: $dbname:$accession\n";
  304. return 0;
  305. }
  306. $db = $db[0];
  307. // now check to see if the accession exists
  308. $dbxref = tripal_core_chado_select('dbxref',array('dbxref_id'),array(
  309. 'accession' => $accession,'db_id' => $db->db_id));
  310. if(sizeof($dbxref) == 0){
  311. print "ERROR: Accession, $accession is missing for reference: $dbname:$accession\n";
  312. return 0;
  313. }
  314. $dbxref = $dbxref[0];
  315. // now check to see if the cvterm exists
  316. $cvterm = tripal_core_chado_select('cvterm',array('cvterm_id'),array(
  317. 'dbxref_id' => $dbxref->dbxref_id));
  318. // if it doesn't exist in the cvterm table, look for an alternate id
  319. if(sizeof($cvterm) == 0){
  320. $cvterm = tripal_core_chado_select('cvterm_dbxref',array('cvterm_id'),array(
  321. 'dbxref_id' => $dbxref->dbxref_id));
  322. }
  323. if(sizeof($cvterm) == 0){
  324. print "ERROR: CVTerm is missing for reference: $dbname:$accession\n";
  325. return 0;
  326. }
  327. $cvterm = $cvterm[0];
  328. // check to see if this feature cvterm already exists
  329. $fcvt = tripal_core_chado_select('feature_cvterm',array('feature_cvterm_id'),
  330. array('cvterm_id' => $cvterm->cvterm_id,'feature_id' => $feature->feature_id));
  331. // now associate this feature with the cvterm if it doesn't already exist
  332. if(sizeof($fcvt)==0){
  333. $values = array(
  334. 'feature_id' => $feature->feature_id,
  335. 'cvterm_id' => $cvterm->cvterm_id,
  336. 'pub_id' => array(
  337. 'uniquename' => 'null',
  338. ),
  339. );
  340. $ret = tripal_core_chado_insert('feature_cvterm',$values);
  341. if($ret){
  342. print " Added ontology term $dbname:$accession to feature $feature->uniquename\n";
  343. } else {
  344. print "ERROR: failed to insert ontology term: $dbname:$accession\n";
  345. return 0;
  346. }
  347. } else {
  348. if($remove){
  349. $status = tripal_core_chado_delete('feature_cvterm',
  350. array('cvterm_id' => $cvterm->cvterm_id,
  351. 'feature_id' => $feature->feature_id));
  352. if(!$status){
  353. print "ERROR: Failed to delete ontology term $dbname:$accession from feature $feature->uniquename\n";
  354. } else {
  355. print " Deleted ontology term $dbname:$accession from feature $feature->uniquename\n";
  356. }
  357. } else {
  358. print " Ontology term already associated to feature $feature->uniquename, skipping $dbname:$accession\n";
  359. }
  360. }
  361. return 1;
  362. }